1
#include "source/server/guarddog_impl.h"
2

            
3
#include <sys/types.h>
4

            
5
#include <chrono>
6
#include <memory>
7
#include <utility>
8
#include <vector>
9

            
10
#include "envoy/common/time.h"
11
#include "envoy/config/bootstrap/v3/bootstrap.pb.h"
12
#include "envoy/server/configuration.h"
13
#include "envoy/server/guarddog.h"
14
#include "envoy/server/guarddog_config.h"
15
#include "envoy/stats/scope.h"
16
#include "envoy/watchdog/v3/abort_action.pb.h"
17

            
18
#include "source/common/common/assert.h"
19
#include "source/common/common/fmt.h"
20
#include "source/common/common/lock_guard.h"
21
#include "source/common/common/logger.h"
22
#include "source/common/config/utility.h"
23
#include "source/common/protobuf/utility.h"
24
#include "source/common/stats/symbol_table.h"
25
#include "source/server/watchdog_impl.h"
26

            
27
#include "absl/synchronization/mutex.h"
28
#include "absl/synchronization/notification.h"
29

            
30
namespace Envoy {
31
namespace Server {
32

            
33
// Primary constructor. Caches the timeouts read from `config`, derives the polling interval,
// creates the aggregate miss / mega-miss counters named after `name`, allocates a dispatcher
// together with the timer that drives step(), instantiates the configured guard dog actions,
// and then calls start().
GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
                           Api::Api& api, absl::string_view name,
                           std::unique_ptr<TestInterlockHook>&& test_interlock)
    : test_interlock_hook_(std::move(test_interlock)), stats_scope_(stats_scope),
      time_source_(api.timeSource()), miss_timeout_(config.missTimeout()),
      megamiss_timeout_(config.megaMissTimeout()), kill_timeout_(config.killTimeout()),
      multi_kill_timeout_(config.multiKillTimeout()),
      // The threshold is configured as a percentage; store it as a fraction.
      multi_kill_fraction_(config.multiKillThreshold() / 100.0),
      loop_interval_([&]() -> std::chrono::milliseconds {
        // The loop interval is the smallest of all specified intervals. The miss and
        // mega-miss timeouts always take part; the kill and multi-kill timeouts are
        // only folded in when enabled, which accounts for the 0=disabled case.
        auto shortest = std::min(miss_timeout_, megamiss_timeout_);
        if (killEnabled()) {
          shortest = std::min(shortest, kill_timeout_);
        }
        if (multikillEnabled()) {
          shortest = std::min(shortest, multi_kill_timeout_);
        }
        return shortest;
      }()),
      watchdog_miss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_miss"),
                                        stats_scope.symbolTable())
              .statName())),
      watchdog_megamiss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_mega_miss"),
                                        stats_scope.symbolTable())
              .statName())),
      dispatcher_(api.allocateDispatcher(absl::StrCat(name, "_guarddog_thread"))),
      loop_timer_(dispatcher_->createTimer([this]() { step(); })),
      events_to_actions_(
          [&](const Server::Configuration::Watchdog& watchdog_config) -> EventToActionsMap {
            EventToActionsMap actions_by_event;

            // We should be able to share the dispatcher since guard dog's lifetime
            // should eclipse those of actions.
            Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_, stats_scope,
                                                                   name};

            // Work on a copy of the configured actions so defaults can be appended to it.
            auto configured_actions = watchdog_config.actions();

            // Appends an action for `event` whose typed config is a default-constructed
            // AbortActionConfig.
            const auto append_default_abort =
                [&configured_actions](WatchDogAction::WatchdogEvent event) {
                  envoy::watchdog::v3::AbortActionConfig abort_config;
                  WatchDogAction* appended = configured_actions.Add();
                  appended->set_event(event);
                  appended->mutable_config()->mutable_typed_config()->PackFrom(abort_config);
                };

            // Add default abort_action if kill and/or multi-kill is enabled.
            if (watchdog_config.killTimeout().count() > 0) {
              append_default_abort(WatchDogAction::KILL);
            }
            if (watchdog_config.multiKillTimeout().count() > 0) {
              append_default_abort(WatchDogAction::MULTIKILL);
            }

            for (const auto& action : configured_actions) {
              // Look up the factory for this action's config and register what it creates
              // under the action's event.
              auto& factory =
                  Config::Utility::getAndCheckFactory<Configuration::GuardDogActionFactory>(
                      action.config());
              actions_by_event[action.event()].push_back(
                  factory.createGuardDogActionFromProto(action, context));
            }

            return actions_by_event;
          }(config)),
      run_thread_(true) {
  start(api);
}
97

            
98
// Convenience constructor: delegates to the five-argument constructor, supplying a
// default-constructed TestInterlockHook as the interlock.
GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
                           Api::Api& api, absl::string_view name)
    : GuardDogImpl(stats_scope, config, api, name, std::make_unique<TestInterlockHook>()) {
}
101

            
102
21356
// Destruction shuts the guard dog down through stop().
GuardDogImpl::~GuardDogImpl() {
  stop();
}
103

            
104
86334
void GuardDogImpl::step() {
105
  // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks
106
  // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in
107
  // a full iteration of the step() function to process recent watchdog touches and monotonic time
108
  // changes.
109
86334
  Thread::LockGuard guard(mutex_);
110
86334
  if (!run_thread_) {
111
202
    return;
112
202
  }
113

            
114
86132
  const auto now = time_source_.monotonicTime();
115
86132
  std::vector<std::pair<Thread::ThreadId, MonotonicTime>> miss_threads;
116
86132
  std::vector<std::pair<Thread::ThreadId, MonotonicTime>> mega_miss_threads;
117

            
118
86132
  {
119
86132
    std::vector<std::pair<Thread::ThreadId, MonotonicTime>> multi_kill_threads;
120
86132
    Thread::LockGuard guard(wd_lock_);
121

            
122
    // Compute the multikill threshold
123
86132
    const size_t required_for_multi_kill =
124
86132
        std::max(static_cast<size_t>(2),
125
86132
                 static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size())));
126

            
127
88152
    for (auto& watched_dog : watched_dogs_) {
128
57772
      if (watched_dog->dog_->getTouchedAndReset()) {
129
        // Watchdog was touched since the guard dog last checked; update last check-in time.
130
57544
        watched_dog->last_checkin_ = now;
131
57544
        continue;
132
57544
      }
133

            
134
228
      const auto last_checkin = watched_dog->last_checkin_;
135
228
      const auto tid = watched_dog->dog_->threadId();
136
228
      const auto delta = now - last_checkin;
137
228
      if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) {
138
31
        watched_dog->miss_alerted_ = false;
139
31
        watched_dog->megamiss_alerted_ = false;
140
31
      }
141
229
      if (delta > miss_timeout_) {
142
158
        if (!watched_dog->miss_alerted_) {
143
67
          watchdog_miss_counter_.inc();
144
67
          watched_dog->miss_counter_.inc();
145
67
          watched_dog->last_alert_time_ = last_checkin;
146
67
          watched_dog->miss_alerted_ = true;
147
67
          miss_threads.emplace_back(tid, last_checkin);
148
67
        }
149
158
      }
150
228
      if (delta > megamiss_timeout_) {
151
129
        if (!watched_dog->megamiss_alerted_) {
152
47
          watchdog_megamiss_counter_.inc();
153
47
          watched_dog->megamiss_counter_.inc();
154
47
          watched_dog->last_alert_time_ = last_checkin;
155
47
          watched_dog->megamiss_alerted_ = true;
156
47
          mega_miss_threads.emplace_back(tid, last_checkin);
157
47
        }
158
129
      }
159
228
      if (killEnabled() && delta > kill_timeout_) {
160
        invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now);
161
      }
162
228
      if (multikillEnabled() && delta > multi_kill_timeout_) {
163
4
        multi_kill_threads.emplace_back(tid, last_checkin);
164

            
165
4
        if (multi_kill_threads.size() >= required_for_multi_kill) {
166
          ENVOY_LOG_MISC(error, "Watchdog MULTIKILL as {} threads are stuck.",
167
                         multi_kill_threads.size());
168
          invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now);
169
        }
170
4
      }
171
228
    }
172
86132
  }
173

            
174
  // Run megamiss and miss handlers
175
86132
  if (!mega_miss_threads.empty()) {
176
45
    invokeGuardDogActions(WatchDogAction::MEGAMISS, mega_miss_threads, now);
177
45
  }
178

            
179
86132
  if (!miss_threads.empty()) {
180
65
    invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now);
181
65
  }
182

            
183
86132
  test_interlock_hook_->signalFromImpl();
184
86145
  if (run_thread_) {
185
86083
    loop_timer_->enableTimer(loop_interval_);
186
86083
  }
187
86132
}
188

            
189
// Creates a watchdog for `thread_id`, adds it to the set of watched dogs, and registers it with
// `dispatcher` at half of the guard dog loop interval. Returns the new watchdog.
WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id,
                                               const std::string& thread_name,
                                               Event::Dispatcher& dispatcher) {
  // Timer started by WatchDog will try to fire at 1/2 of the interval of the
  // minimum timeout specified. loop_interval_ is const so all shared state
  // accessed out of the locked section below is const (time_source_ has no
  // state).
  const auto touch_interval = loop_interval_ / 2;

  auto watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id));
  auto tracked_entry = std::make_unique<WatchedDog>(stats_scope_, thread_name, watchdog);

  // Touch once before the entry is added to watched_dogs_ ...
  watchdog->touch();
  {
    Thread::LockGuard dogs_guard(wd_lock_);
    watched_dogs_.push_back(std::move(tracked_entry));
  }

  // ... and once more after it has been registered with the dispatcher.
  dispatcher.registerWatchdog(watchdog, touch_interval);
  watchdog->touch();
  return watchdog;
}
208

            
209
21247
// Removes the entry tracking `wd` from watched_dogs_. Not finding one trips ASSERT(false).
void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) {
  Thread::LockGuard dogs_guard(wd_lock_);

  const auto tracks_wd = [&wd](const WatchedDogPtr& candidate) -> bool {
    return candidate->dog_ == wd;
  };
  const auto position = std::find_if(watched_dogs_.begin(), watched_dogs_.end(), tracks_wd);

  if (position == watched_dogs_.end()) {
    ASSERT(false);
    return;
  }
  watched_dogs_.erase(position);
}
219

            
220
21356
// Creates the guard dog thread, which runs dispatcher_, and blocks until a callback posted to
// that dispatcher has signalled that the thread is up.
void GuardDogImpl::start(Api::Api& api) {
  Thread::LockGuard start_guard(mutex_);

  // Synchronize between calling thread and guarddog thread.
  absl::Notification thread_running;

  // See comments in WorkerImpl::start for the naming convention.
  Thread::Options thread_options{absl::StrCat("dog:", dispatcher_->name())};

  // Thread body: arm the loop timer with a zero delay, queue the "started" notification on
  // the dispatcher, then run the dispatcher until it is told to exit.
  const auto thread_routine = [this, &thread_running]() -> void {
    loop_timer_->enableTimer(std::chrono::milliseconds(0));
    dispatcher_->post([&thread_running]() { thread_running.Notify(); });
    dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit);
  };
  thread_ = api.threadFactory().createThread(thread_routine, thread_options);

  thread_running.WaitForNotification();
}
238

            
239
21356
void GuardDogImpl::stop() {
240
21356
  {
241
21356
    Thread::LockGuard guard(mutex_);
242
21356
    run_thread_ = false;
243
21356
  }
244
21356
  dispatcher_->exit();
245
21356
  if (thread_) {
246
21356
    thread_->join();
247
21356
    thread_.reset();
248
21356
  }
249
21356
}
250

            
251
void GuardDogImpl::invokeGuardDogActions(
252
    WatchDogAction::WatchdogEvent event,
253
    std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs,
254
110
    MonotonicTime now) {
255
110
  const auto& registered_actions = events_to_actions_.find(event);
256
110
  if (registered_actions != events_to_actions_.end()) {
257
16
    for (auto& action : registered_actions->second) {
258
16
      action->run(event, thread_last_checkin_pairs, now);
259
16
    }
260
16
  }
261
110
}
262

            
263
// Bookkeeping entry for a single watchdog: keeps the watchdog itself and creates the
// per-thread counters "server.<thread_name>.watchdog_miss" and
// "server.<thread_name>.watchdog_mega_miss" in `stats_scope`.
GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name,
                                     const WatchDogImplSharedPtr& watch_dog)
    : dog_(watch_dog),
      miss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(
              fmt::format("server.{}.watchdog_miss", thread_name), stats_scope.symbolTable())
              .statName())),
      megamiss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(
              fmt::format("server.{}.watchdog_mega_miss", thread_name), stats_scope.symbolTable())
              .statName())) {
}
274

            
275
} // namespace Server
276
} // namespace Envoy