Coverage Report

Created: 2023-11-12 09:30

/proc/self/cwd/source/server/guarddog_impl.cc
Line
Count
Source (jump to first uncovered line)
1
#include "source/server/guarddog_impl.h"
2
3
#include <sys/types.h>
4
5
#include <chrono>
6
#include <memory>
7
#include <utility>
8
#include <vector>
9
10
#include "envoy/common/time.h"
11
#include "envoy/config/bootstrap/v3/bootstrap.pb.h"
12
#include "envoy/server/configuration.h"
13
#include "envoy/server/guarddog.h"
14
#include "envoy/server/guarddog_config.h"
15
#include "envoy/stats/scope.h"
16
#include "envoy/watchdog/v3/abort_action.pb.h"
17
18
#include "source/common/common/assert.h"
19
#include "source/common/common/fmt.h"
20
#include "source/common/common/lock_guard.h"
21
#include "source/common/common/logger.h"
22
#include "source/common/config/utility.h"
23
#include "source/common/protobuf/utility.h"
24
#include "source/common/stats/symbol_table.h"
25
#include "source/server/watchdog_impl.h"
26
27
#include "absl/synchronization/mutex.h"
28
#include "absl/synchronization/notification.h"
29
30
namespace Envoy {
31
namespace Server {
32
33
// Primary constructor.
//
// Copies the four timeouts and the multikill threshold out of `config`, derives the polling
// interval from them, creates the "<name>.watchdog_miss" / "<name>.watchdog_mega_miss" counters,
// allocates a dedicated dispatcher with a timer that drives step(), builds the event -> actions
// table and finally starts the guard dog thread via start().
//
// @param stats_scope scope the counters are created in; also handed to the action factories.
// @param config source of the timeouts, the multikill threshold and the configured actions.
// @param api provides the time source, the dispatcher and the thread factory.
// @param name prefix for the counter names and the dispatcher name.
// @param test_interlock hook that step() signals at the end of every pass.
GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
                           Api::Api& api, absl::string_view name,
                           std::unique_ptr<TestInterlockHook>&& test_interlock)
    : test_interlock_hook_(std::move(test_interlock)), stats_scope_(stats_scope),
      time_source_(api.timeSource()), miss_timeout_(config.missTimeout()),
      megamiss_timeout_(config.megaMissTimeout()), kill_timeout_(config.killTimeout()),
      multi_kill_timeout_(config.multiKillTimeout()),
      multi_kill_fraction_(config.multiKillThreshold() / 100.0),
      loop_interval_([this]() -> std::chrono::milliseconds {
        // The interval is the smallest of the configured timeouts. A fatal timeout that is
        // disabled (0) must not participate, so it is replaced by the smallest non-fatal timeout
        // before taking the overall minimum.
        const auto nonfatal_floor = std::min(miss_timeout_, megamiss_timeout_);
        const auto kill_candidate = killEnabled() ? kill_timeout_ : nonfatal_floor;
        const auto multikill_candidate = multikillEnabled() ? multi_kill_timeout_ : nonfatal_floor;
        return std::min(nonfatal_floor, std::min(kill_candidate, multikill_candidate));
      }()),
      watchdog_miss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_miss"),
                                        stats_scope.symbolTable())
              .statName())),
      watchdog_megamiss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_mega_miss"),
                                        stats_scope.symbolTable())
              .statName())),
      dispatcher_(api.allocateDispatcher(absl::StrCat(name, "_guarddog_thread"))),
      loop_timer_(dispatcher_->createTimer([this]() { step(); })),
      events_to_actions_([&](const Server::Configuration::Watchdog& watchdog_config)
                             -> EventToActionsMap {
        // Actions are handed this guard dog's dispatcher: the guard dog is expected to outlive
        // the actions it owns, so sharing it is fine.
        Configuration::GuardDogActionFactoryContext factory_context = {api, *dispatcher_,
                                                                       stats_scope, name};

        // Local copy of the configured actions so the implicit abort actions can be appended.
        auto configured_actions = watchdog_config.actions();

        // Appends a default abort action bound to `event`.
        const auto append_default_abort = [&configured_actions](
                                              WatchDogAction::WatchdogEvent event) {
          envoy::watchdog::v3::AbortActionConfig abort_config;
          WatchDogAction* appended = configured_actions.Add();
          appended->set_event(event);
          appended->mutable_config()->mutable_typed_config()->PackFrom(abort_config);
        };

        // A non-zero kill / multi-kill timeout implies an abort action for that event.
        if (watchdog_config.killTimeout().count() > 0) {
          append_default_abort(WatchDogAction::KILL);
        }
        if (watchdog_config.multiKillTimeout().count() > 0) {
          append_default_abort(WatchDogAction::MULTIKILL);
        }

        // Resolve the factory of every action and file the created action under its event.
        EventToActionsMap actions_by_event;
        for (const auto& action : configured_actions) {
          auto& factory = Config::Utility::getAndCheckFactory<Configuration::GuardDogActionFactory>(
              action.config());
          actions_by_event[action.event()].push_back(
              factory.createGuardDogActionFromProto(action, factory_context));
        }
        return actions_by_event;
      }(config)),
      run_thread_(true) {
  start(api);
}
97
98
// Convenience constructor: forwards to the five-argument constructor with a freshly created
// TestInterlockHook.
GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
                           Api::Api& api, absl::string_view name)
    : GuardDogImpl(stats_scope, config, api, name, std::make_unique<TestInterlockHook>()) {
}
101
102
7.62k
// Shuts the guard dog down through stop() before the members are destroyed.
GuardDogImpl::~GuardDogImpl() {
  stop();
}
103
104
14.6k
void GuardDogImpl::step() {
105
  // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks
106
  // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in
107
  // a full iteration of the step() function to process recent watchdog touches and monotonic time
108
  // changes.
109
14.6k
  Thread::LockGuard guard(mutex_);
110
14.6k
  if (!run_thread_) {
111
26
    return;
112
26
  }
113
114
14.6k
  const auto now = time_source_.monotonicTime();
115
14.6k
  std::vector<std::pair<Thread::ThreadId, MonotonicTime>> miss_threads;
116
14.6k
  std::vector<std::pair<Thread::ThreadId, MonotonicTime>> mega_miss_threads;
117
118
14.6k
  {
119
14.6k
    std::vector<std::pair<Thread::ThreadId, MonotonicTime>> multi_kill_threads;
120
14.6k
    Thread::LockGuard guard(wd_lock_);
121
122
    // Compute the multikill threshold
123
14.6k
    const size_t required_for_multi_kill =
124
14.6k
        std::max(static_cast<size_t>(2),
125
14.6k
                 static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size())));
126
127
14.6k
    for (auto& watched_dog : watched_dogs_) {
128
985
      if (watched_dog->dog_->getTouchedAndReset()) {
129
        // Watchdog was touched since the guard dog last checked; update last check-in time.
130
985
        watched_dog->last_checkin_ = now;
131
985
        continue;
132
985
      }
133
134
18.4E
      const auto last_checkin = watched_dog->last_checkin_;
135
18.4E
      const auto tid = watched_dog->dog_->threadId();
136
18.4E
      const auto delta = now - last_checkin;
137
18.4E
      if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) {
138
0
        watched_dog->miss_alerted_ = false;
139
0
        watched_dog->megamiss_alerted_ = false;
140
0
      }
141
18.4E
      if (delta > miss_timeout_) {
142
0
        if (!watched_dog->miss_alerted_) {
143
0
          watchdog_miss_counter_.inc();
144
0
          watched_dog->miss_counter_.inc();
145
0
          watched_dog->last_alert_time_ = last_checkin;
146
0
          watched_dog->miss_alerted_ = true;
147
0
          miss_threads.emplace_back(tid, last_checkin);
148
0
        }
149
0
      }
150
18.4E
      if (delta > megamiss_timeout_) {
151
0
        if (!watched_dog->megamiss_alerted_) {
152
0
          watchdog_megamiss_counter_.inc();
153
0
          watched_dog->megamiss_counter_.inc();
154
0
          watched_dog->last_alert_time_ = last_checkin;
155
0
          watched_dog->megamiss_alerted_ = true;
156
0
          mega_miss_threads.emplace_back(tid, last_checkin);
157
0
        }
158
0
      }
159
18.4E
      if (killEnabled() && delta > kill_timeout_) {
160
0
        invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now);
161
0
      }
162
18.4E
      if (multikillEnabled() && delta > multi_kill_timeout_) {
163
0
        multi_kill_threads.emplace_back(tid, last_checkin);
164
165
0
        if (multi_kill_threads.size() >= required_for_multi_kill) {
166
0
          ENVOY_LOG_MISC(error, "Watchdog MULTIKILL as {} threads are stuck.",
167
0
                         multi_kill_threads.size());
168
0
          invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now);
169
0
        }
170
0
      }
171
18.4E
    }
172
14.6k
  }
173
174
  // Run megamiss and miss handlers
175
14.6k
  if (!mega_miss_threads.empty()) {
176
0
    invokeGuardDogActions(WatchDogAction::MEGAMISS, mega_miss_threads, now);
177
0
  }
178
179
14.6k
  if (!miss_threads.empty()) {
180
0
    invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now);
181
0
  }
182
183
14.6k
  test_interlock_hook_->signalFromImpl();
184
14.6k
  if (run_thread_) {
185
14.6k
    loop_timer_->enableTimer(loop_interval_);
186
14.6k
  }
187
14.6k
}
188
189
// Creates a watchdog for the thread `thread_id`, adds it to the set of watched dogs and registers
// it with `dispatcher`.
//
// @param thread_id id of the thread the new watchdog stands for.
// @param thread_name used to name the per-thread miss / megamiss counters.
// @param dispatcher dispatcher the watchdog is registered with.
// @return the newly created watchdog.
WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id,
                                               const std::string& thread_name,
                                               Event::Dispatcher& dispatcher) {
  auto watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id));
  WatchedDogPtr tracked = std::make_unique<WatchedDog>(stats_scope_, thread_name, watchdog);
  watchdog->touch();

  {
    // Only the container insertion needs wd_lock_.
    Thread::LockGuard guard(wd_lock_);
    watched_dogs_.push_back(std::move(tracked));
  }

  // The watchdog is registered with half of the minimum configured timeout as its interval.
  // loop_interval_ is const, so reading it outside the locked section is fine.
  dispatcher.registerWatchdog(watchdog, loop_interval_ / 2);
  watchdog->touch();
  return watchdog;
}
208
209
5.29k
// Removes the entry that wraps `wd` from the set of watched dogs. Passing a watchdog that is not
// being watched trips an ASSERT.
void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) {
  Thread::LockGuard guard(wd_lock_);

  const auto is_requested_dog = [&wd](const WatchedDogPtr& candidate) -> bool {
    return candidate->dog_ == wd;
  };
  const auto position = std::find_if(watched_dogs_.begin(), watched_dogs_.end(), is_requested_dog);

  if (position == watched_dogs_.end()) {
    ASSERT(false);
    return;
  }
  watched_dogs_.erase(position);
}
219
220
7.62k
// Spawns the guard dog thread and blocks until its dispatcher has run the first posted callback.
//
// @param api supplies the thread factory used to create the thread.
void GuardDogImpl::start(Api::Api& api) {
  Thread::LockGuard guard(mutex_);

  // Handshake between the calling thread and the guard dog thread.
  absl::Notification thread_running;

  // Body of the guard dog thread: arm the timer so the first step() fires right away, queue the
  // handshake callback, then run the dispatcher until exit() is called.
  auto thread_main = [this, &thread_running]() -> void {
    loop_timer_->enableTimer(std::chrono::milliseconds(0));
    dispatcher_->post([&thread_running]() { thread_running.Notify(); });
    dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit);
  };

  // See comments in WorkerImpl::start for the naming convention.
  Thread::Options options{absl::StrCat("dog:", dispatcher_->name())};
  thread_ = api.threadFactory().createThread(thread_main, options);

  thread_running.WaitForNotification();
}
238
239
7.62k
void GuardDogImpl::stop() {
240
7.62k
  {
241
7.62k
    Thread::LockGuard guard(mutex_);
242
7.62k
    run_thread_ = false;
243
7.62k
  }
244
7.62k
  dispatcher_->exit();
245
7.62k
  if (thread_) {
246
7.62k
    thread_->join();
247
7.62k
    thread_.reset();
248
7.62k
  }
249
7.62k
}
250
251
void GuardDogImpl::invokeGuardDogActions(
252
    WatchDogAction::WatchdogEvent event,
253
    std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs,
254
0
    MonotonicTime now) {
255
0
  const auto& registered_actions = events_to_actions_.find(event);
256
0
  if (registered_actions != events_to_actions_.end()) {
257
0
    for (auto& action : registered_actions->second) {
258
0
      action->run(event, thread_last_checkin_pairs, now);
259
0
    }
260
0
  }
261
0
}
262
263
// Pairs a watchdog with its per-thread counters, "server.<thread_name>.watchdog_miss" and
// "server.<thread_name>.watchdog_mega_miss", both created in `stats_scope`.
GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name,
                                     const WatchDogImplSharedPtr& watch_dog)
    : dog_(watch_dog),
      miss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name),
                                        stats_scope.symbolTable())
              .statName())),
      megamiss_counter_(stats_scope.counterFromStatName(
          Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_mega_miss", thread_name),
                                        stats_scope.symbolTable())
              .statName())) {
}
274
275
} // namespace Server
276
} // namespace Envoy