Line data Source code
1 : #include "source/server/guarddog_impl.h"
2 :
3 : #include <sys/types.h>
4 :
5 : #include <chrono>
6 : #include <memory>
7 : #include <utility>
8 : #include <vector>
9 :
10 : #include "envoy/common/time.h"
11 : #include "envoy/config/bootstrap/v3/bootstrap.pb.h"
12 : #include "envoy/server/configuration.h"
13 : #include "envoy/server/guarddog.h"
14 : #include "envoy/server/guarddog_config.h"
15 : #include "envoy/stats/scope.h"
16 : #include "envoy/watchdog/v3/abort_action.pb.h"
17 :
18 : #include "source/common/common/assert.h"
19 : #include "source/common/common/fmt.h"
20 : #include "source/common/common/lock_guard.h"
21 : #include "source/common/common/logger.h"
22 : #include "source/common/config/utility.h"
23 : #include "source/common/protobuf/utility.h"
24 : #include "source/common/stats/symbol_table.h"
25 : #include "source/server/watchdog_impl.h"
26 :
27 : #include "absl/synchronization/mutex.h"
28 : #include "absl/synchronization/notification.h"
29 :
30 : namespace Envoy {
31 : namespace Server {
32 :
33 : GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
34 : Api::Api& api, absl::string_view name,
35 : std::unique_ptr<TestInterlockHook>&& test_interlock)
36 : : test_interlock_hook_(std::move(test_interlock)), stats_scope_(stats_scope),
37 : time_source_(api.timeSource()), miss_timeout_(config.missTimeout()),
38 : megamiss_timeout_(config.megaMissTimeout()), kill_timeout_(config.killTimeout()),
39 : multi_kill_timeout_(config.multiKillTimeout()),
40 : multi_kill_fraction_(config.multiKillThreshold() / 100.0),
41 222 : loop_interval_([&]() -> std::chrono::milliseconds {
42 : // The loop interval is simply the minimum of all specified intervals,
43 : // but we must account for the 0=disabled case. This lambda takes care
44 : // of that and returns a value that initializes the const loop interval.
45 222 : const auto min_of_nonfatal = std::min(miss_timeout_, megamiss_timeout_);
46 222 : return std::min({killEnabled() ? kill_timeout_ : min_of_nonfatal,
47 222 : multikillEnabled() ? multi_kill_timeout_ : min_of_nonfatal,
48 222 : min_of_nonfatal});
49 222 : }()),
50 : watchdog_miss_counter_(stats_scope.counterFromStatName(
51 : Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_miss"),
52 : stats_scope.symbolTable())
53 : .statName())),
54 : watchdog_megamiss_counter_(stats_scope.counterFromStatName(
55 : Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_mega_miss"),
56 : stats_scope.symbolTable())
57 : .statName())),
58 : dispatcher_(api.allocateDispatcher(absl::StrCat(name, "_guarddog_thread"))),
59 524 : loop_timer_(dispatcher_->createTimer([this]() { step(); })),
60 222 : events_to_actions_([&](const Server::Configuration::Watchdog& config) -> EventToActionsMap {
61 222 : EventToActionsMap map;
62 :
63 : // We should be able to share the dispatcher since guard dog's lifetime
64 : // should eclipse those of actions.
65 222 : Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_, stats_scope,
66 222 : name};
67 :
68 222 : auto actions = config.actions();
69 :
70 : // Add default abort_action if kill and/or multi-kill is enabled.
71 222 : if (config.killTimeout().count() > 0) {
72 0 : envoy::watchdog::v3::AbortActionConfig abort_config;
73 0 : WatchDogAction* abort_action_config = actions.Add();
74 0 : abort_action_config->set_event(WatchDogAction::KILL);
75 0 : abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config);
76 0 : }
77 :
78 222 : if (config.multiKillTimeout().count() > 0) {
79 0 : envoy::watchdog::v3::AbortActionConfig abort_config;
80 0 : WatchDogAction* abort_action_config = actions.Add();
81 0 : abort_action_config->set_event(WatchDogAction::MULTIKILL);
82 0 : abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config);
83 0 : }
84 :
85 222 : for (const auto& action : actions) {
86 : // Get factory and add the created cb
87 0 : auto& factory = Config::Utility::getAndCheckFactory<Configuration::GuardDogActionFactory>(
88 0 : action.config());
89 0 : map[action.event()].push_back(factory.createGuardDogActionFromProto(action, context));
90 0 : }
91 :
92 222 : return map;
93 222 : }(config)),
94 222 : run_thread_(true) {
95 222 : start(api);
96 222 : }
97 :
98 : GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
99 : Api::Api& api, absl::string_view name)
100 222 : : GuardDogImpl(stats_scope, config, api, name, std::make_unique<TestInterlockHook>()) {}
101 :
102 222 : GuardDogImpl::~GuardDogImpl() { stop(); }
103 :
104 523 : void GuardDogImpl::step() {
105 : // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks
106 : // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in
107 : // a full iteration of the step() function to process recent watchdog touches and monotonic time
108 : // changes.
109 523 : Thread::LockGuard guard(mutex_);
110 523 : if (!run_thread_) {
111 2 : return;
112 2 : }
113 :
114 521 : const auto now = time_source_.monotonicTime();
115 521 : std::vector<std::pair<Thread::ThreadId, MonotonicTime>> miss_threads;
116 521 : std::vector<std::pair<Thread::ThreadId, MonotonicTime>> mega_miss_threads;
117 :
118 521 : {
119 521 : std::vector<std::pair<Thread::ThreadId, MonotonicTime>> multi_kill_threads;
120 521 : Thread::LockGuard guard(wd_lock_);
121 :
122 : // Compute the multikill threshold
123 521 : const size_t required_for_multi_kill =
124 521 : std::max(static_cast<size_t>(2),
125 521 : static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size())));
126 :
127 521 : for (auto& watched_dog : watched_dogs_) {
128 210 : if (watched_dog->dog_->getTouchedAndReset()) {
129 : // Watchdog was touched since the guard dog last checked; update last check-in time.
130 210 : watched_dog->last_checkin_ = now;
131 210 : continue;
132 210 : }
133 :
134 0 : const auto last_checkin = watched_dog->last_checkin_;
135 0 : const auto tid = watched_dog->dog_->threadId();
136 0 : const auto delta = now - last_checkin;
137 0 : if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) {
138 0 : watched_dog->miss_alerted_ = false;
139 0 : watched_dog->megamiss_alerted_ = false;
140 0 : }
141 0 : if (delta > miss_timeout_) {
142 0 : if (!watched_dog->miss_alerted_) {
143 0 : watchdog_miss_counter_.inc();
144 0 : watched_dog->miss_counter_.inc();
145 0 : watched_dog->last_alert_time_ = last_checkin;
146 0 : watched_dog->miss_alerted_ = true;
147 0 : miss_threads.emplace_back(tid, last_checkin);
148 0 : }
149 0 : }
150 0 : if (delta > megamiss_timeout_) {
151 0 : if (!watched_dog->megamiss_alerted_) {
152 0 : watchdog_megamiss_counter_.inc();
153 0 : watched_dog->megamiss_counter_.inc();
154 0 : watched_dog->last_alert_time_ = last_checkin;
155 0 : watched_dog->megamiss_alerted_ = true;
156 0 : mega_miss_threads.emplace_back(tid, last_checkin);
157 0 : }
158 0 : }
159 0 : if (killEnabled() && delta > kill_timeout_) {
160 0 : invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now);
161 0 : }
162 0 : if (multikillEnabled() && delta > multi_kill_timeout_) {
163 0 : multi_kill_threads.emplace_back(tid, last_checkin);
164 :
165 0 : if (multi_kill_threads.size() >= required_for_multi_kill) {
166 0 : ENVOY_LOG_MISC(error, "Watchdog MULTIKILL as {} threads are stuck.",
167 0 : multi_kill_threads.size());
168 0 : invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now);
169 0 : }
170 0 : }
171 0 : }
172 521 : }
173 :
174 : // Run megamiss and miss handlers
175 521 : if (!mega_miss_threads.empty()) {
176 0 : invokeGuardDogActions(WatchDogAction::MEGAMISS, mega_miss_threads, now);
177 0 : }
178 :
179 521 : if (!miss_threads.empty()) {
180 0 : invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now);
181 0 : }
182 :
183 521 : test_interlock_hook_->signalFromImpl();
184 522 : if (run_thread_) {
185 522 : loop_timer_->enableTimer(loop_interval_);
186 522 : }
187 521 : }
188 :
189 : WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id,
190 : const std::string& thread_name,
191 192 : Event::Dispatcher& dispatcher) {
192 : // Timer started by WatchDog will try to fire at 1/2 of the interval of the
193 : // minimum timeout specified. loop_interval_ is const so all shared state
194 : // accessed out of the locked section below is const (time_source_ has no
195 : // state).
196 192 : const auto wd_interval = loop_interval_ / 2;
197 192 : auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id));
198 192 : WatchedDogPtr watched_dog = std::make_unique<WatchedDog>(stats_scope_, thread_name, new_watchdog);
199 192 : new_watchdog->touch();
200 192 : {
201 192 : Thread::LockGuard guard(wd_lock_);
202 192 : watched_dogs_.push_back(std::move(watched_dog));
203 192 : }
204 192 : dispatcher.registerWatchdog(new_watchdog, wd_interval);
205 192 : new_watchdog->touch();
206 192 : return new_watchdog;
207 192 : }
208 :
209 192 : void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) {
210 192 : Thread::LockGuard guard(wd_lock_);
211 192 : auto found_wd = std::find_if(watched_dogs_.begin(), watched_dogs_.end(),
212 192 : [&wd](const WatchedDogPtr& d) -> bool { return d->dog_ == wd; });
213 192 : if (found_wd != watched_dogs_.end()) {
214 192 : watched_dogs_.erase(found_wd);
215 192 : } else {
216 0 : ASSERT(false);
217 0 : }
218 192 : }
219 :
220 222 : void GuardDogImpl::start(Api::Api& api) {
221 222 : Thread::LockGuard guard(mutex_);
222 :
223 : // Synchronize between calling thread and guarddog thread.
224 222 : absl::Notification guarddog_thread_started;
225 :
226 : // See comments in WorkerImpl::start for the naming convention.
227 222 : Thread::Options options{absl::StrCat("dog:", dispatcher_->name())};
228 222 : thread_ = api.threadFactory().createThread(
229 222 : [this, &guarddog_thread_started]() -> void {
230 222 : loop_timer_->enableTimer(std::chrono::milliseconds(0));
231 222 : dispatcher_->post([&guarddog_thread_started]() { guarddog_thread_started.Notify(); });
232 222 : dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit);
233 222 : },
234 222 : options);
235 :
236 222 : guarddog_thread_started.WaitForNotification();
237 222 : }
238 :
239 222 : void GuardDogImpl::stop() {
240 222 : {
241 222 : Thread::LockGuard guard(mutex_);
242 222 : run_thread_ = false;
243 222 : }
244 222 : dispatcher_->exit();
245 222 : if (thread_) {
246 222 : thread_->join();
247 222 : thread_.reset();
248 222 : }
249 222 : }
250 :
251 : void GuardDogImpl::invokeGuardDogActions(
252 : WatchDogAction::WatchdogEvent event,
253 : std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs,
254 0 : MonotonicTime now) {
255 0 : const auto& registered_actions = events_to_actions_.find(event);
256 0 : if (registered_actions != events_to_actions_.end()) {
257 0 : for (auto& action : registered_actions->second) {
258 0 : action->run(event, thread_last_checkin_pairs, now);
259 0 : }
260 0 : }
261 0 : }
262 :
263 : GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name,
264 : const WatchDogImplSharedPtr& watch_dog)
265 : : dog_(watch_dog),
266 : miss_counter_(stats_scope.counterFromStatName(
267 : Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name),
268 : stats_scope.symbolTable())
269 : .statName())),
270 : megamiss_counter_(stats_scope.counterFromStatName(
271 : Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_mega_miss", thread_name),
272 : stats_scope.symbolTable())
273 192 : .statName())) {}
274 :
275 : } // namespace Server
276 : } // namespace Envoy
|