/proc/self/cwd/source/server/guarddog_impl.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "source/server/guarddog_impl.h" |
2 | | |
3 | | #include <sys/types.h> |
4 | | |
5 | | #include <chrono> |
6 | | #include <memory> |
7 | | #include <utility> |
8 | | #include <vector> |
9 | | |
10 | | #include "envoy/common/time.h" |
11 | | #include "envoy/config/bootstrap/v3/bootstrap.pb.h" |
12 | | #include "envoy/server/configuration.h" |
13 | | #include "envoy/server/guarddog.h" |
14 | | #include "envoy/server/guarddog_config.h" |
15 | | #include "envoy/stats/scope.h" |
16 | | #include "envoy/watchdog/v3/abort_action.pb.h" |
17 | | |
18 | | #include "source/common/common/assert.h" |
19 | | #include "source/common/common/fmt.h" |
20 | | #include "source/common/common/lock_guard.h" |
21 | | #include "source/common/common/logger.h" |
22 | | #include "source/common/config/utility.h" |
23 | | #include "source/common/protobuf/utility.h" |
24 | | #include "source/common/stats/symbol_table.h" |
25 | | #include "source/server/watchdog_impl.h" |
26 | | |
27 | | #include "absl/synchronization/mutex.h" |
28 | | #include "absl/synchronization/notification.h" |
29 | | |
30 | | namespace Envoy { |
31 | | namespace Server { |
32 | | |
33 | | GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config, |
34 | | Api::Api& api, absl::string_view name, |
35 | | std::unique_ptr<TestInterlockHook>&& test_interlock) |
36 | | : test_interlock_hook_(std::move(test_interlock)), stats_scope_(stats_scope), |
37 | | time_source_(api.timeSource()), miss_timeout_(config.missTimeout()), |
38 | | megamiss_timeout_(config.megaMissTimeout()), kill_timeout_(config.killTimeout()), |
39 | | multi_kill_timeout_(config.multiKillTimeout()), |
40 | | multi_kill_fraction_(config.multiKillThreshold() / 100.0), |
41 | 7.62k | loop_interval_([&]() -> std::chrono::milliseconds { |
42 | | // The loop interval is simply the minimum of all specified intervals, |
43 | | // but we must account for the 0=disabled case. This lambda takes care |
44 | | // of that and returns a value that initializes the const loop interval. |
45 | 7.62k | const auto min_of_nonfatal = std::min(miss_timeout_, megamiss_timeout_); |
46 | 7.62k | return std::min({killEnabled() ? kill_timeout_ : min_of_nonfatal, |
47 | 7.62k | multikillEnabled() ? multi_kill_timeout_ : min_of_nonfatal, |
48 | 7.62k | min_of_nonfatal}); |
49 | 7.62k | }()), |
50 | | watchdog_miss_counter_(stats_scope.counterFromStatName( |
51 | | Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_miss"), |
52 | | stats_scope.symbolTable()) |
53 | | .statName())), |
54 | | watchdog_megamiss_counter_(stats_scope.counterFromStatName( |
55 | | Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_mega_miss"), |
56 | | stats_scope.symbolTable()) |
57 | | .statName())), |
58 | | dispatcher_(api.allocateDispatcher(absl::StrCat(name, "_guarddog_thread"))), |
59 | 14.6k | loop_timer_(dispatcher_->createTimer([this]() { step(); })), |
60 | 7.62k | events_to_actions_([&](const Server::Configuration::Watchdog& config) -> EventToActionsMap { |
61 | 7.62k | EventToActionsMap map; |
62 | | |
63 | | // We should be able to share the dispatcher since guard dog's lifetime |
64 | | // should eclipse those of actions. |
65 | 7.62k | Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_, stats_scope, |
66 | 7.62k | name}; |
67 | | |
68 | 7.62k | auto actions = config.actions(); |
69 | | |
70 | | // Add default abort_action if kill and/or multi-kill is enabled. |
71 | 7.62k | if (config.killTimeout().count() > 0) { |
72 | 18 | envoy::watchdog::v3::AbortActionConfig abort_config; |
73 | 18 | WatchDogAction* abort_action_config = actions.Add(); |
74 | 18 | abort_action_config->set_event(WatchDogAction::KILL); |
75 | 18 | abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); |
76 | 18 | } |
77 | | |
78 | 7.62k | if (config.multiKillTimeout().count() > 0) { |
79 | 2 | envoy::watchdog::v3::AbortActionConfig abort_config; |
80 | 2 | WatchDogAction* abort_action_config = actions.Add(); |
81 | 2 | abort_action_config->set_event(WatchDogAction::MULTIKILL); |
82 | 2 | abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); |
83 | 2 | } |
84 | | |
85 | 7.62k | for (const auto& action : actions) { |
86 | | // Get factory and add the created cb |
87 | 27 | auto& factory = Config::Utility::getAndCheckFactory<Configuration::GuardDogActionFactory>( |
88 | 27 | action.config()); |
89 | 27 | map[action.event()].push_back(factory.createGuardDogActionFromProto(action, context)); |
90 | 27 | } |
91 | | |
92 | 7.62k | return map; |
93 | 7.62k | }(config)), |
94 | 7.62k | run_thread_(true) { |
95 | 7.62k | start(api); |
96 | 7.62k | } |
97 | | |
98 | | GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config, |
99 | | Api::Api& api, absl::string_view name) |
100 | 7.62k | : GuardDogImpl(stats_scope, config, api, name, std::make_unique<TestInterlockHook>()) {} |
101 | | |
102 | 7.62k | GuardDogImpl::~GuardDogImpl() { stop(); } |
103 | | |
104 | 14.6k | void GuardDogImpl::step() { |
105 | | // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks |
106 | | // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in |
107 | | // a full iteration of the step() function to process recent watchdog touches and monotonic time |
108 | | // changes. |
109 | 14.6k | Thread::LockGuard guard(mutex_); |
110 | 14.6k | if (!run_thread_) { |
111 | 26 | return; |
112 | 26 | } |
113 | | |
114 | 14.6k | const auto now = time_source_.monotonicTime(); |
115 | 14.6k | std::vector<std::pair<Thread::ThreadId, MonotonicTime>> miss_threads; |
116 | 14.6k | std::vector<std::pair<Thread::ThreadId, MonotonicTime>> mega_miss_threads; |
117 | | |
118 | 14.6k | { |
119 | 14.6k | std::vector<std::pair<Thread::ThreadId, MonotonicTime>> multi_kill_threads; |
120 | 14.6k | Thread::LockGuard guard(wd_lock_); |
121 | | |
122 | | // Compute the multikill threshold |
123 | 14.6k | const size_t required_for_multi_kill = |
124 | 14.6k | std::max(static_cast<size_t>(2), |
125 | 14.6k | static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size()))); |
126 | | |
127 | 14.6k | for (auto& watched_dog : watched_dogs_) { |
128 | 985 | if (watched_dog->dog_->getTouchedAndReset()) { |
129 | | // Watchdog was touched since the guard dog last checked; update last check-in time. |
130 | 985 | watched_dog->last_checkin_ = now; |
131 | 985 | continue; |
132 | 985 | } |
133 | | |
134 | 18.4E | const auto last_checkin = watched_dog->last_checkin_; |
135 | 18.4E | const auto tid = watched_dog->dog_->threadId(); |
136 | 18.4E | const auto delta = now - last_checkin; |
137 | 18.4E | if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) { |
138 | 0 | watched_dog->miss_alerted_ = false; |
139 | 0 | watched_dog->megamiss_alerted_ = false; |
140 | 0 | } |
141 | 18.4E | if (delta > miss_timeout_) { |
142 | 0 | if (!watched_dog->miss_alerted_) { |
143 | 0 | watchdog_miss_counter_.inc(); |
144 | 0 | watched_dog->miss_counter_.inc(); |
145 | 0 | watched_dog->last_alert_time_ = last_checkin; |
146 | 0 | watched_dog->miss_alerted_ = true; |
147 | 0 | miss_threads.emplace_back(tid, last_checkin); |
148 | 0 | } |
149 | 0 | } |
150 | 18.4E | if (delta > megamiss_timeout_) { |
151 | 0 | if (!watched_dog->megamiss_alerted_) { |
152 | 0 | watchdog_megamiss_counter_.inc(); |
153 | 0 | watched_dog->megamiss_counter_.inc(); |
154 | 0 | watched_dog->last_alert_time_ = last_checkin; |
155 | 0 | watched_dog->megamiss_alerted_ = true; |
156 | 0 | mega_miss_threads.emplace_back(tid, last_checkin); |
157 | 0 | } |
158 | 0 | } |
159 | 18.4E | if (killEnabled() && delta > kill_timeout_) { |
160 | 0 | invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now); |
161 | 0 | } |
162 | 18.4E | if (multikillEnabled() && delta > multi_kill_timeout_) { |
163 | 0 | multi_kill_threads.emplace_back(tid, last_checkin); |
164 | |
|
165 | 0 | if (multi_kill_threads.size() >= required_for_multi_kill) { |
166 | 0 | ENVOY_LOG_MISC(error, "Watchdog MULTIKILL as {} threads are stuck.", |
167 | 0 | multi_kill_threads.size()); |
168 | 0 | invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now); |
169 | 0 | } |
170 | 0 | } |
171 | 18.4E | } |
172 | 14.6k | } |
173 | | |
174 | | // Run megamiss and miss handlers |
175 | 14.6k | if (!mega_miss_threads.empty()) { |
176 | 0 | invokeGuardDogActions(WatchDogAction::MEGAMISS, mega_miss_threads, now); |
177 | 0 | } |
178 | | |
179 | 14.6k | if (!miss_threads.empty()) { |
180 | 0 | invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now); |
181 | 0 | } |
182 | | |
183 | 14.6k | test_interlock_hook_->signalFromImpl(); |
184 | 14.6k | if (run_thread_) { |
185 | 14.6k | loop_timer_->enableTimer(loop_interval_); |
186 | 14.6k | } |
187 | 14.6k | } |
188 | | |
189 | | WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id, |
190 | | const std::string& thread_name, |
191 | 5.29k | Event::Dispatcher& dispatcher) { |
192 | | // Timer started by WatchDog will try to fire at 1/2 of the interval of the |
193 | | // minimum timeout specified. loop_interval_ is const so all shared state |
194 | | // accessed out of the locked section below is const (time_source_ has no |
195 | | // state). |
196 | 5.29k | const auto wd_interval = loop_interval_ / 2; |
197 | 5.29k | auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id)); |
198 | 5.29k | WatchedDogPtr watched_dog = std::make_unique<WatchedDog>(stats_scope_, thread_name, new_watchdog); |
199 | 5.29k | new_watchdog->touch(); |
200 | 5.29k | { |
201 | 5.29k | Thread::LockGuard guard(wd_lock_); |
202 | 5.29k | watched_dogs_.push_back(std::move(watched_dog)); |
203 | 5.29k | } |
204 | 5.29k | dispatcher.registerWatchdog(new_watchdog, wd_interval); |
205 | 5.29k | new_watchdog->touch(); |
206 | 5.29k | return new_watchdog; |
207 | 5.29k | } |
208 | | |
209 | 5.29k | void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) { |
210 | 5.29k | Thread::LockGuard guard(wd_lock_); |
211 | 5.29k | auto found_wd = std::find_if(watched_dogs_.begin(), watched_dogs_.end(), |
212 | 5.29k | [&wd](const WatchedDogPtr& d) -> bool { return d->dog_ == wd; }); |
213 | 5.29k | if (found_wd != watched_dogs_.end()) { |
214 | 5.29k | watched_dogs_.erase(found_wd); |
215 | 5.29k | } else { |
216 | 0 | ASSERT(false); |
217 | 0 | } |
218 | 5.29k | } |
219 | | |
220 | 7.62k | void GuardDogImpl::start(Api::Api& api) { |
221 | 7.62k | Thread::LockGuard guard(mutex_); |
222 | | |
223 | | // Synchronize between calling thread and guarddog thread. |
224 | 7.62k | absl::Notification guarddog_thread_started; |
225 | | |
226 | | // See comments in WorkerImpl::start for the naming convention. |
227 | 7.62k | Thread::Options options{absl::StrCat("dog:", dispatcher_->name())}; |
228 | 7.62k | thread_ = api.threadFactory().createThread( |
229 | 7.62k | [this, &guarddog_thread_started]() -> void { |
230 | 7.62k | loop_timer_->enableTimer(std::chrono::milliseconds(0)); |
231 | 7.62k | dispatcher_->post([&guarddog_thread_started]() { guarddog_thread_started.Notify(); }); |
232 | 7.62k | dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit); |
233 | 7.62k | }, |
234 | 7.62k | options); |
235 | | |
236 | 7.62k | guarddog_thread_started.WaitForNotification(); |
237 | 7.62k | } |
238 | | |
239 | 7.62k | void GuardDogImpl::stop() { |
240 | 7.62k | { |
241 | 7.62k | Thread::LockGuard guard(mutex_); |
242 | 7.62k | run_thread_ = false; |
243 | 7.62k | } |
244 | 7.62k | dispatcher_->exit(); |
245 | 7.62k | if (thread_) { |
246 | 7.62k | thread_->join(); |
247 | 7.62k | thread_.reset(); |
248 | 7.62k | } |
249 | 7.62k | } |
250 | | |
251 | | void GuardDogImpl::invokeGuardDogActions( |
252 | | WatchDogAction::WatchdogEvent event, |
253 | | std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs, |
254 | 0 | MonotonicTime now) { |
255 | 0 | const auto& registered_actions = events_to_actions_.find(event); |
256 | 0 | if (registered_actions != events_to_actions_.end()) { |
257 | 0 | for (auto& action : registered_actions->second) { |
258 | 0 | action->run(event, thread_last_checkin_pairs, now); |
259 | 0 | } |
260 | 0 | } |
261 | 0 | } |
262 | | |
263 | | GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, |
264 | | const WatchDogImplSharedPtr& watch_dog) |
265 | | : dog_(watch_dog), |
266 | | miss_counter_(stats_scope.counterFromStatName( |
267 | | Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name), |
268 | | stats_scope.symbolTable()) |
269 | | .statName())), |
270 | | megamiss_counter_(stats_scope.counterFromStatName( |
271 | | Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_mega_miss", thread_name), |
272 | | stats_scope.symbolTable()) |
273 | 5.29k | .statName())) {} |
274 | | |
275 | | } // namespace Server |
276 | | } // namespace Envoy |