LCOV - code coverage report
Current view: top level - source/server - guarddog_impl.cc (source / functions) Hit Total Coverage
Test: coverage.dat Lines: 95 160 59.4 %
Date: 2024-01-05 06:35:25 Functions: 15 16 93.8 %

          Line data    Source code
       1             : #include "source/server/guarddog_impl.h"
       2             : 
       3             : #include <sys/types.h>
       4             : 
       5             : #include <chrono>
       6             : #include <memory>
       7             : #include <utility>
       8             : #include <vector>
       9             : 
      10             : #include "envoy/common/time.h"
      11             : #include "envoy/config/bootstrap/v3/bootstrap.pb.h"
      12             : #include "envoy/server/configuration.h"
      13             : #include "envoy/server/guarddog.h"
      14             : #include "envoy/server/guarddog_config.h"
      15             : #include "envoy/stats/scope.h"
      16             : #include "envoy/watchdog/v3/abort_action.pb.h"
      17             : 
      18             : #include "source/common/common/assert.h"
      19             : #include "source/common/common/fmt.h"
      20             : #include "source/common/common/lock_guard.h"
      21             : #include "source/common/common/logger.h"
      22             : #include "source/common/config/utility.h"
      23             : #include "source/common/protobuf/utility.h"
      24             : #include "source/common/stats/symbol_table.h"
      25             : #include "source/server/watchdog_impl.h"
      26             : 
      27             : #include "absl/synchronization/mutex.h"
      28             : #include "absl/synchronization/notification.h"
      29             : 
      30             : namespace Envoy {
      31             : namespace Server {
      32             : 
      33             : GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
      34             :                            Api::Api& api, absl::string_view name,
      35             :                            std::unique_ptr<TestInterlockHook>&& test_interlock)
      36             :     : test_interlock_hook_(std::move(test_interlock)), stats_scope_(stats_scope),
      37             :       time_source_(api.timeSource()), miss_timeout_(config.missTimeout()),
      38             :       megamiss_timeout_(config.megaMissTimeout()), kill_timeout_(config.killTimeout()),
      39             :       multi_kill_timeout_(config.multiKillTimeout()),
      40             :       multi_kill_fraction_(config.multiKillThreshold() / 100.0),
      41         222 :       loop_interval_([&]() -> std::chrono::milliseconds {
      42             :         // The loop interval is simply the minimum of all specified intervals,
      43             :         // but we must account for the 0=disabled case. This lambda takes care
      44             :         // of that and returns a value that initializes the const loop interval.
      45         222 :         const auto min_of_nonfatal = std::min(miss_timeout_, megamiss_timeout_);
      46         222 :         return std::min({killEnabled() ? kill_timeout_ : min_of_nonfatal,
      47         222 :                          multikillEnabled() ? multi_kill_timeout_ : min_of_nonfatal,
      48         222 :                          min_of_nonfatal});
      49         222 :       }()),
      50             :       watchdog_miss_counter_(stats_scope.counterFromStatName(
      51             :           Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_miss"),
      52             :                                         stats_scope.symbolTable())
      53             :               .statName())),
      54             :       watchdog_megamiss_counter_(stats_scope.counterFromStatName(
      55             :           Stats::StatNameManagedStorage(absl::StrCat(name, ".watchdog_mega_miss"),
      56             :                                         stats_scope.symbolTable())
      57             :               .statName())),
      58             :       dispatcher_(api.allocateDispatcher(absl::StrCat(name, "_guarddog_thread"))),
      59         524 :       loop_timer_(dispatcher_->createTimer([this]() { step(); })),
      60         222 :       events_to_actions_([&](const Server::Configuration::Watchdog& config) -> EventToActionsMap {
      61         222 :         EventToActionsMap map;
      62             : 
      63             :         // We should be able to share the dispatcher since guard dog's lifetime
      64             :         // should eclipse those of actions.
      65         222 :         Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_, stats_scope,
      66         222 :                                                                name};
      67             : 
      68         222 :         auto actions = config.actions();
      69             : 
      70             :         // Add default abort_action if kill and/or multi-kill is enabled.
      71         222 :         if (config.killTimeout().count() > 0) {
      72           0 :           envoy::watchdog::v3::AbortActionConfig abort_config;
      73           0 :           WatchDogAction* abort_action_config = actions.Add();
      74           0 :           abort_action_config->set_event(WatchDogAction::KILL);
      75           0 :           abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config);
      76           0 :         }
      77             : 
      78         222 :         if (config.multiKillTimeout().count() > 0) {
      79           0 :           envoy::watchdog::v3::AbortActionConfig abort_config;
      80           0 :           WatchDogAction* abort_action_config = actions.Add();
      81           0 :           abort_action_config->set_event(WatchDogAction::MULTIKILL);
      82           0 :           abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config);
      83           0 :         }
      84             : 
      85         222 :         for (const auto& action : actions) {
      86             :           // Get factory and add the created cb
      87           0 :           auto& factory = Config::Utility::getAndCheckFactory<Configuration::GuardDogActionFactory>(
      88           0 :               action.config());
      89           0 :           map[action.event()].push_back(factory.createGuardDogActionFromProto(action, context));
      90           0 :         }
      91             : 
      92         222 :         return map;
      93         222 :       }(config)),
      94         222 :       run_thread_(true) {
      95         222 :   start(api);
      96         222 : }
      97             : 
      98             : GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config,
      99             :                            Api::Api& api, absl::string_view name)
     100         222 :     : GuardDogImpl(stats_scope, config, api, name, std::make_unique<TestInterlockHook>()) {}
     101             : 
     102         222 : GuardDogImpl::~GuardDogImpl() { stop(); }
     103             : 
     104         523 : void GuardDogImpl::step() {
     105             :   // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks
     106             :   // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in
     107             :   // a full iteration of the step() function to process recent watchdog touches and monotonic time
     108             :   // changes.
     109         523 :   Thread::LockGuard guard(mutex_);
     110         523 :   if (!run_thread_) {
     111           2 :     return;
     112           2 :   }
     113             : 
     114         521 :   const auto now = time_source_.monotonicTime();
     115         521 :   std::vector<std::pair<Thread::ThreadId, MonotonicTime>> miss_threads;
     116         521 :   std::vector<std::pair<Thread::ThreadId, MonotonicTime>> mega_miss_threads;
     117             : 
     118         521 :   {
     119         521 :     std::vector<std::pair<Thread::ThreadId, MonotonicTime>> multi_kill_threads;
     120         521 :     Thread::LockGuard guard(wd_lock_);
     121             : 
     122             :     // Compute the multikill threshold
     123         521 :     const size_t required_for_multi_kill =
     124         521 :         std::max(static_cast<size_t>(2),
     125         521 :                  static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size())));
     126             : 
     127         521 :     for (auto& watched_dog : watched_dogs_) {
     128         210 :       if (watched_dog->dog_->getTouchedAndReset()) {
     129             :         // Watchdog was touched since the guard dog last checked; update last check-in time.
     130         210 :         watched_dog->last_checkin_ = now;
     131         210 :         continue;
     132         210 :       }
     133             : 
     134           0 :       const auto last_checkin = watched_dog->last_checkin_;
     135           0 :       const auto tid = watched_dog->dog_->threadId();
     136           0 :       const auto delta = now - last_checkin;
     137           0 :       if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) {
     138           0 :         watched_dog->miss_alerted_ = false;
     139           0 :         watched_dog->megamiss_alerted_ = false;
     140           0 :       }
     141           0 :       if (delta > miss_timeout_) {
     142           0 :         if (!watched_dog->miss_alerted_) {
     143           0 :           watchdog_miss_counter_.inc();
     144           0 :           watched_dog->miss_counter_.inc();
     145           0 :           watched_dog->last_alert_time_ = last_checkin;
     146           0 :           watched_dog->miss_alerted_ = true;
     147           0 :           miss_threads.emplace_back(tid, last_checkin);
     148           0 :         }
     149           0 :       }
     150           0 :       if (delta > megamiss_timeout_) {
     151           0 :         if (!watched_dog->megamiss_alerted_) {
     152           0 :           watchdog_megamiss_counter_.inc();
     153           0 :           watched_dog->megamiss_counter_.inc();
     154           0 :           watched_dog->last_alert_time_ = last_checkin;
     155           0 :           watched_dog->megamiss_alerted_ = true;
     156           0 :           mega_miss_threads.emplace_back(tid, last_checkin);
     157           0 :         }
     158           0 :       }
     159           0 :       if (killEnabled() && delta > kill_timeout_) {
     160           0 :         invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now);
     161           0 :       }
     162           0 :       if (multikillEnabled() && delta > multi_kill_timeout_) {
     163           0 :         multi_kill_threads.emplace_back(tid, last_checkin);
     164             : 
     165           0 :         if (multi_kill_threads.size() >= required_for_multi_kill) {
     166           0 :           ENVOY_LOG_MISC(error, "Watchdog MULTIKILL as {} threads are stuck.",
     167           0 :                          multi_kill_threads.size());
     168           0 :           invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now);
     169           0 :         }
     170           0 :       }
     171           0 :     }
     172         521 :   }
     173             : 
     174             :   // Run megamiss and miss handlers
     175         521 :   if (!mega_miss_threads.empty()) {
     176           0 :     invokeGuardDogActions(WatchDogAction::MEGAMISS, mega_miss_threads, now);
     177           0 :   }
     178             : 
     179         521 :   if (!miss_threads.empty()) {
     180           0 :     invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now);
     181           0 :   }
     182             : 
     183         521 :   test_interlock_hook_->signalFromImpl();
     184         522 :   if (run_thread_) {
     185         522 :     loop_timer_->enableTimer(loop_interval_);
     186         522 :   }
     187         521 : }
     188             : 
     189             : WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id,
     190             :                                                const std::string& thread_name,
     191         192 :                                                Event::Dispatcher& dispatcher) {
     192             :   // Timer started by WatchDog will try to fire at 1/2 of the interval of the
     193             :   // minimum timeout specified. loop_interval_ is const so all shared state
     194             :   // accessed out of the locked section below is const (time_source_ has no
     195             :   // state).
     196         192 :   const auto wd_interval = loop_interval_ / 2;
     197         192 :   auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id));
     198         192 :   WatchedDogPtr watched_dog = std::make_unique<WatchedDog>(stats_scope_, thread_name, new_watchdog);
     199         192 :   new_watchdog->touch();
     200         192 :   {
     201         192 :     Thread::LockGuard guard(wd_lock_);
     202         192 :     watched_dogs_.push_back(std::move(watched_dog));
     203         192 :   }
     204         192 :   dispatcher.registerWatchdog(new_watchdog, wd_interval);
     205         192 :   new_watchdog->touch();
     206         192 :   return new_watchdog;
     207         192 : }
     208             : 
     209         192 : void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) {
     210         192 :   Thread::LockGuard guard(wd_lock_);
     211         192 :   auto found_wd = std::find_if(watched_dogs_.begin(), watched_dogs_.end(),
     212         192 :                                [&wd](const WatchedDogPtr& d) -> bool { return d->dog_ == wd; });
     213         192 :   if (found_wd != watched_dogs_.end()) {
     214         192 :     watched_dogs_.erase(found_wd);
     215         192 :   } else {
     216           0 :     ASSERT(false);
     217           0 :   }
     218         192 : }
     219             : 
     220         222 : void GuardDogImpl::start(Api::Api& api) {
     221         222 :   Thread::LockGuard guard(mutex_);
     222             : 
     223             :   // Synchronize between calling thread and guarddog thread.
     224         222 :   absl::Notification guarddog_thread_started;
     225             : 
     226             :   // See comments in WorkerImpl::start for the naming convention.
     227         222 :   Thread::Options options{absl::StrCat("dog:", dispatcher_->name())};
     228         222 :   thread_ = api.threadFactory().createThread(
     229         222 :       [this, &guarddog_thread_started]() -> void {
     230         222 :         loop_timer_->enableTimer(std::chrono::milliseconds(0));
     231         222 :         dispatcher_->post([&guarddog_thread_started]() { guarddog_thread_started.Notify(); });
     232         222 :         dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit);
     233         222 :       },
     234         222 :       options);
     235             : 
     236         222 :   guarddog_thread_started.WaitForNotification();
     237         222 : }
     238             : 
     239         222 : void GuardDogImpl::stop() {
     240         222 :   {
     241         222 :     Thread::LockGuard guard(mutex_);
     242         222 :     run_thread_ = false;
     243         222 :   }
     244         222 :   dispatcher_->exit();
     245         222 :   if (thread_) {
     246         222 :     thread_->join();
     247         222 :     thread_.reset();
     248         222 :   }
     249         222 : }
     250             : 
     251             : void GuardDogImpl::invokeGuardDogActions(
     252             :     WatchDogAction::WatchdogEvent event,
     253             :     std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs,
     254           0 :     MonotonicTime now) {
     255           0 :   const auto& registered_actions = events_to_actions_.find(event);
     256           0 :   if (registered_actions != events_to_actions_.end()) {
     257           0 :     for (auto& action : registered_actions->second) {
     258           0 :       action->run(event, thread_last_checkin_pairs, now);
     259           0 :     }
     260           0 :   }
     261           0 : }
     262             : 
     263             : GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name,
     264             :                                      const WatchDogImplSharedPtr& watch_dog)
     265             :     : dog_(watch_dog),
     266             :       miss_counter_(stats_scope.counterFromStatName(
     267             :           Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name),
     268             :                                         stats_scope.symbolTable())
     269             :               .statName())),
     270             :       megamiss_counter_(stats_scope.counterFromStatName(
     271             :           Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_mega_miss", thread_name),
     272             :                                         stats_scope.symbolTable())
     273         192 :               .statName())) {}
     274             : 
     275             : } // namespace Server
     276             : } // namespace Envoy

Generated by: LCOV version 1.15