Line data Source code
1 : #pragma once 2 : 3 : #include <chrono> 4 : #include <vector> 5 : 6 : #include "envoy/api/api.h" 7 : #include "envoy/config/bootstrap/v3/bootstrap.pb.h" 8 : #include "envoy/event/timer.h" 9 : #include "envoy/server/configuration.h" 10 : #include "envoy/server/guarddog.h" 11 : #include "envoy/server/guarddog_config.h" 12 : #include "envoy/server/watchdog.h" 13 : #include "envoy/stats/scope.h" 14 : #include "envoy/stats/stats.h" 15 : 16 : #include "source/common/common/lock_guard.h" 17 : #include "source/common/common/logger.h" 18 : #include "source/common/common/thread.h" 19 : #include "source/common/event/libevent.h" 20 : #include "source/server/watchdog_impl.h" 21 : 22 : #include "absl/types/optional.h" 23 : 24 : namespace Envoy { 25 : namespace Server { 26 : 27 : /** 28 : * This feature performs deadlock detection stats collection & enforcement. 29 : * 30 : * It launches a thread that scans at an interval the minimum of the configured 31 : * intervals. If it finds starved threads or suspected deadlocks it will take 32 : * the appropriate action depending on the config parameters described below. 33 : * 34 : * Thread lifetime is tied to GuardDog object lifetime (RAII style). 35 : */ 36 : class GuardDogImpl : public GuardDog { 37 : public: 38 : /** 39 : * Defines a test interlock hook to enable tests to synchronize the guard-dog 40 : * execution so they can probe current counter values. The default 41 : * implementation that runs in production has empty methods, which are 42 : * overridden in the implementation used during tests. 43 : */ 44 : class TestInterlockHook { 45 : public: 46 222 : virtual ~TestInterlockHook() = default; 47 : 48 : /** 49 : * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs up to a particular 50 : * point in time. Called while the GuardDog mutex is held. 51 : */ 52 522 : virtual void signalFromImpl() {} 53 : 54 : /** 55 : * Called from GuardDog tests to block until the implementation has reached the desired 56 : * condition. Called while the GuardDog mutex is held. 57 : * @param mutex The GuardDog's mutex for use by Thread::CondVar::wait. 58 : */ 59 0 : virtual void waitFromTest(Thread::MutexBasicLockable& /*mutex*/) {} 60 : }; 61 : 62 : /** 63 : * @param stats_scope Statistics scope to write watchdog_miss and 64 : * watchdog_mega_miss events into. 65 : * @param config Configuration object. 66 : * @param api API object. 67 : * @param test_interlock a hook for enabling interlock with unit tests. 68 : * 69 : * See the configuration documentation for details on the timeout settings. 70 : */ 71 : GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config, 72 : Api::Api& api, absl::string_view name, 73 : std::unique_ptr<TestInterlockHook>&& test_interlock); 74 : GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Watchdog& config, 75 : Api::Api& api, absl::string_view name); 76 : ~GuardDogImpl() override; 77 : 78 : /** 79 : * Exposed for testing purposes only (but harmless to call): 80 : */ 81 0 : const std::chrono::milliseconds loopIntervalForTest() const { return loop_interval_; } 82 : 83 : /** 84 : * Test hook to force a step() to catch up with the current watchdog state and simulated time. 85 : * This is inlined so that it does not need to be present in the production binary. 86 : */ 87 0 : void forceCheckForTest() { 88 0 : Thread::LockGuard guard(mutex_); 89 0 : dispatcher_->post([this]() { loop_timer_->enableTimer(std::chrono::milliseconds(0)); }); 90 0 : test_interlock_hook_->waitFromTest(mutex_); 91 0 : } 92 : 93 : // Server::GuardDog 94 : WatchDogSharedPtr createWatchDog(Thread::ThreadId thread_id, const std::string& thread_name, 95 : Event::Dispatcher& dispatcher) override; 96 : void stopWatching(WatchDogSharedPtr wd) override; 97 : 98 : private: 99 : void start(Api::Api& api); 100 : void step(); 101 : void stop(); 102 : // Per the C++ standard it is OK to use these in ctor initializer as long as 103 : // it is after kill and multikill timeout values are initialized. 104 222 : bool killEnabled() const { return kill_timeout_ > std::chrono::milliseconds(0); } 105 222 : bool multikillEnabled() const { return multi_kill_timeout_ > std::chrono::milliseconds(0); } 106 : 107 : using WatchDogAction = envoy::config::bootstrap::v3::Watchdog::WatchdogAction; 108 : // Helper function to invoke all the GuardDogActions registered for an Event. 109 : void invokeGuardDogActions( 110 : WatchDogAction::WatchdogEvent event, 111 : std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs, 112 : MonotonicTime now); 113 : 114 : using WatchDogImplSharedPtr = std::shared_ptr<WatchDogImpl>; 115 : struct WatchedDog { 116 : WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, 117 : const WatchDogImplSharedPtr& watch_dog); 118 : 119 : const WatchDogImplSharedPtr dog_; 120 : MonotonicTime last_checkin_; 121 : absl::optional<MonotonicTime> last_alert_time_; 122 : bool miss_alerted_{}; 123 : bool megamiss_alerted_{}; 124 : Stats::Counter& miss_counter_; 125 : Stats::Counter& megamiss_counter_; 126 : }; 127 : using WatchedDogPtr = std::unique_ptr<WatchedDog>; 128 : 129 : std::unique_ptr<TestInterlockHook> test_interlock_hook_; 130 : Stats::Scope& stats_scope_; 131 : TimeSource& time_source_; 132 : const std::chrono::milliseconds miss_timeout_; 133 : const std::chrono::milliseconds megamiss_timeout_; 134 : const std::chrono::milliseconds kill_timeout_; 135 : const std::chrono::milliseconds multi_kill_timeout_; 136 : const double multi_kill_fraction_; 137 : const std::chrono::milliseconds loop_interval_; 138 : Stats::Counter& watchdog_miss_counter_; 139 : Stats::Counter& watchdog_megamiss_counter_; 140 : std::vector<WatchedDogPtr> watched_dogs_ ABSL_GUARDED_BY(wd_lock_); 141 : Thread::MutexBasicLockable wd_lock_; 142 : Thread::ThreadPtr thread_; 143 : Event::DispatcherPtr dispatcher_; 144 : Event::TimerPtr loop_timer_; 145 : using EventToActionsMap = absl::flat_hash_map<WatchDogAction::WatchdogEvent, 146 : std::vector<Configuration::GuardDogActionPtr>>; 147 : EventToActionsMap events_to_actions_; 148 : Thread::MutexBasicLockable mutex_; 149 : bool run_thread_ ABSL_GUARDED_BY(mutex_); 150 : }; 151 : 152 : } // namespace Server 153 : } // namespace Envoy