LCOV - code coverage report
Current view: top level - source/common/upstream - outlier_detection_impl.cc (source / functions) Hit Total Coverage
Test: coverage.dat Lines: 6 589 1.0 %
Date: 2024-01-05 06:35:25 Functions: 1 45 2.2 %

          Line data    Source code
       1             : #include "source/common/upstream/outlier_detection_impl.h"
       2             : 
       3             : #include <chrono>
       4             : #include <cstdint>
       5             : #include <memory>
       6             : #include <string>
       7             : #include <vector>
       8             : 
       9             : #include "envoy/config/cluster/v3/cluster.pb.h"
      10             : #include "envoy/config/cluster/v3/outlier_detection.pb.h"
      11             : #include "envoy/data/cluster/v3/outlier_detection_event.pb.h"
      12             : #include "envoy/event/dispatcher.h"
      13             : #include "envoy/stats/scope.h"
      14             : 
      15             : #include "source/common/common/assert.h"
      16             : #include "source/common/common/enum_to_int.h"
      17             : #include "source/common/common/fmt.h"
      18             : #include "source/common/common/utility.h"
      19             : #include "source/common/http/codes.h"
      20             : #include "source/common/protobuf/utility.h"
      21             : 
      22             : namespace Envoy {
      23             : namespace Upstream {
      24             : namespace Outlier {
      25             : 
      26             : absl::StatusOr<DetectorSharedPtr> DetectorImplFactory::createForCluster(  // Creates a detector for the cluster, or returns nullptr when none is configured.
      27             :     Cluster& cluster, const envoy::config::cluster::v3::Cluster& cluster_config,
      28             :     Event::Dispatcher& dispatcher, Runtime::Loader& runtime, EventLoggerSharedPtr event_logger,
      29         159 :     Random::RandomGenerator& random) {
      30         159 :   if (cluster_config.has_outlier_detection()) {  // A detector is built only when the cluster config has an outlier_detection section.
      31             : 
      32           0 :     return DetectorImpl::create(cluster, cluster_config.outlier_detection(), dispatcher, runtime,  // The StatusOr from create() is returned as-is, so its error status reaches the caller.
      33           0 :                                 dispatcher.timeSource(), std::move(event_logger), random);  // The dispatcher's time source is handed to the detector.
      34         159 :   } else {
      35         159 :     return nullptr;  // No outlier_detection section: a null detector, not an error status.
      36         159 :   }
      37         159 : }
      38             : 
      39             : DetectorHostMonitorImpl::DetectorHostMonitorImpl(std::shared_ptr<DetectorImpl> detector,
      40             :                                                  HostSharedPtr host)
      41             :     : detector_(detector), host_(host),  // Both members are read through lock() elsewhere in this file.
      42             :       // Add the success rate monitors: one for external origin results, one for local origin.
      43             :       external_origin_sr_monitor_(envoy::data::cluster::v3::SUCCESS_RATE),
      44           0 :       local_origin_sr_monitor_(envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN) {
      45             :   // Choose the member function that putResult() dispatches to. The choice depends on the
      46             :   // config's splitExternalLocalOriginErrors() value and is made here in the constructor.
      47           0 :   put_result_func_ = detector->config().splitExternalLocalOriginErrors()
      48           0 :                          ? &DetectorHostMonitorImpl::putResultWithLocalExternalSplit
      49           0 :                          : &DetectorHostMonitorImpl::putResultNoLocalExternalSplit;
      50           0 : }
      51             : 
      52           0 : void DetectorHostMonitorImpl::eject(MonotonicTime ejection_time) {  // Flags the host as ejected and records when it happened.
      53           0 :   ASSERT(!host_.lock()->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK));  // Ejecting a host that already carries the flag is a caller error.
      54           0 :   host_.lock()->healthFlagSet(Host::HealthFlag::FAILED_OUTLIER_CHECK);  // lock() result is dereferenced without a null check.
      55           0 :   num_ejections_++;
      56           0 :   last_ejection_time_ = ejection_time;
      57           0 : }
      58             : 
      59           0 : void DetectorHostMonitorImpl::uneject(MonotonicTime unejection_time) {  // Only records the unejection timestamp; the health flag is not touched here.
      60           0 :   last_unejection_time_ = (unejection_time);
      61           0 : }
      62             : 
      63           0 : void DetectorHostMonitorImpl::updateCurrentSuccessRateBucket() {  // Forwards the bucket update to both success rate monitors.
      64           0 :   external_origin_sr_monitor_.updateCurrentSuccessRateBucket();
      65           0 :   local_origin_sr_monitor_.updateCurrentSuccessRateBucket();
      66           0 : }
      67             : 
      68           0 : void DetectorHostMonitorImpl::putHttpResponseCode(uint64_t response_code) {  // Accounts one response code against the external origin counters and failure streaks.
      69           0 :   external_origin_sr_monitor_.incTotalReqCounter();  // Every response counts towards the total, before the detector liveness check below.
      70           0 :   if (Http::CodeUtility::is5xx(response_code)) {
      71           0 :     std::shared_ptr<DetectorImpl> detector = detector_.lock();
      72           0 :     if (!detector) {
      73             :       // It's possible for the cluster/detector to go away while we still have a host in use.
      74           0 :       return;
      75           0 :     }
      76           0 :     if (Http::CodeUtility::isGatewayError(response_code)) {
      77           0 :       if (++consecutive_gateway_failure_ ==  // Compared with ==, so the callback fires only when the streak lands exactly on the threshold.
      78           0 :           detector->runtime().snapshot().getInteger(
      79           0 :               ConsecutiveGatewayFailureRuntime, detector->config().consecutiveGatewayFailure())) {
      80           0 :         detector->onConsecutiveGatewayFailure(host_.lock());
      81           0 :       }
      82           0 :     } else {
      83           0 :       consecutive_gateway_failure_ = 0;  // A 5xx that is not a gateway error breaks the gateway failure streak.
      84           0 :     }
      85             : 
      86           0 :     if (++consecutive_5xx_ == detector->runtime().snapshot().getInteger(  // Gateway errors advance this streak too: this runs for every 5xx.
      87           0 :                                   Consecutive5xxRuntime, detector->config().consecutive5xx())) {
      88           0 :       detector->onConsecutive5xx(host_.lock());
      89           0 :     }
      90           0 :   } else {
      91           0 :     external_origin_sr_monitor_.incSuccessReqCounter();  // Anything that is not 5xx is counted as a success.
      92           0 :     consecutive_5xx_ = 0;  // A non-5xx response clears both streaks.
      93           0 :     consecutive_gateway_failure_ = 0;
      94           0 :   }
      95           0 : }
      96             : 
      97           0 : absl::optional<Http::Code> DetectorHostMonitorImpl::resultToHttpCode(Result result) {  // Maps a Result to an HTTP code; nullopt for LocalOriginConnectSuccess.
      98           0 :   Http::Code http_code = Http::Code::InternalServerError;  // Fallback value; each case below either assigns http_code or returns.
      99             : 
     100           0 :   switch (result) {
     101           0 :   case Result::ExtOriginRequestSuccess:
     102           0 :   case Result::LocalOriginConnectSuccessFinal:
     103           0 :     http_code = Http::Code::OK;
     104           0 :     break;
     105           0 :   case Result::LocalOriginTimeout:
     106           0 :     http_code = Http::Code::GatewayTimeout;
     107           0 :     break;
     108           0 :   case Result::LocalOriginConnectFailed:
     109           0 :     http_code = Http::Code::ServiceUnavailable;
     110           0 :     break;
     111           0 :   case Result::ExtOriginRequestFailed:
     112           0 :     http_code = Http::Code::InternalServerError;
     113           0 :     break;
     114             :     // LocalOriginConnectSuccess is used in 2-layer protocols, like HTTP.
     115             :     // First the connection is established and then the higher level protocol runs.
     116             :     // If an error happens in the higher layer protocol, it will be mapped to an
     117             :     // HTTP code indicating an error. In order not to interfere with the result of
     118             :     // the higher layer protocol, this result is not mapped to an HTTP code.
     119           0 :   case Result::LocalOriginConnectSuccess:
     120           0 :     return absl::nullopt;
     121           0 :   }
     122             : 
     123           0 :   return {http_code};
     124           0 : }
     125             : 
     126             : // Called by putResult when external and local origin errors
     127             : // are not treated differently. All errors are mapped to HTTP codes.
     128             : // The behavior depends on whether the *code* parameter is set:
     129             : // - if *code* is not set, resultToHttpCode is used to derive an HTTP code from *result*.
     130             : // - if *code* is set, it is taken as the HTTP code and reported as such to the outlier detector.
     131             : void DetectorHostMonitorImpl::putResultNoLocalExternalSplit(Result result,
     132           0 :                                                             absl::optional<uint64_t> code) {
     133           0 :   if (code) {
     134           0 :     putHttpResponseCode(code.value());  // An explicit code wins; *result* is not consulted.
     135           0 :   } else {
     136           0 :     absl::optional<Http::Code> http_code = resultToHttpCode(result);
     137           0 :     if (http_code) {  // nullopt means this result has no HTTP mapping and nothing is reported.
     138           0 :       putHttpResponseCode(enumToInt(http_code.value()));
     139           0 :     }
     140           0 :   }
     141           0 : }
     142             : 
     143             : // Called by putResult when external and local origin errors
     144             : // are treated separately. Local origin errors have separate counters and
     145             : // a separate success rate monitor.
     146             : void DetectorHostMonitorImpl::putResultWithLocalExternalSplit(Result result,
     147           0 :                                                               absl::optional<uint64_t>) {  // The optional code is unnamed and therefore ignored in split mode.
     148           0 :   switch (result) {
     149             :   // These results report success at the connection level. The server may still respond with
     150             :   // an error, but the connection to the server was OK.
     151           0 :   case Result::LocalOriginConnectSuccess:
     152           0 :   case Result::LocalOriginConnectSuccessFinal:
     153           0 :     return localOriginNoFailure();
     154             :   // Connectivity related errors.
     155           0 :   case Result::LocalOriginTimeout:
     156           0 :   case Result::LocalOriginConnectFailed:
     157           0 :     return localOriginFailure();
     158             :   // ExtOriginRequestFailed is used when the connection to the server was successful, but the
     159             :   // transaction at server level failed. Since it is similar to HTTP 5xx, map it to the 5xx handler.
     160           0 :   case Result::ExtOriginRequestFailed:
     161             :     // Map it to Http::Code::ServiceUnavailable and call the HTTP handler.
     162           0 :     return putHttpResponseCode(enumToInt(Http::Code::ServiceUnavailable));
     163             :   // ExtOriginRequestSuccess is used to report that a transaction with a non-HTTP server was
     164             :   // completed successfully. This means that both the connection and the server level transaction
     165             :   // were successful. Map it to HTTP code 200 OK and indicate that there were no errors at the
     166             :   // connection level.
     167           0 :   case Result::ExtOriginRequestSuccess:
     168           0 :     putHttpResponseCode(enumToInt(Http::Code::OK));
     169           0 :     localOriginNoFailure();
     170           0 :     break;
     171           0 :   }
     172           0 : }
     173             : 
     174             : // Method is used by other components to report success or error.
     175             : // It calls putResultWithLocalExternalSplit or putResultNoLocalExternalSplit via
     176             : // std::function. The selection happens in the constructor based on the
     177             : // split_external_local_origin_errors config parameter.
     178           0 : void DetectorHostMonitorImpl::putResult(Result result, absl::optional<uint64_t> code) {
     179           0 :   put_result_func_(this, result, code);  // Invokes the handler chosen in the constructor, with this object as the receiver.
     180           0 : }
     181             : 
     182           0 : void DetectorHostMonitorImpl::localOriginFailure() {  // Records one local origin failure and notifies the detector when the streak hits the threshold.
     183           0 :   std::shared_ptr<DetectorImpl> detector = detector_.lock();
     184           0 :   if (!detector) {
     185             :     // It's possible for the cluster/detector to go away while we still have a host in use.
     186           0 :     return;
     187           0 :   }
     188           0 :   local_origin_sr_monitor_.incTotalReqCounter();  // Failure: only the total is incremented, not the success counter.
     189           0 :   if (++consecutive_local_origin_failure_ ==  // Compared with ==, so the callback fires only when the streak lands exactly on the threshold.
     190           0 :       detector->runtime().snapshot().getInteger(
     191           0 :           ConsecutiveLocalOriginFailureRuntime,
     192           0 :           detector->config().consecutiveLocalOriginFailure())) {
     193           0 :     detector->onConsecutiveLocalOriginFailure(host_.lock());
     194           0 :   }
     195           0 : }
     196             : 
     197           0 : void DetectorHostMonitorImpl::localOriginNoFailure() {  // Records one local origin success and ends the local origin failure streak.
     198           0 :   std::shared_ptr<DetectorImpl> detector = detector_.lock();  // Used only as a liveness check; the detector is not called in this function.
     199           0 :   if (!detector) {
     200             :     // It's possible for the cluster/detector to go away while we still have a host in use.
     201           0 :     return;
     202           0 :   }
     203             : 
     204           0 :   local_origin_sr_monitor_.incTotalReqCounter();  // A success counts towards both the total and the success counter.
     205           0 :   local_origin_sr_monitor_.incSuccessReqCounter();
     206             : 
     207           0 :   resetConsecutiveLocalOriginFailure();
     208           0 : }
     209             : 
     210             : DetectorConfig::DetectorConfig(const envoy::config::cluster::v3::OutlierDetection& config)  // Copies every setting out of the proto; judging by the macro names, unset fields fall back to the DEFAULT_* constants (macro definitions are not visible here).
     211             :     : interval_ms_(
     212             :           static_cast<uint64_t>(PROTOBUF_GET_MS_OR_DEFAULT(config, interval, DEFAULT_INTERVAL_MS))),
     213             :       base_ejection_time_ms_(static_cast<uint64_t>(
     214             :           PROTOBUF_GET_MS_OR_DEFAULT(config, base_ejection_time, DEFAULT_BASE_EJECTION_TIME_MS))),
     215             :       consecutive_5xx_(static_cast<uint64_t>(
     216             :           PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, consecutive_5xx, DEFAULT_CONSECUTIVE_5XX))),
     217             :       consecutive_gateway_failure_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     218             :           config, consecutive_gateway_failure, DEFAULT_CONSECUTIVE_GATEWAY_FAILURE))),
     219             :       max_ejection_percent_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     220             :           config, max_ejection_percent, DEFAULT_MAX_EJECTION_PERCENT))),
     221             :       success_rate_minimum_hosts_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     222             :           config, success_rate_minimum_hosts, DEFAULT_SUCCESS_RATE_MINIMUM_HOSTS))),
     223             :       success_rate_request_volume_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     224             :           config, success_rate_request_volume, DEFAULT_SUCCESS_RATE_REQUEST_VOLUME))),
     225             :       success_rate_stdev_factor_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     226             :           config, success_rate_stdev_factor, DEFAULT_SUCCESS_RATE_STDEV_FACTOR))),
     227             :       failure_percentage_threshold_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     228             :           config, failure_percentage_threshold, DEFAULT_FAILURE_PERCENTAGE_THRESHOLD))),
     229             :       failure_percentage_minimum_hosts_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     230             :           config, failure_percentage_minimum_hosts, DEFAULT_FAILURE_PERCENTAGE_MINIMUM_HOSTS))),
     231             :       failure_percentage_request_volume_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     232             :           config, failure_percentage_request_volume, DEFAULT_FAILURE_PERCENTAGE_REQUEST_VOLUME))),
     233             :       enforcing_consecutive_5xx_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     234             :           config, enforcing_consecutive_5xx, DEFAULT_ENFORCING_CONSECUTIVE_5XX))),
     235             :       enforcing_consecutive_gateway_failure_(static_cast<uint64_t>(
     236             :           PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_gateway_failure,
     237             :                                           DEFAULT_ENFORCING_CONSECUTIVE_GATEWAY_FAILURE))),
     238             :       enforcing_success_rate_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     239             :           config, enforcing_success_rate, DEFAULT_ENFORCING_SUCCESS_RATE))),
     240             :       enforcing_failure_percentage_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     241             :           config, enforcing_failure_percentage, DEFAULT_ENFORCING_FAILURE_PERCENTAGE))),
     242             :       enforcing_failure_percentage_local_origin_(static_cast<uint64_t>(
     243             :           PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_failure_percentage_local_origin,
     244             :                                           DEFAULT_ENFORCING_FAILURE_PERCENTAGE_LOCAL_ORIGIN))),
     245             :       split_external_local_origin_errors_(config.split_external_local_origin_errors()),  // Read directly from the proto accessor; no default macro is involved.
     246             :       consecutive_local_origin_failure_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     247             :           config, consecutive_local_origin_failure, DEFAULT_CONSECUTIVE_LOCAL_ORIGIN_FAILURE))),
     248             :       enforcing_consecutive_local_origin_failure_(static_cast<uint64_t>(
     249             :           PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_local_origin_failure,
     250             :                                           DEFAULT_ENFORCING_CONSECUTIVE_LOCAL_ORIGIN_FAILURE))),
     251             :       enforcing_local_origin_success_rate_(static_cast<uint64_t>(
     252             :           PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_local_origin_success_rate,
     253             :                                           DEFAULT_ENFORCING_LOCAL_ORIGIN_SUCCESS_RATE))),
     254             :       // If max_ejection_time was not specified in the config, apply the default or
     255             :       // base_ejection_time, whichever is larger.
     256             :       max_ejection_time_ms_(static_cast<uint64_t>(PROTOBUF_GET_MS_OR_DEFAULT(
     257             :           config, max_ejection_time,
     258             :           std::max(DEFAULT_MAX_EJECTION_TIME_MS, base_ejection_time_ms_)))),  // NOTE(review): reads a member initialized above; assumes base_ejection_time_ms_ is declared before max_ejection_time_ms_ - confirm in the header.
     259             :       max_ejection_time_jitter_ms_(static_cast<uint64_t>(PROTOBUF_GET_MS_OR_DEFAULT(
     260             :           config, max_ejection_time_jitter, DEFAULT_MAX_EJECTION_TIME_JITTER_MS))),
     261             :       successful_active_health_check_uneject_host_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
     262           0 :           config, successful_active_health_check_uneject_host, true)) {}  // The only setting whose default is the literal true instead of a DEFAULT_* constant.
     263             : 
     264             : DetectorImpl::DetectorImpl(const Cluster& cluster,
     265             :                            const envoy::config::cluster::v3::OutlierDetection& config,
     266             :                            Event::Dispatcher& dispatcher, Runtime::Loader& runtime,
     267             :                            TimeSource& time_source, EventLoggerSharedPtr event_logger,
     268             :                            Random::RandomGenerator& random)
     269             :     : config_(config), dispatcher_(dispatcher), runtime_(runtime), time_source_(time_source),
     270             :       stats_(generateStats(cluster.info()->statsScope())),  // Stats are generated against the cluster's stats scope.
     271           0 :       interval_timer_(dispatcher.createTimer([this]() -> void { onIntervalTimer(); })),  // The timer is only created here; it is armed later by initialize().
     272           0 :       event_logger_(event_logger), random_generator_(random) {
     273             :   // Set the initial success rate numbers for each type of success rate detector.
     274           0 :   external_origin_sr_num_ = {-1, -1};  // NOTE(review): -1 looks like a "not computed yet" placeholder; the field meanings are defined in the header.
     275           0 :   local_origin_sr_num_ = {-1, -1};
     276           0 : }
     277             : 
     278           0 : DetectorImpl::~DetectorImpl() {  // Decrements the active ejection helper once per tracked host that is still flagged as ejected.
     279           0 :   for (const auto& host : host_monitors_) {
     280           0 :     if (host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {  // host.first is the host; the mapped value is its monitor.
     281           0 :       ASSERT(ejections_active_helper_.value() > 0);
     282           0 :       ejections_active_helper_.dec();
     283           0 :     }
     284           0 :   }
     285           0 : }
     286             : 
     287             : absl::StatusOr<std::shared_ptr<DetectorImpl>>
     288             : DetectorImpl::create(Cluster& cluster, const envoy::config::cluster::v3::OutlierDetection& config,  // Builds, validates and initializes a detector; returns InvalidArgumentError on a bad ejection time config.
     289             :                      Event::Dispatcher& dispatcher, Runtime::Loader& runtime,
     290             :                      TimeSource& time_source, EventLoggerSharedPtr event_logger,
     291           0 :                      Random::RandomGenerator& random) {
     292           0 :   std::shared_ptr<DetectorImpl> detector(
     293           0 :       new DetectorImpl(cluster, config, dispatcher, runtime, time_source, event_logger, random));  // NOTE(review): new instead of make_shared, presumably because the constructor is not public - confirm in the header.
     294             : 
     295           0 :   if (detector->config().maxEjectionTimeMs() < detector->config().baseEjectionTimeMs()) {  // Validated on the parsed config, so defaults are already applied.
     296           0 :     return absl::InvalidArgumentError(
     297           0 :         "outlier detector's max_ejection_time cannot be smaller than base_ejection_time");
     298           0 :   }
     299           0 :   detector->initialize(cluster);  // Runs after the shared_ptr exists: initialize() reaches addHostMonitor(), which calls shared_from_this().
     300             : 
     301           0 :   return detector;
     302           0 : }
     303             : 
     304           0 : void DetectorImpl::initialize(Cluster& cluster) {  // Attaches monitors to existing hosts, registers the callbacks and arms the interval timer.
     305           0 :   for (auto& host_set : cluster.prioritySet().hostSetsPerPriority()) {  // Hosts already present, across every priority.
     306           0 :     for (const HostSharedPtr& host : host_set->hosts()) {
     307           0 :       addHostMonitor(host);
     308           0 :     }
     309           0 :   }
     310             : 
     311           0 :   if (config_.successfulActiveHealthCheckUnejectHost() && cluster.healthChecker() != nullptr) {  // Needs both the config option and an active health checker on the cluster.
     312           0 :     cluster.healthChecker()->addHostCheckCompleteCb([this](HostSharedPtr host, HealthTransition) {
     313             :       // If the host is ejected by outlier detection and the active health check succeeds,
     314             :       // we should treat this host as healthy.
     315           0 :       if (!host->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC) &&
     316           0 :           host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
     317           0 :         host->healthFlagClear(Host::HealthFlag::FAILED_OUTLIER_CHECK);  // unejectHost() on the next line clears this flag as well.
     318           0 :         unejectHost(host);
     319           0 :       }
     320           0 :     });
     321           0 :   }
     322           0 :   member_update_cb_ = cluster.prioritySet().addMemberUpdateCb(  // The returned handle is kept in a member.
     323           0 :       [this](const HostVector& hosts_added, const HostVector& hosts_removed) -> void {
     324           0 :         for (const HostSharedPtr& host : hosts_added) {
     325           0 :           addHostMonitor(host);
     326           0 :         }
     327             : 
     328           0 :         for (const HostSharedPtr& host : hosts_removed) {
     329           0 :           ASSERT(host_monitors_.count(host) == 1);  // Every removed host is expected to have been tracked.
     330           0 :           if (host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {  // A removed host that is still ejected no longer counts as an active ejection.
     331           0 :             ASSERT(ejections_active_helper_.value() > 0);
     332           0 :             ejections_active_helper_.dec();
     333           0 :           }
     334             : 
     335           0 :           host_monitors_.erase(host);
     336           0 :         }
     337           0 :       });
     338             : 
     339           0 :   armIntervalTimer();
     340           0 : }
     341             : 
     342           0 : void DetectorImpl::addHostMonitor(HostSharedPtr host) {  // Creates a monitor for the host, tracks it in host_monitors_ and installs it on the host.
     343           0 :   ASSERT(host_monitors_.count(host) == 0);  // A host must not be registered twice.
     344           0 :   DetectorHostMonitorImpl* monitor = new DetectorHostMonitorImpl(shared_from_this(), host);
     345           0 :   host_monitors_[host] = monitor;  // The map keeps the raw pointer.
     346           0 :   host->setOutlierDetector(DetectorHostMonitorPtr{monitor});  // NOTE(review): ownership presumably passes to the host through DetectorHostMonitorPtr - confirm the alias.
     347           0 : }
     348             : 
     349           0 : void DetectorImpl::armIntervalTimer() {  // Enables the interval timer with a duration in milliseconds.
     350           0 :   interval_timer_->enableTimer(std::chrono::milliseconds(
     351           0 :       runtime_.snapshot().getInteger(IntervalMsRuntime, config_.intervalMs())));  // Looked up under the IntervalMsRuntime key, with the configured interval passed as the second argument.
     352           0 : }
     353             : 
     354             : void DetectorImpl::checkHostForUneject(HostSharedPtr host, DetectorHostMonitorImpl* monitor,  // Unejects the host if it has been ejected for long enough.
     355           0 :                                        MonotonicTime now) {
     356           0 :   if (!host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {  // Nothing to do for a host that is not currently ejected.
     357           0 :     return;
     358           0 :   }
     359             : 
     360           0 :   const std::chrono::milliseconds base_eject_time = std::chrono::milliseconds(
     361           0 :       runtime_.snapshot().getInteger(BaseEjectionTimeMsRuntime, config_.baseEjectionTimeMs()));
     362           0 :   const std::chrono::milliseconds max_eject_time = std::chrono::milliseconds(
     363           0 :       runtime_.snapshot().getInteger(MaxEjectionTimeMsRuntime, config_.maxEjectionTimeMs()));
     364           0 :   const std::chrono::milliseconds jitter = monitor->getJitter();
     365           0 :   ASSERT(monitor->numEjections() > 0);  // An ejected host must have been ejected at least once.
     366           0 :   if ((min(base_eject_time * monitor->ejectTimeBackoff(), max_eject_time) + jitter) <=  // Required time is base * backoff capped at max, plus jitter. NOTE(review): min is unqualified, presumably std::min found through ADL - confirm.
     367           0 :       (now - monitor->lastEjectionTime().value())) {  // Elapsed time since the last ejection; value() assumes the ejection time is set.
     368           0 :     unejectHost(host);
     369           0 :   }
     370           0 : }
     371             : 
     372           0 : void DetectorImpl::unejectHost(HostSharedPtr host) {  // Returns an ejected host to service and resets its failure streaks.
     373           0 :   ejections_active_helper_.dec();  // Decremented unconditionally; callers are expected to pass a host that counts as ejected.
     374           0 :   host->healthFlagClear(Host::HealthFlag::FAILED_OUTLIER_CHECK);
     375             :   // Reset the consecutive failure counters to avoid re-ejection on very few new errors due
     376             :   // to the non-triggering counter being close to its trigger value.
     377           0 :   host_monitors_[host]->resetConsecutive5xx();  // NOTE(review): operator[] is used as a lookup here, which assumes the host is already tracked - confirm.
     378           0 :   host_monitors_[host]->resetConsecutiveGatewayFailure();
     379           0 :   host_monitors_[host]->resetConsecutiveLocalOriginFailure();
     380           0 :   host_monitors_[host]->uneject(time_source_.monotonicTime());
     381           0 :   runCallbacks(host);
     382             : 
     383           0 :   if (event_logger_) {  // The event logger is optional.
     384           0 :     event_logger_->logUneject(host);
     385           0 :   }
     386           0 : }
     387             : 
     388           0 : bool DetectorImpl::enforceEjection(envoy::data::cluster::v3::OutlierEjectionType type) {  // Returns the result of the runtime featureEnabled() check for the given ejection type.
     389           0 :   switch (type) {
     390           0 :     PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;  // NOTE(review): presumably expands to case labels for the proto sentinel values; the macro definition is not visible here.
     391           0 :   case envoy::data::cluster::v3::CONSECUTIVE_5XX:  // Each case pairs a runtime key with the matching configured enforcement value.
     392           0 :     return runtime_.snapshot().featureEnabled(EnforcingConsecutive5xxRuntime,
     393           0 :                                               config_.enforcingConsecutive5xx());
     394           0 :   case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
     395           0 :     return runtime_.snapshot().featureEnabled(EnforcingConsecutiveGatewayFailureRuntime,
     396           0 :                                               config_.enforcingConsecutiveGatewayFailure());
     397           0 :   case envoy::data::cluster::v3::SUCCESS_RATE:
     398           0 :     return runtime_.snapshot().featureEnabled(EnforcingSuccessRateRuntime,
     399           0 :                                               config_.enforcingSuccessRate());
     400           0 :   case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
     401           0 :     return runtime_.snapshot().featureEnabled(EnforcingConsecutiveLocalOriginFailureRuntime,
     402           0 :                                               config_.enforcingConsecutiveLocalOriginFailure());
     403           0 :   case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
     404           0 :     return runtime_.snapshot().featureEnabled(EnforcingLocalOriginSuccessRateRuntime,
     405           0 :                                               config_.enforcingLocalOriginSuccessRate());
     406           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
     407           0 :     return runtime_.snapshot().featureEnabled(EnforcingFailurePercentageRuntime,
     408           0 :                                               config_.enforcingFailurePercentage());
     409           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
     410           0 :     return runtime_.snapshot().featureEnabled(EnforcingFailurePercentageLocalOriginRuntime,
     411           0 :                                               config_.enforcingFailurePercentageLocalOrigin());
     412           0 :   }
     413             : 
     414           0 :   PANIC_DUE_TO_CORRUPT_ENUM;  // Reached only if type matched none of the cases above.
     415           0 : }
     416             : 
     417           0 : void DetectorImpl::updateEnforcedEjectionStats(envoy::data::cluster::v3::OutlierEjectionType type) {  // Bumps the enforced ejection counters for the given type.
     418           0 :   stats_.ejections_enforced_total_.inc();  // The aggregate counter is bumped for every type, before the per-type counter.
     419           0 :   switch (type) {
     420           0 :     PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
     421           0 :   case envoy::data::cluster::v3::SUCCESS_RATE:
     422           0 :     stats_.ejections_enforced_success_rate_.inc();
     423           0 :     break;
     424           0 :   case envoy::data::cluster::v3::CONSECUTIVE_5XX:
     425           0 :     stats_.ejections_enforced_consecutive_5xx_.inc();
     426           0 :     break;
     427           0 :   case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
     428           0 :     stats_.ejections_enforced_consecutive_gateway_failure_.inc();
     429           0 :     break;
     430           0 :   case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
     431           0 :     stats_.ejections_enforced_consecutive_local_origin_failure_.inc();
     432           0 :     break;
     433           0 :   case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
     434           0 :     stats_.ejections_enforced_local_origin_success_rate_.inc();
     435           0 :     break;
     436           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
     437           0 :     stats_.ejections_enforced_failure_percentage_.inc();
     438           0 :     break;
     439           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
     440           0 :     stats_.ejections_enforced_local_origin_failure_percentage_.inc();
     441           0 :     break;
     442           0 :   }
     443           0 : }
     444             : 
     445           0 : void DetectorImpl::updateDetectedEjectionStats(envoy::data::cluster::v3::OutlierEjectionType type) {  // Bumps the per-type detected ejection counter; no aggregate counter is touched here.
     446           0 :   switch (type) {
     447           0 :     PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
     448           0 :   case envoy::data::cluster::v3::SUCCESS_RATE:
     449           0 :     stats_.ejections_detected_success_rate_.inc();
     450           0 :     break;
     451           0 :   case envoy::data::cluster::v3::CONSECUTIVE_5XX:
     452           0 :     stats_.ejections_detected_consecutive_5xx_.inc();
     453           0 :     break;
     454           0 :   case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
     455           0 :     stats_.ejections_detected_consecutive_gateway_failure_.inc();
     456           0 :     break;
     457           0 :   case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
     458           0 :     stats_.ejections_detected_consecutive_local_origin_failure_.inc();
     459           0 :     break;
     460           0 :   case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
     461           0 :     stats_.ejections_detected_local_origin_success_rate_.inc();
     462           0 :     break;
     463           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
     464           0 :     stats_.ejections_detected_failure_percentage_.inc();
     465           0 :     break;
     466           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
     467           0 :     stats_.ejections_detected_local_origin_failure_percentage_.inc();
     468           0 :     break;
     469           0 :   }
     470           0 : }
     471             : 
     472             : void DetectorImpl::ejectHost(HostSharedPtr host,
     473           0 :                              envoy::data::cluster::v3::OutlierEjectionType type) {
     474           0 :   uint64_t max_ejection_percent = std::min<uint64_t>(
     475           0 :       100, runtime_.snapshot().getInteger(MaxEjectionPercentRuntime, config_.maxEjectionPercent()));
     476           0 :   double ejected_percent = 100.0 * (ejections_active_helper_.value() + 1) / host_monitors_.size();
     477             :   // Note this is not currently checked per-priority level, so it is possible
     478             :   // for outlier detection to eject all hosts at any given priority level.
     479           0 :   bool should_eject = (ejected_percent <= max_ejection_percent);
     480           0 :   if (!Runtime::runtimeFeatureEnabled("envoy.reloadable_features.check_mep_on_first_eject")) {
     481           0 :     should_eject = (ejections_active_helper_.value() == 0) || should_eject;
     482           0 :   }
     483           0 :   if (should_eject) {
     484           0 :     if (type == envoy::data::cluster::v3::CONSECUTIVE_5XX ||
     485           0 :         type == envoy::data::cluster::v3::SUCCESS_RATE) {
     486             :       // Deprecated counter, preserving old behaviour until it's removed.
     487           0 :       stats_.ejections_total_.inc();
     488           0 :     }
     489           0 :     if (enforceEjection(type)) {
     490           0 :       ejections_active_helper_.inc();
     491           0 :       updateEnforcedEjectionStats(type);
     492           0 :       host_monitors_[host]->eject(time_source_.monotonicTime());
     493           0 :       const std::chrono::milliseconds base_eject_time = std::chrono::milliseconds(
     494           0 :           runtime_.snapshot().getInteger(BaseEjectionTimeMsRuntime, config_.baseEjectionTimeMs()));
     495           0 :       const std::chrono::milliseconds max_eject_time = std::chrono::milliseconds(
     496           0 :           runtime_.snapshot().getInteger(MaxEjectionTimeMsRuntime, config_.maxEjectionTimeMs()));
     497             : 
     498             :       // Generate random jitter so that not all hosts uneject at the same time,
     499             :       // which could possibly generate a connection storm.
     500             : 
     501             :       // Retrieve max_eject_time_jitter configuration and then calculate the jitter.
     502           0 :       const uint64_t max_eject_time_jitter = runtime_.snapshot().getInteger(
     503           0 :           MaxEjectionTimeJitterMsRuntime, config_.maxEjectionTimeJitterMs());
     504             : 
     505           0 :       const std::chrono::milliseconds jitter =
     506           0 :           std::chrono::milliseconds(random_generator_() % (max_eject_time_jitter + 1));
     507             : 
     508             :       // Save the jitter on the current host_monitor.
     509           0 :       host_monitors_[host]->setJitter(jitter);
     510             : 
     511           0 :       if ((host_monitors_[host]->ejectTimeBackoff() * base_eject_time) <
     512           0 :           (max_eject_time + base_eject_time)) {
     513           0 :         host_monitors_[host]->ejectTimeBackoff()++;
     514           0 :       }
     515             : 
     516           0 :       runCallbacks(host);
     517           0 :       if (event_logger_) {
     518           0 :         event_logger_->logEject(host, *this, type, true);
     519           0 :       }
     520           0 :     } else {
     521           0 :       if (event_logger_) {
     522           0 :         event_logger_->logEject(host, *this, type, false);
     523           0 :       }
     524           0 :     }
     525           0 :   } else {
     526           0 :     stats_.ejections_overflow_.inc();
     527           0 :   }
     528           0 : }
     529             : 
     530           0 : DetectionStats DetectorImpl::generateStats(Stats::Scope& scope) {
     531           0 :   std::string prefix("outlier_detection.");
     532           0 :   return {ALL_OUTLIER_DETECTION_STATS(POOL_COUNTER_PREFIX(scope, prefix),
     533           0 :                                       POOL_GAUGE_PREFIX(scope, prefix))};
     534           0 : }
     535             : 
     536             : void DetectorImpl::notifyMainThreadConsecutiveError(
     537           0 :     HostSharedPtr host, envoy::data::cluster::v3::OutlierEjectionType type) {
     538             :   // This event will come from all threads, so we synchronize with a post to the main thread.
     539             :   // NOTE: Unfortunately consecutive errors are complicated from a threading perspective because
     540             :   //       we catch consecutive errors on worker threads and then post back to the main thread.
     541             :   //       Clusters can get removed, and this means there is a race condition with this
     542             :   //       reverse post. The way we handle this is as follows:
     543             :   //       1) The only strong pointer to the detector is owned by the cluster.
     544             :   //       2) We post a weak pointer to the main thread.
     545             :   //       3) If when running on the main thread the weak pointer can be converted to a strong
     546             :   //          pointer, the detector/cluster must still exist so we can safely fire callbacks.
     547             :   //          Otherwise we do nothing since the detector/cluster is already gone.
     548           0 :   std::weak_ptr<DetectorImpl> weak_this = shared_from_this();
     549           0 :   dispatcher_.post([weak_this, host, type]() -> void {
     550           0 :     std::shared_ptr<DetectorImpl> shared_this = weak_this.lock();
     551           0 :     if (shared_this) {
     552           0 :       shared_this->onConsecutiveErrorWorker(host, type);
     553           0 :     }
     554           0 :   });
     555           0 : }
     556             : 
     557           0 : void DetectorImpl::onConsecutive5xx(HostSharedPtr host) {
     558           0 :   notifyMainThreadConsecutiveError(host, envoy::data::cluster::v3::CONSECUTIVE_5XX);
     559           0 : }
     560             : 
     561           0 : void DetectorImpl::onConsecutiveGatewayFailure(HostSharedPtr host) {
     562           0 :   notifyMainThreadConsecutiveError(host, envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE);
     563           0 : }
     564             : 
     565           0 : void DetectorImpl::onConsecutiveLocalOriginFailure(HostSharedPtr host) {
     566           0 :   notifyMainThreadConsecutiveError(host,
     567           0 :                                    envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE);
     568           0 : }
     569             : 
     570             : void DetectorImpl::onConsecutiveErrorWorker(HostSharedPtr host,
     571           0 :                                             envoy::data::cluster::v3::OutlierEjectionType type) {
     572             :   // Ejections come in cross thread. There is a chance that the host has already been removed from
     573             :   // the set. If so, just ignore it.
     574           0 :   if (host_monitors_.count(host) == 0) {
     575           0 :     return;
     576           0 :   }
     577           0 :   if (host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
     578           0 :     return;
     579           0 :   }
     580             : 
     581             :   // We also reset the appropriate counter here to allow the monitor to detect a bout of consecutive
     582             :   // error responses even if the monitor is not charged with an interleaved non-error code.
     583           0 :   updateDetectedEjectionStats(type);
     584           0 :   ejectHost(host, type);
     585             : 
     586             :   // reset counters
     587           0 :   switch (type) {
     588           0 :     PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
     589           0 :   case envoy::data::cluster::v3::SUCCESS_RATE:
     590           0 :     FALLTHRU;
     591           0 :   case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
     592           0 :     FALLTHRU;
     593           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
     594           0 :     FALLTHRU;
     595           0 :   case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
     596           0 :     IS_ENVOY_BUG("unexpected non-consecutive error");
     597           0 :     return;
     598           0 :   case envoy::data::cluster::v3::CONSECUTIVE_5XX:
     599           0 :     stats_.ejections_consecutive_5xx_.inc(); // Deprecated
     600           0 :     host_monitors_[host]->resetConsecutive5xx();
     601           0 :     break;
     602           0 :   case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
     603           0 :     host_monitors_[host]->resetConsecutiveGatewayFailure();
     604           0 :     break;
     605           0 :   case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
     606           0 :     host_monitors_[host]->resetConsecutiveLocalOriginFailure();
     607           0 :     break;
     608           0 :   }
     609           0 : }
     610             : 
     611             : DetectorImpl::EjectionPair DetectorImpl::successRateEjectionThreshold(
     612             :     double success_rate_sum, const std::vector<HostSuccessRatePair>& valid_success_rate_hosts,
     613           0 :     double success_rate_stdev_factor) {
     614             :   // This function is using mean and standard deviation as statistical measures for outlier
     615             :   // detection. First the mean is calculated by dividing the sum of success rate data over the
     616             :   // number of data points. Then variance is calculated by taking the mean of the
     617             :   // squared difference of data points to the mean of the data. Then standard deviation is
     618             :   // calculated by taking the square root of the variance. Then the outlier threshold is
     619             :   // calculated as the difference between the mean and the product of the standard
     620             :   // deviation and a constant factor.
     621             :   //
     622             :   // For example with a data set that looks like success_rate_data = {50, 100, 100, 100, 100} the
     623             :   // math would work as follows:
     624             :   // success_rate_sum = 450
     625             :   // mean = 90
     626             :   // variance = 400
     627             :   // stdev = 20
     628             :   // threshold returned = 52
     629           0 :   double mean = success_rate_sum / valid_success_rate_hosts.size();
     630           0 :   double variance = 0;
     631           0 :   std::for_each(valid_success_rate_hosts.begin(), valid_success_rate_hosts.end(),
     632           0 :                 [&variance, mean](HostSuccessRatePair v) {
     633           0 :                   variance += std::pow(v.success_rate_ - mean, 2);
     634           0 :                 });
     635           0 :   variance /= valid_success_rate_hosts.size();
     636           0 :   double stdev = std::sqrt(variance);
     637             : 
     638           0 :   return {mean, (mean - (success_rate_stdev_factor * stdev))};
     639           0 : }
     640             : 
     641             : void DetectorImpl::processSuccessRateEjections(
     642           0 :     DetectorHostMonitor::SuccessRateMonitorType monitor_type) {
     643           0 :   uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger(
     644           0 :       SuccessRateMinimumHostsRuntime, config_.successRateMinimumHosts());
     645           0 :   uint64_t success_rate_request_volume = runtime_.snapshot().getInteger(
     646           0 :       SuccessRateRequestVolumeRuntime, config_.successRateRequestVolume());
     647           0 :   uint64_t failure_percentage_minimum_hosts = runtime_.snapshot().getInteger(
     648           0 :       FailurePercentageMinimumHostsRuntime, config_.failurePercentageMinimumHosts());
     649           0 :   uint64_t failure_percentage_request_volume = runtime_.snapshot().getInteger(
     650           0 :       FailurePercentageRequestVolumeRuntime, config_.failurePercentageRequestVolume());
     651             : 
     652           0 :   std::vector<HostSuccessRatePair> valid_success_rate_hosts;
     653           0 :   std::vector<HostSuccessRatePair> valid_failure_percentage_hosts;
     654           0 :   double success_rate_sum = 0;
     655             : 
     656             :   // Reset the Detector's success rate mean and stdev.
     657           0 :   getSRNums(monitor_type) = {-1, -1};
     658             : 
     659             :   // Exit early if there are not enough hosts.
     660           0 :   if (host_monitors_.size() < success_rate_minimum_hosts &&
     661           0 :       host_monitors_.size() < failure_percentage_minimum_hosts) {
     662           0 :     return;
     663           0 :   }
     664             : 
     665             :   // reserve upper bound of vector size to avoid reallocation.
     666           0 :   valid_success_rate_hosts.reserve(host_monitors_.size());
     667           0 :   valid_failure_percentage_hosts.reserve(host_monitors_.size());
     668             : 
     669           0 :   for (const auto& host : host_monitors_) {
     670             :     // Don't do work if the host is already ejected.
     671           0 :     if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
     672           0 :       absl::optional<std::pair<double, uint64_t>> host_success_rate_and_volume =
     673           0 :           host.second->getSRMonitor(monitor_type)
     674           0 :               .successRateAccumulator()
     675           0 :               .getSuccessRateAndVolume();
     676             : 
     677           0 :       if (!host_success_rate_and_volume) {
     678           0 :         continue;
     679           0 :       }
     680           0 :       double success_rate = host_success_rate_and_volume.value().first;
     681           0 :       double request_volume = host_success_rate_and_volume.value().second;
     682             : 
     683           0 :       if (request_volume >=
     684           0 :           std::min(success_rate_request_volume, failure_percentage_request_volume)) {
     685           0 :         host.second->successRate(monitor_type, success_rate);
     686           0 :       }
     687             : 
     688           0 :       if (request_volume >= success_rate_request_volume) {
     689           0 :         valid_success_rate_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate));
     690           0 :         success_rate_sum += success_rate;
     691           0 :       }
     692           0 :       if (request_volume >= failure_percentage_request_volume) {
     693           0 :         valid_failure_percentage_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate));
     694           0 :       }
     695           0 :     }
     696           0 :   }
     697             : 
     698           0 :   if (!valid_success_rate_hosts.empty() &&
     699           0 :       valid_success_rate_hosts.size() >= success_rate_minimum_hosts) {
     700           0 :     const double success_rate_stdev_factor =
     701           0 :         runtime_.snapshot().getInteger(SuccessRateStdevFactorRuntime,
     702           0 :                                        config_.successRateStdevFactor()) /
     703           0 :         1000.0;
     704           0 :     getSRNums(monitor_type) = successRateEjectionThreshold(
     705           0 :         success_rate_sum, valid_success_rate_hosts, success_rate_stdev_factor);
     706           0 :     const double success_rate_ejection_threshold = getSRNums(monitor_type).ejection_threshold_;
     707           0 :     for (const auto& host_success_rate_pair : valid_success_rate_hosts) {
     708           0 :       if (host_success_rate_pair.success_rate_ < success_rate_ejection_threshold) {
     709           0 :         stats_.ejections_success_rate_.inc(); // Deprecated.
     710           0 :         const envoy::data::cluster::v3::OutlierEjectionType type =
     711           0 :             host_monitors_[host_success_rate_pair.host_]
     712           0 :                 ->getSRMonitor(monitor_type)
     713           0 :                 .getEjectionType();
     714           0 :         updateDetectedEjectionStats(type);
     715           0 :         ejectHost(host_success_rate_pair.host_, type);
     716           0 :       }
     717           0 :     }
     718           0 :   }
     719             : 
     720           0 :   if (!valid_failure_percentage_hosts.empty() &&
     721           0 :       valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) {
     722           0 :     const double failure_percentage_threshold = runtime_.snapshot().getInteger(
     723           0 :         FailurePercentageThresholdRuntime, config_.failurePercentageThreshold());
     724             : 
     725           0 :     for (const auto& host_success_rate_pair : valid_failure_percentage_hosts) {
     726           0 :       if ((100.0 - host_success_rate_pair.success_rate_) >= failure_percentage_threshold) {
     727             :         // We should eject.
     728             : 
     729             :         // The ejection type returned by the SuccessRateMonitor's getEjectionType() will be a
     730             :         // SUCCESS_RATE type, so we need to figure it out for ourselves.
     731           0 :         const envoy::data::cluster::v3::OutlierEjectionType type =
     732           0 :             (monitor_type == DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)
     733           0 :                 ? envoy::data::cluster::v3::FAILURE_PERCENTAGE
     734           0 :                 : envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN;
     735           0 :         updateDetectedEjectionStats(type);
     736           0 :         ejectHost(host_success_rate_pair.host_, type);
     737           0 :       }
     738           0 :     }
     739           0 :   }
     740           0 : }
     741             : 
     742           0 : void DetectorImpl::onIntervalTimer() {
     743           0 :   MonotonicTime now = time_source_.monotonicTime();
     744             : 
     745           0 :   for (auto host : host_monitors_) {
     746           0 :     checkHostForUneject(host.first, host.second, now);
     747             : 
     748             :     // Need to update the writer bucket to keep the data valid.
     749           0 :     host.second->updateCurrentSuccessRateBucket();
     750             :     // Refresh host success rate stat for the /clusters endpoint. If there is a new valid value, it
     751             :     // will get updated in processSuccessRateEjections().
     752           0 :     host.second->successRate(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin, -1);
     753           0 :     host.second->successRate(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin, -1);
     754           0 :   }
     755             : 
     756           0 :   processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin);
     757           0 :   processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin);
     758             : 
     759             :   // Decrement time backoff for all hosts which have not been ejected.
     760           0 :   for (auto host : host_monitors_) {
     761           0 :     if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
     762           0 :       auto& monitor = host.second;
     763             :       // Node is healthy and was not ejected since the last check.
     764           0 :       if (monitor->lastUnejectionTime().has_value() &&
     765           0 :           ((now - monitor->lastUnejectionTime().value()) >=
     766           0 :            std::chrono::milliseconds(
     767           0 :                runtime_.snapshot().getInteger(IntervalMsRuntime, config_.intervalMs())))) {
     768           0 :         if (monitor->ejectTimeBackoff() != 0) {
     769           0 :           monitor->ejectTimeBackoff()--;
     770           0 :         }
     771           0 :       }
     772           0 :     }
     773           0 :   }
     774             : 
     775           0 :   armIntervalTimer();
     776           0 : }
     777             : 
     778           0 : void DetectorImpl::runCallbacks(HostSharedPtr host) {
     779           0 :   for (const ChangeStateCb& cb : callbacks_) {
     780           0 :     cb(host);
     781           0 :   }
     782           0 : }
     783             : 
     784             : void EventLoggerImpl::logEject(const HostDescriptionConstSharedPtr& host, Detector& detector,
     785           0 :                                envoy::data::cluster::v3::OutlierEjectionType type, bool enforced) {
     786           0 :   envoy::data::cluster::v3::OutlierDetectionEvent event;
     787           0 :   event.set_type(type);
     788             : 
     789           0 :   absl::optional<MonotonicTime> time = host->outlierDetector().lastUnejectionTime();
     790           0 :   setCommonEventParams(event, host, time);
     791             : 
     792           0 :   event.set_action(envoy::data::cluster::v3::EJECT);
     793             : 
     794           0 :   event.set_enforced(enforced);
     795             : 
     796           0 :   if ((type == envoy::data::cluster::v3::SUCCESS_RATE) ||
     797           0 :       (type == envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN)) {
     798           0 :     const DetectorHostMonitor::SuccessRateMonitorType monitor_type =
     799           0 :         (type == envoy::data::cluster::v3::SUCCESS_RATE)
     800           0 :             ? DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin
     801           0 :             : DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin;
     802           0 :     event.mutable_eject_success_rate_event()->set_cluster_average_success_rate(
     803           0 :         detector.successRateAverage(monitor_type));
     804           0 :     event.mutable_eject_success_rate_event()->set_cluster_success_rate_ejection_threshold(
     805           0 :         detector.successRateEjectionThreshold(monitor_type));
     806           0 :     event.mutable_eject_success_rate_event()->set_host_success_rate(
     807           0 :         host->outlierDetector().successRate(monitor_type));
     808           0 :   } else if ((type == envoy::data::cluster::v3::FAILURE_PERCENTAGE) ||
     809           0 :              (type == envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN)) {
     810           0 :     const DetectorHostMonitor::SuccessRateMonitorType monitor_type =
     811           0 :         (type == envoy::data::cluster::v3::FAILURE_PERCENTAGE)
     812           0 :             ? DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin
     813           0 :             : DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin;
     814           0 :     event.mutable_eject_failure_percentage_event()->set_host_success_rate(
     815           0 :         host->outlierDetector().successRate(monitor_type));
     816           0 :   } else {
     817           0 :     event.mutable_eject_consecutive_event();
     818           0 :   }
     819             : 
     820           0 :   std::string json;
     821           0 : #ifdef ENVOY_ENABLE_YAML
     822           0 :   json = MessageUtil::getJsonStringFromMessageOrError(event, /* pretty_print */ false,
     823           0 :                                                       /* always_print_primitive_fields */ true);
     824             : #else
     825             :   IS_ENVOY_BUG("attempting outlier logging with JSON support removed");
     826             : #endif
     827           0 :   file_->write(fmt::format("{}\n", json));
     828           0 : }
     829             : 
     830           0 : void EventLoggerImpl::logUneject(const HostDescriptionConstSharedPtr& host) {
     831           0 :   envoy::data::cluster::v3::OutlierDetectionEvent event;
     832             : 
     833           0 :   absl::optional<MonotonicTime> time = host->outlierDetector().lastEjectionTime();
     834           0 :   setCommonEventParams(event, host, time);
     835             : 
     836           0 :   event.set_action(envoy::data::cluster::v3::UNEJECT);
     837             : 
     838           0 :   std::string json;
     839           0 : #ifdef ENVOY_ENABLE_YAML
     840           0 :   json = MessageUtil::getJsonStringFromMessageOrError(event, /* pretty_print */ false,
     841           0 :                                                       /* always_print_primitive_fields */ true);
     842             : #else
     843             :   IS_ENVOY_BUG("attempting outlier logging with JSON support removed");
     844             : #endif
     845           0 :   file_->write(fmt::format("{}\n", json));
     846           0 : }
     847             : 
     848             : void EventLoggerImpl::setCommonEventParams(envoy::data::cluster::v3::OutlierDetectionEvent& event,
     849             :                                            const HostDescriptionConstSharedPtr& host,
     850           0 :                                            absl::optional<MonotonicTime> time) {
     851           0 :   MonotonicTime monotonic_now = time_source_.monotonicTime();
     852           0 :   if (time) {
     853           0 :     std::chrono::seconds secsFromLastAction =
     854           0 :         std::chrono::duration_cast<std::chrono::seconds>(monotonic_now - time.value());
     855           0 :     event.mutable_secs_since_last_action()->set_value(secsFromLastAction.count());
     856           0 :   }
     857           0 :   event.set_cluster_name(host->cluster().name());
     858           0 :   event.set_upstream_url(host->address()->asString());
     859           0 :   event.set_num_ejections(host->outlierDetector().numEjections());
     860           0 :   TimestampUtil::systemClockToTimestamp(time_source_.systemTime(), *event.mutable_timestamp());
     861           0 : }
     862             : 
     863           0 : SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() {
     864             :   // Right now current is being written to and backup is not. Flush the backup and swap.
     865           0 :   backup_success_rate_bucket_->success_request_counter_ = 0;
     866           0 :   backup_success_rate_bucket_->total_request_counter_ = 0;
     867             : 
     868           0 :   current_success_rate_bucket_.swap(backup_success_rate_bucket_);
     869             : 
     870           0 :   return current_success_rate_bucket_.get();
     871           0 : }
     872             : 
     873           0 : absl::optional<std::pair<double, uint64_t>> SuccessRateAccumulator::getSuccessRateAndVolume() {
     874           0 :   if (!backup_success_rate_bucket_->total_request_counter_) {
     875           0 :     return absl::nullopt;
     876           0 :   }
     877             : 
     878           0 :   double success_rate = backup_success_rate_bucket_->success_request_counter_ * 100.0 /
     879           0 :                         backup_success_rate_bucket_->total_request_counter_;
     880             : 
     881           0 :   return {{success_rate, backup_success_rate_bucket_->total_request_counter_}};
     882           0 : }
     883             : 
     884             : } // namespace Outlier
     885             : } // namespace Upstream
     886             : } // namespace Envoy

Generated by: LCOV version 1.15