Line data Source code
1 : #include "source/common/upstream/outlier_detection_impl.h"
2 :
3 : #include <chrono>
4 : #include <cstdint>
5 : #include <memory>
6 : #include <string>
7 : #include <vector>
8 :
9 : #include "envoy/config/cluster/v3/cluster.pb.h"
10 : #include "envoy/config/cluster/v3/outlier_detection.pb.h"
11 : #include "envoy/data/cluster/v3/outlier_detection_event.pb.h"
12 : #include "envoy/event/dispatcher.h"
13 : #include "envoy/stats/scope.h"
14 :
15 : #include "source/common/common/assert.h"
16 : #include "source/common/common/enum_to_int.h"
17 : #include "source/common/common/fmt.h"
18 : #include "source/common/common/utility.h"
19 : #include "source/common/http/codes.h"
20 : #include "source/common/protobuf/utility.h"
21 :
22 : namespace Envoy {
23 : namespace Upstream {
24 : namespace Outlier {
25 :
26 : absl::StatusOr<DetectorSharedPtr> DetectorImplFactory::createForCluster(
27 : Cluster& cluster, const envoy::config::cluster::v3::Cluster& cluster_config,
28 : Event::Dispatcher& dispatcher, Runtime::Loader& runtime, EventLoggerSharedPtr event_logger,
29 159 : Random::RandomGenerator& random) {
30 159 : if (cluster_config.has_outlier_detection()) {
31 :
32 0 : return DetectorImpl::create(cluster, cluster_config.outlier_detection(), dispatcher, runtime,
33 0 : dispatcher.timeSource(), std::move(event_logger), random);
34 159 : } else {
35 159 : return nullptr;
36 159 : }
37 159 : }
38 :
39 : DetectorHostMonitorImpl::DetectorHostMonitorImpl(std::shared_ptr<DetectorImpl> detector,
40 : HostSharedPtr host)
41 : : detector_(detector), host_(host),
42 : // add Success Rate monitors
43 : external_origin_sr_monitor_(envoy::data::cluster::v3::SUCCESS_RATE),
44 0 : local_origin_sr_monitor_(envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN) {
45 : // Setup method to call when putResult is invoked. Depending on the config's
46 : // split_external_local_origin_errors_ boolean value different method is called.
47 0 : put_result_func_ = detector->config().splitExternalLocalOriginErrors()
48 0 : ? &DetectorHostMonitorImpl::putResultWithLocalExternalSplit
49 0 : : &DetectorHostMonitorImpl::putResultNoLocalExternalSplit;
50 0 : }
51 :
52 0 : void DetectorHostMonitorImpl::eject(MonotonicTime ejection_time) {
53 0 : ASSERT(!host_.lock()->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK));
54 0 : host_.lock()->healthFlagSet(Host::HealthFlag::FAILED_OUTLIER_CHECK);
55 0 : num_ejections_++;
56 0 : last_ejection_time_ = ejection_time;
57 0 : }
58 :
59 0 : void DetectorHostMonitorImpl::uneject(MonotonicTime unejection_time) {
60 0 : last_unejection_time_ = (unejection_time);
61 0 : }
62 :
63 0 : void DetectorHostMonitorImpl::updateCurrentSuccessRateBucket() {
64 0 : external_origin_sr_monitor_.updateCurrentSuccessRateBucket();
65 0 : local_origin_sr_monitor_.updateCurrentSuccessRateBucket();
66 0 : }
67 :
68 0 : void DetectorHostMonitorImpl::putHttpResponseCode(uint64_t response_code) {
69 0 : external_origin_sr_monitor_.incTotalReqCounter();
70 0 : if (Http::CodeUtility::is5xx(response_code)) {
71 0 : std::shared_ptr<DetectorImpl> detector = detector_.lock();
72 0 : if (!detector) {
73 : // It's possible for the cluster/detector to go away while we still have a host in use.
74 0 : return;
75 0 : }
76 0 : if (Http::CodeUtility::isGatewayError(response_code)) {
77 0 : if (++consecutive_gateway_failure_ ==
78 0 : detector->runtime().snapshot().getInteger(
79 0 : ConsecutiveGatewayFailureRuntime, detector->config().consecutiveGatewayFailure())) {
80 0 : detector->onConsecutiveGatewayFailure(host_.lock());
81 0 : }
82 0 : } else {
83 0 : consecutive_gateway_failure_ = 0;
84 0 : }
85 :
86 0 : if (++consecutive_5xx_ == detector->runtime().snapshot().getInteger(
87 0 : Consecutive5xxRuntime, detector->config().consecutive5xx())) {
88 0 : detector->onConsecutive5xx(host_.lock());
89 0 : }
90 0 : } else {
91 0 : external_origin_sr_monitor_.incSuccessReqCounter();
92 0 : consecutive_5xx_ = 0;
93 0 : consecutive_gateway_failure_ = 0;
94 0 : }
95 0 : }
96 :
97 0 : absl::optional<Http::Code> DetectorHostMonitorImpl::resultToHttpCode(Result result) {
98 0 : Http::Code http_code = Http::Code::InternalServerError;
99 :
100 0 : switch (result) {
101 0 : case Result::ExtOriginRequestSuccess:
102 0 : case Result::LocalOriginConnectSuccessFinal:
103 0 : http_code = Http::Code::OK;
104 0 : break;
105 0 : case Result::LocalOriginTimeout:
106 0 : http_code = Http::Code::GatewayTimeout;
107 0 : break;
108 0 : case Result::LocalOriginConnectFailed:
109 0 : http_code = Http::Code::ServiceUnavailable;
110 0 : break;
111 0 : case Result::ExtOriginRequestFailed:
112 0 : http_code = Http::Code::InternalServerError;
113 0 : break;
114 : // LOCAL_ORIGIN_CONNECT_SUCCESS is used is 2-layer protocols, like HTTP.
115 : // First connection is established and then higher level protocol runs.
116 : // If error happens in higher layer protocol, it will be mapped to
117 : // HTTP code indicating error. In order not to intervene with result of
118 : // higher layer protocol, this code is not mapped to HTTP code.
119 0 : case Result::LocalOriginConnectSuccess:
120 0 : return absl::nullopt;
121 0 : }
122 :
123 0 : return {http_code};
124 0 : }
125 :
126 : // Method is called by putResult when external and local origin errors
127 : // are not treated differently. All errors are mapped to HTTP codes.
128 : // Depending on the value of the parameter *code* the function behaves differently:
129 : // - if the *code* is not defined, mapping uses resultToHttpCode method to do mapping.
130 : // - if *code* is defined, it is taken as HTTP code and reported as such to outlier detector.
131 : void DetectorHostMonitorImpl::putResultNoLocalExternalSplit(Result result,
132 0 : absl::optional<uint64_t> code) {
133 0 : if (code) {
134 0 : putHttpResponseCode(code.value());
135 0 : } else {
136 0 : absl::optional<Http::Code> http_code = resultToHttpCode(result);
137 0 : if (http_code) {
138 0 : putHttpResponseCode(enumToInt(http_code.value()));
139 0 : }
140 0 : }
141 0 : }
142 :
143 : // Method is called by putResult when external and local origin errors
144 : // are treated separately. Local origin errors have separate counters and
145 : // separate success rate monitor.
146 : void DetectorHostMonitorImpl::putResultWithLocalExternalSplit(Result result,
147 0 : absl::optional<uint64_t>) {
148 0 : switch (result) {
149 : // SUCCESS is used to report success for connection level. Server may still respond with
150 : // error, but connection to server was OK.
151 0 : case Result::LocalOriginConnectSuccess:
152 0 : case Result::LocalOriginConnectSuccessFinal:
153 0 : return localOriginNoFailure();
154 : // Connectivity related errors.
155 0 : case Result::LocalOriginTimeout:
156 0 : case Result::LocalOriginConnectFailed:
157 0 : return localOriginFailure();
158 : // EXT_ORIGIN_REQUEST_FAILED is used when connection to server was successful, but transaction on
159 : // server level failed. Since it it similar to HTTP 5xx, map it to 5xx handler.
160 0 : case Result::ExtOriginRequestFailed:
161 : // map it to http code and call http handler.
162 0 : return putHttpResponseCode(enumToInt(Http::Code::ServiceUnavailable));
163 : // EXT_ORIGIN_REQUEST_SUCCESS is used to report that transaction with non-http server was
164 : // completed successfully. This means that connection and server level transactions were
165 : // successful. Map it to http code 200 OK and indicate that there was no errors on connection
166 : // level.
167 0 : case Result::ExtOriginRequestSuccess:
168 0 : putHttpResponseCode(enumToInt(Http::Code::OK));
169 0 : localOriginNoFailure();
170 0 : break;
171 0 : }
172 0 : }
173 :
174 : // Method is used by other components to reports success or error.
175 : // It calls putResultWithLocalExternalSplit or put putResultNoLocalExternalSplit via
176 : // std::function. The setting happens in constructor based on split_external_local_origin_errors
177 : // config parameter.
178 0 : void DetectorHostMonitorImpl::putResult(Result result, absl::optional<uint64_t> code) {
179 0 : put_result_func_(this, result, code);
180 0 : }
181 :
182 0 : void DetectorHostMonitorImpl::localOriginFailure() {
183 0 : std::shared_ptr<DetectorImpl> detector = detector_.lock();
184 0 : if (!detector) {
185 : // It's possible for the cluster/detector to go away while we still have a host in use.
186 0 : return;
187 0 : }
188 0 : local_origin_sr_monitor_.incTotalReqCounter();
189 0 : if (++consecutive_local_origin_failure_ ==
190 0 : detector->runtime().snapshot().getInteger(
191 0 : ConsecutiveLocalOriginFailureRuntime,
192 0 : detector->config().consecutiveLocalOriginFailure())) {
193 0 : detector->onConsecutiveLocalOriginFailure(host_.lock());
194 0 : }
195 0 : }
196 :
197 0 : void DetectorHostMonitorImpl::localOriginNoFailure() {
198 0 : std::shared_ptr<DetectorImpl> detector = detector_.lock();
199 0 : if (!detector) {
200 : // It's possible for the cluster/detector to go away while we still have a host in use.
201 0 : return;
202 0 : }
203 :
204 0 : local_origin_sr_monitor_.incTotalReqCounter();
205 0 : local_origin_sr_monitor_.incSuccessReqCounter();
206 :
207 0 : resetConsecutiveLocalOriginFailure();
208 0 : }
209 :
// Captures the outlier detection settings from the `config` proto into plain members.
// Every numeric field is read through a PROTOBUF_GET_MS_OR_DEFAULT or
// PROTOBUF_GET_WRAPPED_OR_DEFAULT macro paired with a DEFAULT_* constant and cast to uint64_t;
// presumably the macros fall back to that constant when the field is unset (verify against the
// macro definitions). split_external_local_origin_errors is read directly from the proto.
// NOTE(review): max_ejection_time_ms_ reads base_ejection_time_ms_ in its own initializer, so it
// relies on base_ejection_time_ms_ being declared earlier in the class -- confirm in the header.
210 : DetectorConfig::DetectorConfig(const envoy::config::cluster::v3::OutlierDetection& config)
211 : : interval_ms_(
212 : static_cast<uint64_t>(PROTOBUF_GET_MS_OR_DEFAULT(config, interval, DEFAULT_INTERVAL_MS))),
213 : base_ejection_time_ms_(static_cast<uint64_t>(
214 : PROTOBUF_GET_MS_OR_DEFAULT(config, base_ejection_time, DEFAULT_BASE_EJECTION_TIME_MS))),
215 : consecutive_5xx_(static_cast<uint64_t>(
216 : PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, consecutive_5xx, DEFAULT_CONSECUTIVE_5XX))),
217 : consecutive_gateway_failure_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
218 : config, consecutive_gateway_failure, DEFAULT_CONSECUTIVE_GATEWAY_FAILURE))),
219 : max_ejection_percent_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
220 : config, max_ejection_percent, DEFAULT_MAX_EJECTION_PERCENT))),
221 : success_rate_minimum_hosts_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
222 : config, success_rate_minimum_hosts, DEFAULT_SUCCESS_RATE_MINIMUM_HOSTS))),
223 : success_rate_request_volume_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
224 : config, success_rate_request_volume, DEFAULT_SUCCESS_RATE_REQUEST_VOLUME))),
225 : success_rate_stdev_factor_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
226 : config, success_rate_stdev_factor, DEFAULT_SUCCESS_RATE_STDEV_FACTOR))),
227 : failure_percentage_threshold_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
228 : config, failure_percentage_threshold, DEFAULT_FAILURE_PERCENTAGE_THRESHOLD))),
229 : failure_percentage_minimum_hosts_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
230 : config, failure_percentage_minimum_hosts, DEFAULT_FAILURE_PERCENTAGE_MINIMUM_HOSTS))),
231 : failure_percentage_request_volume_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
232 : config, failure_percentage_request_volume, DEFAULT_FAILURE_PERCENTAGE_REQUEST_VOLUME))),
233 : enforcing_consecutive_5xx_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
234 : config, enforcing_consecutive_5xx, DEFAULT_ENFORCING_CONSECUTIVE_5XX))),
235 : enforcing_consecutive_gateway_failure_(static_cast<uint64_t>(
236 : PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_gateway_failure,
237 : DEFAULT_ENFORCING_CONSECUTIVE_GATEWAY_FAILURE))),
238 : enforcing_success_rate_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
239 : config, enforcing_success_rate, DEFAULT_ENFORCING_SUCCESS_RATE))),
240 : enforcing_failure_percentage_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
241 : config, enforcing_failure_percentage, DEFAULT_ENFORCING_FAILURE_PERCENTAGE))),
242 : enforcing_failure_percentage_local_origin_(static_cast<uint64_t>(
243 : PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_failure_percentage_local_origin,
244 : DEFAULT_ENFORCING_FAILURE_PERCENTAGE_LOCAL_ORIGIN))),
245 : split_external_local_origin_errors_(config.split_external_local_origin_errors()),
246 : consecutive_local_origin_failure_(static_cast<uint64_t>(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
247 : config, consecutive_local_origin_failure, DEFAULT_CONSECUTIVE_LOCAL_ORIGIN_FAILURE))),
248 : enforcing_consecutive_local_origin_failure_(static_cast<uint64_t>(
249 : PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_local_origin_failure,
250 : DEFAULT_ENFORCING_CONSECUTIVE_LOCAL_ORIGIN_FAILURE))),
251 : enforcing_local_origin_success_rate_(static_cast<uint64_t>(
252 : PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_local_origin_success_rate,
253 : DEFAULT_ENFORCING_LOCAL_ORIGIN_SUCCESS_RATE))),
254 : // If max_ejection_time was not specified in the config, apply the default or
255 : // base_ejection_time whatever is larger.
256 : max_ejection_time_ms_(static_cast<uint64_t>(PROTOBUF_GET_MS_OR_DEFAULT(
257 : config, max_ejection_time,
258 : std::max(DEFAULT_MAX_EJECTION_TIME_MS, base_ejection_time_ms_)))),
259 : max_ejection_time_jitter_ms_(static_cast<uint64_t>(PROTOBUF_GET_MS_OR_DEFAULT(
260 : config, max_ejection_time_jitter, DEFAULT_MAX_EJECTION_TIME_JITTER_MS))),
// The fallback passed for successful_active_health_check_uneject_host is `true`.
261 : successful_active_health_check_uneject_host_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(
262 0 : config, successful_active_health_check_uneject_host, true)) {}
263 :
264 : DetectorImpl::DetectorImpl(const Cluster& cluster,
265 : const envoy::config::cluster::v3::OutlierDetection& config,
266 : Event::Dispatcher& dispatcher, Runtime::Loader& runtime,
267 : TimeSource& time_source, EventLoggerSharedPtr event_logger,
268 : Random::RandomGenerator& random)
269 : : config_(config), dispatcher_(dispatcher), runtime_(runtime), time_source_(time_source),
270 : stats_(generateStats(cluster.info()->statsScope())),
271 0 : interval_timer_(dispatcher.createTimer([this]() -> void { onIntervalTimer(); })),
272 0 : event_logger_(event_logger), random_generator_(random) {
273 : // Insert success rate initial numbers for each type of SR detector
274 0 : external_origin_sr_num_ = {-1, -1};
275 0 : local_origin_sr_num_ = {-1, -1};
276 0 : }
277 :
278 0 : DetectorImpl::~DetectorImpl() {
279 0 : for (const auto& host : host_monitors_) {
280 0 : if (host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
281 0 : ASSERT(ejections_active_helper_.value() > 0);
282 0 : ejections_active_helper_.dec();
283 0 : }
284 0 : }
285 0 : }
286 :
287 : absl::StatusOr<std::shared_ptr<DetectorImpl>>
288 : DetectorImpl::create(Cluster& cluster, const envoy::config::cluster::v3::OutlierDetection& config,
289 : Event::Dispatcher& dispatcher, Runtime::Loader& runtime,
290 : TimeSource& time_source, EventLoggerSharedPtr event_logger,
291 0 : Random::RandomGenerator& random) {
292 0 : std::shared_ptr<DetectorImpl> detector(
293 0 : new DetectorImpl(cluster, config, dispatcher, runtime, time_source, event_logger, random));
294 :
295 0 : if (detector->config().maxEjectionTimeMs() < detector->config().baseEjectionTimeMs()) {
296 0 : return absl::InvalidArgumentError(
297 0 : "outlier detector's max_ejection_time cannot be smaller than base_ejection_time");
298 0 : }
299 0 : detector->initialize(cluster);
300 :
301 0 : return detector;
302 0 : }
303 :
304 0 : void DetectorImpl::initialize(Cluster& cluster) {
305 0 : for (auto& host_set : cluster.prioritySet().hostSetsPerPriority()) {
306 0 : for (const HostSharedPtr& host : host_set->hosts()) {
307 0 : addHostMonitor(host);
308 0 : }
309 0 : }
310 :
311 0 : if (config_.successfulActiveHealthCheckUnejectHost() && cluster.healthChecker() != nullptr) {
312 0 : cluster.healthChecker()->addHostCheckCompleteCb([this](HostSharedPtr host, HealthTransition) {
313 : // If the host is ejected by outlier detection and active health check succeeds,
314 : // we should treat this host as healthy.
315 0 : if (!host->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC) &&
316 0 : host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
317 0 : host->healthFlagClear(Host::HealthFlag::FAILED_OUTLIER_CHECK);
318 0 : unejectHost(host);
319 0 : }
320 0 : });
321 0 : }
322 0 : member_update_cb_ = cluster.prioritySet().addMemberUpdateCb(
323 0 : [this](const HostVector& hosts_added, const HostVector& hosts_removed) -> void {
324 0 : for (const HostSharedPtr& host : hosts_added) {
325 0 : addHostMonitor(host);
326 0 : }
327 :
328 0 : for (const HostSharedPtr& host : hosts_removed) {
329 0 : ASSERT(host_monitors_.count(host) == 1);
330 0 : if (host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
331 0 : ASSERT(ejections_active_helper_.value() > 0);
332 0 : ejections_active_helper_.dec();
333 0 : }
334 :
335 0 : host_monitors_.erase(host);
336 0 : }
337 0 : });
338 :
339 0 : armIntervalTimer();
340 0 : }
341 :
342 0 : void DetectorImpl::addHostMonitor(HostSharedPtr host) {
343 0 : ASSERT(host_monitors_.count(host) == 0);
344 0 : DetectorHostMonitorImpl* monitor = new DetectorHostMonitorImpl(shared_from_this(), host);
345 0 : host_monitors_[host] = monitor;
346 0 : host->setOutlierDetector(DetectorHostMonitorPtr{monitor});
347 0 : }
348 :
349 0 : void DetectorImpl::armIntervalTimer() {
350 0 : interval_timer_->enableTimer(std::chrono::milliseconds(
351 0 : runtime_.snapshot().getInteger(IntervalMsRuntime, config_.intervalMs())));
352 0 : }
353 :
354 : void DetectorImpl::checkHostForUneject(HostSharedPtr host, DetectorHostMonitorImpl* monitor,
355 0 : MonotonicTime now) {
356 0 : if (!host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
357 0 : return;
358 0 : }
359 :
360 0 : const std::chrono::milliseconds base_eject_time = std::chrono::milliseconds(
361 0 : runtime_.snapshot().getInteger(BaseEjectionTimeMsRuntime, config_.baseEjectionTimeMs()));
362 0 : const std::chrono::milliseconds max_eject_time = std::chrono::milliseconds(
363 0 : runtime_.snapshot().getInteger(MaxEjectionTimeMsRuntime, config_.maxEjectionTimeMs()));
364 0 : const std::chrono::milliseconds jitter = monitor->getJitter();
365 0 : ASSERT(monitor->numEjections() > 0);
366 0 : if ((min(base_eject_time * monitor->ejectTimeBackoff(), max_eject_time) + jitter) <=
367 0 : (now - monitor->lastEjectionTime().value())) {
368 0 : unejectHost(host);
369 0 : }
370 0 : }
371 :
372 0 : void DetectorImpl::unejectHost(HostSharedPtr host) {
373 0 : ejections_active_helper_.dec();
374 0 : host->healthFlagClear(Host::HealthFlag::FAILED_OUTLIER_CHECK);
375 : // Reset the consecutive failure counters to avoid re-ejection on very few new errors due
376 : // to the non-triggering counter being close to its trigger value.
377 0 : host_monitors_[host]->resetConsecutive5xx();
378 0 : host_monitors_[host]->resetConsecutiveGatewayFailure();
379 0 : host_monitors_[host]->resetConsecutiveLocalOriginFailure();
380 0 : host_monitors_[host]->uneject(time_source_.monotonicTime());
381 0 : runCallbacks(host);
382 :
383 0 : if (event_logger_) {
384 0 : event_logger_->logUneject(host);
385 0 : }
386 0 : }
387 :
388 0 : bool DetectorImpl::enforceEjection(envoy::data::cluster::v3::OutlierEjectionType type) {
389 0 : switch (type) {
390 0 : PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
391 0 : case envoy::data::cluster::v3::CONSECUTIVE_5XX:
392 0 : return runtime_.snapshot().featureEnabled(EnforcingConsecutive5xxRuntime,
393 0 : config_.enforcingConsecutive5xx());
394 0 : case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
395 0 : return runtime_.snapshot().featureEnabled(EnforcingConsecutiveGatewayFailureRuntime,
396 0 : config_.enforcingConsecutiveGatewayFailure());
397 0 : case envoy::data::cluster::v3::SUCCESS_RATE:
398 0 : return runtime_.snapshot().featureEnabled(EnforcingSuccessRateRuntime,
399 0 : config_.enforcingSuccessRate());
400 0 : case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
401 0 : return runtime_.snapshot().featureEnabled(EnforcingConsecutiveLocalOriginFailureRuntime,
402 0 : config_.enforcingConsecutiveLocalOriginFailure());
403 0 : case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
404 0 : return runtime_.snapshot().featureEnabled(EnforcingLocalOriginSuccessRateRuntime,
405 0 : config_.enforcingLocalOriginSuccessRate());
406 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
407 0 : return runtime_.snapshot().featureEnabled(EnforcingFailurePercentageRuntime,
408 0 : config_.enforcingFailurePercentage());
409 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
410 0 : return runtime_.snapshot().featureEnabled(EnforcingFailurePercentageLocalOriginRuntime,
411 0 : config_.enforcingFailurePercentageLocalOrigin());
412 0 : }
413 :
414 0 : PANIC_DUE_TO_CORRUPT_ENUM;
415 0 : }
416 :
417 0 : void DetectorImpl::updateEnforcedEjectionStats(envoy::data::cluster::v3::OutlierEjectionType type) {
418 0 : stats_.ejections_enforced_total_.inc();
419 0 : switch (type) {
420 0 : PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
421 0 : case envoy::data::cluster::v3::SUCCESS_RATE:
422 0 : stats_.ejections_enforced_success_rate_.inc();
423 0 : break;
424 0 : case envoy::data::cluster::v3::CONSECUTIVE_5XX:
425 0 : stats_.ejections_enforced_consecutive_5xx_.inc();
426 0 : break;
427 0 : case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
428 0 : stats_.ejections_enforced_consecutive_gateway_failure_.inc();
429 0 : break;
430 0 : case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
431 0 : stats_.ejections_enforced_consecutive_local_origin_failure_.inc();
432 0 : break;
433 0 : case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
434 0 : stats_.ejections_enforced_local_origin_success_rate_.inc();
435 0 : break;
436 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
437 0 : stats_.ejections_enforced_failure_percentage_.inc();
438 0 : break;
439 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
440 0 : stats_.ejections_enforced_local_origin_failure_percentage_.inc();
441 0 : break;
442 0 : }
443 0 : }
444 :
445 0 : void DetectorImpl::updateDetectedEjectionStats(envoy::data::cluster::v3::OutlierEjectionType type) {
446 0 : switch (type) {
447 0 : PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
448 0 : case envoy::data::cluster::v3::SUCCESS_RATE:
449 0 : stats_.ejections_detected_success_rate_.inc();
450 0 : break;
451 0 : case envoy::data::cluster::v3::CONSECUTIVE_5XX:
452 0 : stats_.ejections_detected_consecutive_5xx_.inc();
453 0 : break;
454 0 : case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
455 0 : stats_.ejections_detected_consecutive_gateway_failure_.inc();
456 0 : break;
457 0 : case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
458 0 : stats_.ejections_detected_consecutive_local_origin_failure_.inc();
459 0 : break;
460 0 : case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
461 0 : stats_.ejections_detected_local_origin_success_rate_.inc();
462 0 : break;
463 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
464 0 : stats_.ejections_detected_failure_percentage_.inc();
465 0 : break;
466 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
467 0 : stats_.ejections_detected_local_origin_failure_percentage_.inc();
468 0 : break;
469 0 : }
470 0 : }
471 :
472 : void DetectorImpl::ejectHost(HostSharedPtr host,
473 0 : envoy::data::cluster::v3::OutlierEjectionType type) {
474 0 : uint64_t max_ejection_percent = std::min<uint64_t>(
475 0 : 100, runtime_.snapshot().getInteger(MaxEjectionPercentRuntime, config_.maxEjectionPercent()));
476 0 : double ejected_percent = 100.0 * (ejections_active_helper_.value() + 1) / host_monitors_.size();
477 : // Note this is not currently checked per-priority level, so it is possible
478 : // for outlier detection to eject all hosts at any given priority level.
479 0 : bool should_eject = (ejected_percent <= max_ejection_percent);
480 0 : if (!Runtime::runtimeFeatureEnabled("envoy.reloadable_features.check_mep_on_first_eject")) {
481 0 : should_eject = (ejections_active_helper_.value() == 0) || should_eject;
482 0 : }
483 0 : if (should_eject) {
484 0 : if (type == envoy::data::cluster::v3::CONSECUTIVE_5XX ||
485 0 : type == envoy::data::cluster::v3::SUCCESS_RATE) {
486 : // Deprecated counter, preserving old behaviour until it's removed.
487 0 : stats_.ejections_total_.inc();
488 0 : }
489 0 : if (enforceEjection(type)) {
490 0 : ejections_active_helper_.inc();
491 0 : updateEnforcedEjectionStats(type);
492 0 : host_monitors_[host]->eject(time_source_.monotonicTime());
493 0 : const std::chrono::milliseconds base_eject_time = std::chrono::milliseconds(
494 0 : runtime_.snapshot().getInteger(BaseEjectionTimeMsRuntime, config_.baseEjectionTimeMs()));
495 0 : const std::chrono::milliseconds max_eject_time = std::chrono::milliseconds(
496 0 : runtime_.snapshot().getInteger(MaxEjectionTimeMsRuntime, config_.maxEjectionTimeMs()));
497 :
498 : // Generate random jitter so that not all hosts uneject at the same time,
499 : // which could possibly generate a connection storm.
500 :
501 : // Retrieve max_eject_time_jitter configuration and then calculate the jitter.
502 0 : const uint64_t max_eject_time_jitter = runtime_.snapshot().getInteger(
503 0 : MaxEjectionTimeJitterMsRuntime, config_.maxEjectionTimeJitterMs());
504 :
505 0 : const std::chrono::milliseconds jitter =
506 0 : std::chrono::milliseconds(random_generator_() % (max_eject_time_jitter + 1));
507 :
508 : // Save the jitter on the current host_monitor.
509 0 : host_monitors_[host]->setJitter(jitter);
510 :
511 0 : if ((host_monitors_[host]->ejectTimeBackoff() * base_eject_time) <
512 0 : (max_eject_time + base_eject_time)) {
513 0 : host_monitors_[host]->ejectTimeBackoff()++;
514 0 : }
515 :
516 0 : runCallbacks(host);
517 0 : if (event_logger_) {
518 0 : event_logger_->logEject(host, *this, type, true);
519 0 : }
520 0 : } else {
521 0 : if (event_logger_) {
522 0 : event_logger_->logEject(host, *this, type, false);
523 0 : }
524 0 : }
525 0 : } else {
526 0 : stats_.ejections_overflow_.inc();
527 0 : }
528 0 : }
529 :
530 0 : DetectionStats DetectorImpl::generateStats(Stats::Scope& scope) {
531 0 : std::string prefix("outlier_detection.");
532 0 : return {ALL_OUTLIER_DETECTION_STATS(POOL_COUNTER_PREFIX(scope, prefix),
533 0 : POOL_GAUGE_PREFIX(scope, prefix))};
534 0 : }
535 :
536 : void DetectorImpl::notifyMainThreadConsecutiveError(
537 0 : HostSharedPtr host, envoy::data::cluster::v3::OutlierEjectionType type) {
538 : // This event will come from all threads, so we synchronize with a post to the main thread.
539 : // NOTE: Unfortunately consecutive errors are complicated from a threading perspective because
540 : // we catch consecutive errors on worker threads and then post back to the main thread.
541 : // Clusters can get removed, and this means there is a race condition with this
542 : // reverse post. The way we handle this is as follows:
543 : // 1) The only strong pointer to the detector is owned by the cluster.
544 : // 2) We post a weak pointer to the main thread.
545 : // 3) If when running on the main thread the weak pointer can be converted to a strong
546 : // pointer, the detector/cluster must still exist so we can safely fire callbacks.
547 : // Otherwise we do nothing since the detector/cluster is already gone.
548 0 : std::weak_ptr<DetectorImpl> weak_this = shared_from_this();
549 0 : dispatcher_.post([weak_this, host, type]() -> void {
550 0 : std::shared_ptr<DetectorImpl> shared_this = weak_this.lock();
551 0 : if (shared_this) {
552 0 : shared_this->onConsecutiveErrorWorker(host, type);
553 0 : }
554 0 : });
555 0 : }
556 :
557 0 : void DetectorImpl::onConsecutive5xx(HostSharedPtr host) {
558 0 : notifyMainThreadConsecutiveError(host, envoy::data::cluster::v3::CONSECUTIVE_5XX);
559 0 : }
560 :
561 0 : void DetectorImpl::onConsecutiveGatewayFailure(HostSharedPtr host) {
562 0 : notifyMainThreadConsecutiveError(host, envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE);
563 0 : }
564 :
565 0 : void DetectorImpl::onConsecutiveLocalOriginFailure(HostSharedPtr host) {
566 0 : notifyMainThreadConsecutiveError(host,
567 0 : envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE);
568 0 : }
569 :
570 : void DetectorImpl::onConsecutiveErrorWorker(HostSharedPtr host,
571 0 : envoy::data::cluster::v3::OutlierEjectionType type) {
572 : // Ejections come in cross thread. There is a chance that the host has already been removed from
573 : // the set. If so, just ignore it.
574 0 : if (host_monitors_.count(host) == 0) {
575 0 : return;
576 0 : }
577 0 : if (host->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
578 0 : return;
579 0 : }
580 :
581 : // We also reset the appropriate counter here to allow the monitor to detect a bout of consecutive
582 : // error responses even if the monitor is not charged with an interleaved non-error code.
583 0 : updateDetectedEjectionStats(type);
584 0 : ejectHost(host, type);
585 :
586 : // reset counters
587 0 : switch (type) {
588 0 : PANIC_ON_PROTO_ENUM_SENTINEL_VALUES;
589 0 : case envoy::data::cluster::v3::SUCCESS_RATE:
590 0 : FALLTHRU;
591 0 : case envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN:
592 0 : FALLTHRU;
593 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE:
594 0 : FALLTHRU;
595 0 : case envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN:
596 0 : IS_ENVOY_BUG("unexpected non-consecutive error");
597 0 : return;
598 0 : case envoy::data::cluster::v3::CONSECUTIVE_5XX:
599 0 : stats_.ejections_consecutive_5xx_.inc(); // Deprecated
600 0 : host_monitors_[host]->resetConsecutive5xx();
601 0 : break;
602 0 : case envoy::data::cluster::v3::CONSECUTIVE_GATEWAY_FAILURE:
603 0 : host_monitors_[host]->resetConsecutiveGatewayFailure();
604 0 : break;
605 0 : case envoy::data::cluster::v3::CONSECUTIVE_LOCAL_ORIGIN_FAILURE:
606 0 : host_monitors_[host]->resetConsecutiveLocalOriginFailure();
607 0 : break;
608 0 : }
609 0 : }
610 :
611 : DetectorImpl::EjectionPair DetectorImpl::successRateEjectionThreshold(
612 : double success_rate_sum, const std::vector<HostSuccessRatePair>& valid_success_rate_hosts,
613 0 : double success_rate_stdev_factor) {
614 : // This function is using mean and standard deviation as statistical measures for outlier
615 : // detection. First the mean is calculated by dividing the sum of success rate data over the
616 : // number of data points. Then variance is calculated by taking the mean of the
617 : // squared difference of data points to the mean of the data. Then standard deviation is
618 : // calculated by taking the square root of the variance. Then the outlier threshold is
619 : // calculated as the difference between the mean and the product of the standard
620 : // deviation and a constant factor.
621 : //
622 : // For example with a data set that looks like success_rate_data = {50, 100, 100, 100, 100} the
623 : // math would work as follows:
624 : // success_rate_sum = 450
625 : // mean = 90
626 : // variance = 400
627 : // stdev = 20
628 : // threshold returned = 52
629 0 : double mean = success_rate_sum / valid_success_rate_hosts.size();
630 0 : double variance = 0;
631 0 : std::for_each(valid_success_rate_hosts.begin(), valid_success_rate_hosts.end(),
632 0 : [&variance, mean](HostSuccessRatePair v) {
633 0 : variance += std::pow(v.success_rate_ - mean, 2);
634 0 : });
635 0 : variance /= valid_success_rate_hosts.size();
636 0 : double stdev = std::sqrt(variance);
637 :
638 0 : return {mean, (mean - (success_rate_stdev_factor * stdev))};
639 0 : }
640 :
641 : void DetectorImpl::processSuccessRateEjections(
642 0 : DetectorHostMonitor::SuccessRateMonitorType monitor_type) {
643 0 : uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger(
644 0 : SuccessRateMinimumHostsRuntime, config_.successRateMinimumHosts());
645 0 : uint64_t success_rate_request_volume = runtime_.snapshot().getInteger(
646 0 : SuccessRateRequestVolumeRuntime, config_.successRateRequestVolume());
647 0 : uint64_t failure_percentage_minimum_hosts = runtime_.snapshot().getInteger(
648 0 : FailurePercentageMinimumHostsRuntime, config_.failurePercentageMinimumHosts());
649 0 : uint64_t failure_percentage_request_volume = runtime_.snapshot().getInteger(
650 0 : FailurePercentageRequestVolumeRuntime, config_.failurePercentageRequestVolume());
651 :
652 0 : std::vector<HostSuccessRatePair> valid_success_rate_hosts;
653 0 : std::vector<HostSuccessRatePair> valid_failure_percentage_hosts;
654 0 : double success_rate_sum = 0;
655 :
656 : // Reset the Detector's success rate mean and stdev.
657 0 : getSRNums(monitor_type) = {-1, -1};
658 :
659 : // Exit early if there are not enough hosts.
660 0 : if (host_monitors_.size() < success_rate_minimum_hosts &&
661 0 : host_monitors_.size() < failure_percentage_minimum_hosts) {
662 0 : return;
663 0 : }
664 :
665 : // reserve upper bound of vector size to avoid reallocation.
666 0 : valid_success_rate_hosts.reserve(host_monitors_.size());
667 0 : valid_failure_percentage_hosts.reserve(host_monitors_.size());
668 :
669 0 : for (const auto& host : host_monitors_) {
670 : // Don't do work if the host is already ejected.
671 0 : if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
672 0 : absl::optional<std::pair<double, uint64_t>> host_success_rate_and_volume =
673 0 : host.second->getSRMonitor(monitor_type)
674 0 : .successRateAccumulator()
675 0 : .getSuccessRateAndVolume();
676 :
677 0 : if (!host_success_rate_and_volume) {
678 0 : continue;
679 0 : }
680 0 : double success_rate = host_success_rate_and_volume.value().first;
681 0 : double request_volume = host_success_rate_and_volume.value().second;
682 :
683 0 : if (request_volume >=
684 0 : std::min(success_rate_request_volume, failure_percentage_request_volume)) {
685 0 : host.second->successRate(monitor_type, success_rate);
686 0 : }
687 :
688 0 : if (request_volume >= success_rate_request_volume) {
689 0 : valid_success_rate_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate));
690 0 : success_rate_sum += success_rate;
691 0 : }
692 0 : if (request_volume >= failure_percentage_request_volume) {
693 0 : valid_failure_percentage_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate));
694 0 : }
695 0 : }
696 0 : }
697 :
698 0 : if (!valid_success_rate_hosts.empty() &&
699 0 : valid_success_rate_hosts.size() >= success_rate_minimum_hosts) {
700 0 : const double success_rate_stdev_factor =
701 0 : runtime_.snapshot().getInteger(SuccessRateStdevFactorRuntime,
702 0 : config_.successRateStdevFactor()) /
703 0 : 1000.0;
704 0 : getSRNums(monitor_type) = successRateEjectionThreshold(
705 0 : success_rate_sum, valid_success_rate_hosts, success_rate_stdev_factor);
706 0 : const double success_rate_ejection_threshold = getSRNums(monitor_type).ejection_threshold_;
707 0 : for (const auto& host_success_rate_pair : valid_success_rate_hosts) {
708 0 : if (host_success_rate_pair.success_rate_ < success_rate_ejection_threshold) {
709 0 : stats_.ejections_success_rate_.inc(); // Deprecated.
710 0 : const envoy::data::cluster::v3::OutlierEjectionType type =
711 0 : host_monitors_[host_success_rate_pair.host_]
712 0 : ->getSRMonitor(monitor_type)
713 0 : .getEjectionType();
714 0 : updateDetectedEjectionStats(type);
715 0 : ejectHost(host_success_rate_pair.host_, type);
716 0 : }
717 0 : }
718 0 : }
719 :
720 0 : if (!valid_failure_percentage_hosts.empty() &&
721 0 : valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) {
722 0 : const double failure_percentage_threshold = runtime_.snapshot().getInteger(
723 0 : FailurePercentageThresholdRuntime, config_.failurePercentageThreshold());
724 :
725 0 : for (const auto& host_success_rate_pair : valid_failure_percentage_hosts) {
726 0 : if ((100.0 - host_success_rate_pair.success_rate_) >= failure_percentage_threshold) {
727 : // We should eject.
728 :
729 : // The ejection type returned by the SuccessRateMonitor's getEjectionType() will be a
730 : // SUCCESS_RATE type, so we need to figure it out for ourselves.
731 0 : const envoy::data::cluster::v3::OutlierEjectionType type =
732 0 : (monitor_type == DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)
733 0 : ? envoy::data::cluster::v3::FAILURE_PERCENTAGE
734 0 : : envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN;
735 0 : updateDetectedEjectionStats(type);
736 0 : ejectHost(host_success_rate_pair.host_, type);
737 0 : }
738 0 : }
739 0 : }
740 0 : }
741 :
742 0 : void DetectorImpl::onIntervalTimer() {
743 0 : MonotonicTime now = time_source_.monotonicTime();
744 :
745 0 : for (auto host : host_monitors_) {
746 0 : checkHostForUneject(host.first, host.second, now);
747 :
748 : // Need to update the writer bucket to keep the data valid.
749 0 : host.second->updateCurrentSuccessRateBucket();
750 : // Refresh host success rate stat for the /clusters endpoint. If there is a new valid value, it
751 : // will get updated in processSuccessRateEjections().
752 0 : host.second->successRate(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin, -1);
753 0 : host.second->successRate(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin, -1);
754 0 : }
755 :
756 0 : processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin);
757 0 : processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin);
758 :
759 : // Decrement time backoff for all hosts which have not been ejected.
760 0 : for (auto host : host_monitors_) {
761 0 : if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) {
762 0 : auto& monitor = host.second;
763 : // Node is healthy and was not ejected since the last check.
764 0 : if (monitor->lastUnejectionTime().has_value() &&
765 0 : ((now - monitor->lastUnejectionTime().value()) >=
766 0 : std::chrono::milliseconds(
767 0 : runtime_.snapshot().getInteger(IntervalMsRuntime, config_.intervalMs())))) {
768 0 : if (monitor->ejectTimeBackoff() != 0) {
769 0 : monitor->ejectTimeBackoff()--;
770 0 : }
771 0 : }
772 0 : }
773 0 : }
774 :
775 0 : armIntervalTimer();
776 0 : }
777 :
778 0 : void DetectorImpl::runCallbacks(HostSharedPtr host) {
779 0 : for (const ChangeStateCb& cb : callbacks_) {
780 0 : cb(host);
781 0 : }
782 0 : }
783 :
784 : void EventLoggerImpl::logEject(const HostDescriptionConstSharedPtr& host, Detector& detector,
785 0 : envoy::data::cluster::v3::OutlierEjectionType type, bool enforced) {
786 0 : envoy::data::cluster::v3::OutlierDetectionEvent event;
787 0 : event.set_type(type);
788 :
789 0 : absl::optional<MonotonicTime> time = host->outlierDetector().lastUnejectionTime();
790 0 : setCommonEventParams(event, host, time);
791 :
792 0 : event.set_action(envoy::data::cluster::v3::EJECT);
793 :
794 0 : event.set_enforced(enforced);
795 :
796 0 : if ((type == envoy::data::cluster::v3::SUCCESS_RATE) ||
797 0 : (type == envoy::data::cluster::v3::SUCCESS_RATE_LOCAL_ORIGIN)) {
798 0 : const DetectorHostMonitor::SuccessRateMonitorType monitor_type =
799 0 : (type == envoy::data::cluster::v3::SUCCESS_RATE)
800 0 : ? DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin
801 0 : : DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin;
802 0 : event.mutable_eject_success_rate_event()->set_cluster_average_success_rate(
803 0 : detector.successRateAverage(monitor_type));
804 0 : event.mutable_eject_success_rate_event()->set_cluster_success_rate_ejection_threshold(
805 0 : detector.successRateEjectionThreshold(monitor_type));
806 0 : event.mutable_eject_success_rate_event()->set_host_success_rate(
807 0 : host->outlierDetector().successRate(monitor_type));
808 0 : } else if ((type == envoy::data::cluster::v3::FAILURE_PERCENTAGE) ||
809 0 : (type == envoy::data::cluster::v3::FAILURE_PERCENTAGE_LOCAL_ORIGIN)) {
810 0 : const DetectorHostMonitor::SuccessRateMonitorType monitor_type =
811 0 : (type == envoy::data::cluster::v3::FAILURE_PERCENTAGE)
812 0 : ? DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin
813 0 : : DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin;
814 0 : event.mutable_eject_failure_percentage_event()->set_host_success_rate(
815 0 : host->outlierDetector().successRate(monitor_type));
816 0 : } else {
817 0 : event.mutable_eject_consecutive_event();
818 0 : }
819 :
820 0 : std::string json;
821 0 : #ifdef ENVOY_ENABLE_YAML
822 0 : json = MessageUtil::getJsonStringFromMessageOrError(event, /* pretty_print */ false,
823 0 : /* always_print_primitive_fields */ true);
824 : #else
825 : IS_ENVOY_BUG("attempting outlier logging with JSON support removed");
826 : #endif
827 0 : file_->write(fmt::format("{}\n", json));
828 0 : }
829 :
830 0 : void EventLoggerImpl::logUneject(const HostDescriptionConstSharedPtr& host) {
831 0 : envoy::data::cluster::v3::OutlierDetectionEvent event;
832 :
833 0 : absl::optional<MonotonicTime> time = host->outlierDetector().lastEjectionTime();
834 0 : setCommonEventParams(event, host, time);
835 :
836 0 : event.set_action(envoy::data::cluster::v3::UNEJECT);
837 :
838 0 : std::string json;
839 0 : #ifdef ENVOY_ENABLE_YAML
840 0 : json = MessageUtil::getJsonStringFromMessageOrError(event, /* pretty_print */ false,
841 0 : /* always_print_primitive_fields */ true);
842 : #else
843 : IS_ENVOY_BUG("attempting outlier logging with JSON support removed");
844 : #endif
845 0 : file_->write(fmt::format("{}\n", json));
846 0 : }
847 :
848 : void EventLoggerImpl::setCommonEventParams(envoy::data::cluster::v3::OutlierDetectionEvent& event,
849 : const HostDescriptionConstSharedPtr& host,
850 0 : absl::optional<MonotonicTime> time) {
851 0 : MonotonicTime monotonic_now = time_source_.monotonicTime();
852 0 : if (time) {
853 0 : std::chrono::seconds secsFromLastAction =
854 0 : std::chrono::duration_cast<std::chrono::seconds>(monotonic_now - time.value());
855 0 : event.mutable_secs_since_last_action()->set_value(secsFromLastAction.count());
856 0 : }
857 0 : event.set_cluster_name(host->cluster().name());
858 0 : event.set_upstream_url(host->address()->asString());
859 0 : event.set_num_ejections(host->outlierDetector().numEjections());
860 0 : TimestampUtil::systemClockToTimestamp(time_source_.systemTime(), *event.mutable_timestamp());
861 0 : }
862 :
863 0 : SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() {
864 : // Right now current is being written to and backup is not. Flush the backup and swap.
865 0 : backup_success_rate_bucket_->success_request_counter_ = 0;
866 0 : backup_success_rate_bucket_->total_request_counter_ = 0;
867 :
868 0 : current_success_rate_bucket_.swap(backup_success_rate_bucket_);
869 :
870 0 : return current_success_rate_bucket_.get();
871 0 : }
872 :
873 0 : absl::optional<std::pair<double, uint64_t>> SuccessRateAccumulator::getSuccessRateAndVolume() {
874 0 : if (!backup_success_rate_bucket_->total_request_counter_) {
875 0 : return absl::nullopt;
876 0 : }
877 :
878 0 : double success_rate = backup_success_rate_bucket_->success_request_counter_ * 100.0 /
879 0 : backup_success_rate_bucket_->total_request_counter_;
880 :
881 0 : return {{success_rate, backup_success_rate_bucket_->total_request_counter_}};
882 0 : }
883 :
884 : } // namespace Outlier
885 : } // namespace Upstream
886 : } // namespace Envoy
|