1
#pragma once
2

            
3
#include <atomic>
#include <chrono>
#include <cstdint>
#include <functional>
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <vector>
10

            
11
#include "envoy/access_log/access_log.h"
12
#include "envoy/common/callback.h"
13
#include "envoy/common/time.h"
14
#include "envoy/config/cluster/v3/cluster.pb.h"
15
#include "envoy/config/cluster/v3/outlier_detection.pb.h"
16
#include "envoy/data/cluster/v3/outlier_detection_event.pb.h"
17
#include "envoy/event/timer.h"
18
#include "envoy/http/codes.h"
19
#include "envoy/runtime/runtime.h"
20
#include "envoy/stats/scope.h"
21
#include "envoy/stats/stats.h"
22
#include "envoy/upstream/outlier_detection.h"
23

            
24
#include "source/common/upstream/upstream_impl.h"
25

            
26
#include "absl/container/node_hash_map.h"
27

            
28
namespace Envoy {
29
namespace Upstream {
30
namespace Outlier {
31

            
32
/**
33
 * Factory for creating a detector from a proto configuration.
34
 */
35
class DetectorImplFactory {
36
public:
37
  static absl::StatusOr<DetectorSharedPtr>
38
  createForCluster(Cluster& cluster, const envoy::config::cluster::v3::Cluster& cluster_config,
39
                   Event::Dispatcher& dispatcher, Runtime::Loader& runtime,
40
                   EventLoggerSharedPtr event_logger, Random::RandomGenerator& random);
41
};
42

            
43
/**
44
 * Thin struct to facilitate calculations for success rate outlier detection.
45
 */
46
struct HostSuccessRatePair {
47
  HostSuccessRatePair(HostSharedPtr host, double success_rate)
48
56
      : host_(host), success_rate_(success_rate) {}
49
  HostSharedPtr host_;
50
  double success_rate_;
51
};
52

            
53
/**
 * Pair of request counters making up one accumulation bucket.
 *
 * Both counters are explicitly zero-initialized: before C++20 a default-initialized std::atomic
 * holds an indeterminate value, so without the initializers only value-initialized buckets
 * (`new SuccessRateAccumulatorBucket()`) would be guaranteed to start at zero.
 */
struct SuccessRateAccumulatorBucket {
  std::atomic<uint64_t> success_request_counter_{0};
  std::atomic<uint64_t> total_request_counter_{0};
};
57

            
58
/**
59
 * The SuccessRateAccumulator uses the SuccessRateAccumulatorBucket to get per host success rate
60
 * stats. This implementation has a fixed window size of time, and thus only needs a
61
 * bucket to write to, and a bucket to accumulate/run stats over.
62
 */
63
class SuccessRateAccumulator {
64
public:
65
  SuccessRateAccumulator()
66
228
      : current_success_rate_bucket_(new SuccessRateAccumulatorBucket()),
67
228
        backup_success_rate_bucket_(new SuccessRateAccumulatorBucket()) {}
68

            
69
  /**
70
   * This function updates the bucket to write data to.
71
   * @return a pointer to the SuccessRateAccumulatorBucket.
72
   */
73
  SuccessRateAccumulatorBucket* updateCurrentWriter();
74
  /**
75
   * This function returns the success rate of a host over a window of time if the request volume is
76
   * high enough. The underlying window of time could be dynamically adjusted. In the current
77
   * implementation it is a fixed time window.
78
   * @param success_rate_request_volume the threshold of requests an accumulator has to have in
79
   *                                    order to be able to return a significant success rate value.
80
   * @return a valid absl::optional<double> with the success rate. If there were not enough
81
   * requests, an invalid absl::optional<double> is returned.
82
   */
83
  absl::optional<std::pair<double, uint64_t>> getSuccessRateAndVolume();
84

            
85
private:
86
  std::unique_ptr<SuccessRateAccumulatorBucket> current_success_rate_bucket_;
87
  std::unique_ptr<SuccessRateAccumulatorBucket> backup_success_rate_bucket_;
88
};
89

            
90
class SuccessRateMonitor {
91
public:
92
  SuccessRateMonitor(envoy::data::cluster::v3::OutlierEjectionType ejection_type)
93
228
      : ejection_type_(ejection_type) {
94
    // Point the success_rate_accumulator_bucket_ pointer to a bucket.
95
228
    updateCurrentSuccessRateBucket();
96
228
  }
97
13
  double getSuccessRate() const { return success_rate_; }
98
163
  SuccessRateAccumulator& successRateAccumulator() { return success_rate_accumulator_; }
99
271607
  void setSuccessRate(double new_success_rate) { success_rate_ = new_success_rate; }
100
271806
  void updateCurrentSuccessRateBucket() {
101
271806
    success_rate_accumulator_bucket_.store(success_rate_accumulator_.updateCurrentWriter());
102
271806
  }
103
7228
  void incTotalReqCounter() { success_rate_accumulator_bucket_.load()->total_request_counter_++; }
104
4375
  void incSuccessReqCounter() {
105
4375
    success_rate_accumulator_bucket_.load()->success_request_counter_++;
106
4375
  }
107

            
108
3
  envoy::data::cluster::v3::OutlierEjectionType getEjectionType() const { return ejection_type_; }
109

            
110
private:
111
  SuccessRateAccumulator success_rate_accumulator_;
112
  std::atomic<SuccessRateAccumulatorBucket*> success_rate_accumulator_bucket_;
113
  envoy::data::cluster::v3::OutlierEjectionType ejection_type_;
114
  double success_rate_{-1};
115
};
116

            
117
class DetectorImpl;
118

            
119
/**
120
 * Implementation of DetectorHostMonitor for the generic detector.
121
 */
122
class DetectorHostMonitorImpl : public DetectorHostMonitor {
123
public:
124
  DetectorHostMonitorImpl(std::shared_ptr<DetectorImpl> detector, HostSharedPtr host);
125

            
126
  void eject(MonotonicTime ejection_time);
127
  void uneject(MonotonicTime ejection_time);
128
  void degrade(MonotonicTime degraded_time);
129
  void undegrade(MonotonicTime degraded_time);
130

            
131
1886
  uint32_t& ejectTimeBackoff() { return eject_time_backoff_; }
132
32
  uint32_t& degradeTimeBackoff() { return degrade_time_backoff_; }
133

            
134
453
  void resetConsecutive5xx() { consecutive_5xx_ = 0; }
135
276
  void resetConsecutiveGatewayFailure() { consecutive_gateway_failure_ = 0; }
136
2446
  void resetConsecutiveLocalOriginFailure() { consecutive_local_origin_failure_ = 0; }
137
  static absl::optional<Http::Code> resultToHttpCode(Result result);
138

            
139
  // Upstream::Outlier::DetectorHostMonitor
140
  uint32_t numEjections() override { return num_ejections_; }
141
  void putResult(Result result, absl::optional<uint64_t> code) override;
142
49
  void putResponseTime(std::chrono::milliseconds) override {}
143
1548
  const absl::optional<MonotonicTime>& lastEjectionTime() override { return last_ejection_time_; }
144
134501
  const absl::optional<MonotonicTime>& lastUnejectionTime() override {
145
134501
    return last_unejection_time_;
146
134501
  }
147
10
  const absl::optional<MonotonicTime>& lastDegradedTime() const { return last_degraded_time_; }
148
135798
  const absl::optional<MonotonicTime>& lastUndegradedTime() const { return last_undegraded_time_; }
149
  uint32_t numDegradations() const { return num_degradations_; }
150

            
151
  void putHttpResponseCode(uint64_t response_code);
152

            
153
271786
  const SuccessRateMonitor& getSRMonitor(SuccessRateMonitorType type) const {
154
271786
    return (SuccessRateMonitorType::ExternalOrigin == type) ? external_origin_sr_monitor_
155
271786
                                                            : local_origin_sr_monitor_;
156
271786
  }
157

            
158
271773
  SuccessRateMonitor& getSRMonitor(SuccessRateMonitorType type) {
159
    // Call const version of the same method
160
271773
    return const_cast<SuccessRateMonitor&>(
161
271773
        const_cast<const DetectorHostMonitorImpl*>(this)->getSRMonitor(type));
162
271773
  }
163

            
164
13
  double successRate(SuccessRateMonitorType type) const override {
165
13
    return getSRMonitor(type).getSuccessRate();
166
13
  }
167
  void updateCurrentSuccessRateBucket();
168
271607
  void successRate(SuccessRateMonitorType type, double new_success_rate) {
169
271607
    getSRMonitor(type).setSuccessRate(new_success_rate);
170
271607
  }
171

            
172
  // handlers for reporting local origin errors
173
  void localOriginFailure();
174
  void localOriginNoFailure();
175

            
176
  // handlers for setting and getting jitter, used to add a random value
177
  // to outlier eject time in order to prevent a connection storm when
178
  // hosts are unejected
179
153
  void setJitter(const std::chrono::milliseconds jitter) { jitter_ = jitter; }
180
1560
  std::chrono::milliseconds getJitter() const { return jitter_; }
181

            
182
private:
183
  std::weak_ptr<DetectorImpl> detector_;
184
  std::weak_ptr<Host> host_;
185
  absl::optional<MonotonicTime> last_ejection_time_;
186
  absl::optional<MonotonicTime> last_unejection_time_;
187
  uint32_t num_ejections_{};
188
  // Determines ejection time. Each time a node is ejected,
189
  // the eject_time_backoff is incremented. The value is decremented
190
  // each time the node was healthy and not ejected.
191
  uint32_t eject_time_backoff_{};
192

            
193
  // Degradation tracking (similar to ejection)
194
  absl::optional<MonotonicTime> last_degraded_time_;
195
  absl::optional<MonotonicTime> last_undegraded_time_;
196
  uint32_t num_degradations_{};
197
  // Determines degradation time. Each time a node is degraded,
198
  // the degrade_time_backoff is incremented. The value is decremented
199
  // each time the node was healthy and not degraded.
200
  uint32_t degrade_time_backoff_{};
201

            
202
  // counters for externally generated failures
203
  std::atomic<uint32_t> consecutive_5xx_{0};
204
  std::atomic<uint32_t> consecutive_gateway_failure_{0};
205

            
206
  // counters for local origin failures
207
  std::atomic<uint32_t> consecutive_local_origin_failure_{0};
208

            
209
  // jitter for outlier ejection time
210
  std::chrono::milliseconds jitter_;
211

            
212
  // success rate monitors:
213
  // - external_origin: for all events when external/local are not split
214
  //   and for external origin failures when external/local events are split
215
  // - local origin: for local events when external/local events are split and
216
  //   not used when external/local events are not split.
217
  SuccessRateMonitor external_origin_sr_monitor_;
218
  SuccessRateMonitor local_origin_sr_monitor_;
219

            
220
  void putResultNoLocalExternalSplit(Result result, absl::optional<uint64_t> code);
221
  void putResultWithLocalExternalSplit(Result result, absl::optional<uint64_t> code);
222
  std::function<void(DetectorHostMonitorImpl*, Result, absl::optional<uint64_t> code)>
223
      put_result_func_;
224
};
225

            
226
/**
 * All outlier detection stats. @see stats_macros.h
 *
 * X-macro: COUNTER is expanded once per counter with the stat name as its only argument, and
 * GAUGE is expanded once with the stat name and the `Accumulate` token. The list holds 20
 * counters and 1 gauge. Every line except the last must end in a continuation backslash with
 * nothing after it, otherwise the macro is cut short.
 */
#define ALL_OUTLIER_DETECTION_STATS(COUNTER, GAUGE)                                                \
  COUNTER(ejections_consecutive_5xx)                                                               \
  COUNTER(ejections_detected_consecutive_5xx)                                                      \
  COUNTER(ejections_detected_consecutive_gateway_failure)                                          \
  COUNTER(ejections_detected_success_rate)                                                         \
  COUNTER(ejections_detected_failure_percentage)                                                   \
  COUNTER(ejections_enforced_consecutive_5xx)                                                      \
  COUNTER(ejections_enforced_consecutive_gateway_failure)                                          \
  COUNTER(ejections_enforced_success_rate)                                                         \
  COUNTER(ejections_enforced_failure_percentage)                                                   \
  COUNTER(ejections_detected_consecutive_local_origin_failure)                                     \
  COUNTER(ejections_enforced_consecutive_local_origin_failure)                                     \
  COUNTER(ejections_detected_local_origin_success_rate)                                            \
  COUNTER(ejections_enforced_local_origin_success_rate)                                            \
  COUNTER(ejections_detected_local_origin_failure_percentage)                                      \
  COUNTER(ejections_enforced_local_origin_failure_percentage)                                      \
  COUNTER(ejections_enforced_total)                                                                \
  COUNTER(ejections_overflow)                                                                      \
  COUNTER(ejections_success_rate)                                                                  \
  COUNTER(ejections_total)                                                                         \
  COUNTER(ejections_detected_degradation)                                                          \
  GAUGE(ejections_active, Accumulate)
251

            
252
/**
253
 * Struct definition for all outlier detection stats. @see stats_macros.h
254
 */
255
struct DetectionStats {
256
  ALL_OUTLIER_DETECTION_STATS(GENERATE_COUNTER_STRUCT, GENERATE_GAUGE_STRUCT)
257
};
258

            
259
// Names used in runtime configuration.
260
constexpr absl::string_view MaxEjectionPercentRuntime = "outlier_detection.max_ejection_percent";
261
constexpr absl::string_view ConsecutiveGatewayFailureRuntime =
262
    "outlier_detection.consecutive_gateway_failure";
263
constexpr absl::string_view Consecutive5xxRuntime = "outlier_detection.consecutive_5xx";
264
constexpr absl::string_view ConsecutiveLocalOriginFailureRuntime =
265
    "outlier_detection.consecutive_local_origin_failure";
266
constexpr absl::string_view IntervalMsRuntime = "outlier_detection.interval_ms";
267
constexpr absl::string_view BaseEjectionTimeMsRuntime = "outlier_detection.base_ejection_time_ms";
268
constexpr absl::string_view MaxEjectionTimeMsRuntime = "outlier_detection.max_ejection_time_ms";
269
constexpr absl::string_view EnforcingConsecutive5xxRuntime =
270
    "outlier_detection.enforcing_consecutive_5xx";
271
constexpr absl::string_view EnforcingConsecutiveGatewayFailureRuntime =
272
    "outlier_detection.enforcing_consecutive_gateway_failure";
273
constexpr absl::string_view EnforcingSuccessRateRuntime =
274
    "outlier_detection.enforcing_success_rate";
275
constexpr absl::string_view EnforcingConsecutiveLocalOriginFailureRuntime =
276
    "outlier_detection.enforcing_consecutive_local_origin_failure";
277
constexpr absl::string_view EnforcingLocalOriginSuccessRateRuntime =
278
    "outlier_detection.enforcing_local_origin_success_rate";
279
constexpr absl::string_view EnforcingFailurePercentageRuntime =
280
    "outlier_detection.enforcing_failure_percentage";
281
constexpr absl::string_view EnforcingFailurePercentageLocalOriginRuntime =
282
    "outlier_detection.enforcing_failure_percentage_local_origin";
283
constexpr absl::string_view SuccessRateMinimumHostsRuntime =
284
    "outlier_detection.success_rate_minimum_hosts";
285
constexpr absl::string_view SuccessRateRequestVolumeRuntime =
286
    "outlier_detection.success_rate_request_volume";
287
constexpr absl::string_view FailurePercentageMinimumHostsRuntime =
288
    "outlier_detection.failure_percentage_minimum_hosts";
289
constexpr absl::string_view FailurePercentageRequestVolumeRuntime =
290
    "outlier_detection.failure_percentage_request_volume";
291
constexpr absl::string_view SuccessRateStdevFactorRuntime =
292
    "outlier_detection.success_rate_stdev_factor";
293
constexpr absl::string_view FailurePercentageThresholdRuntime =
294
    "outlier_detection.failure_percentage_threshold";
295
constexpr absl::string_view MaxEjectionTimeJitterMsRuntime =
296
    "outlier_detection.max_ejection_time_jitter_ms";
297

            
298
/**
299
 * Configuration for the outlier detection.
300
 */
301
class DetectorConfig {
302
public:
303
  DetectorConfig(const envoy::config::cluster::v3::OutlierDetection& config);
304

            
305
135986
  uint64_t intervalMs() const { return interval_ms_; }
306
1817
  uint64_t baseEjectionTimeMs() const { return base_ejection_time_ms_; }
307
2363
  uint64_t consecutive5xx() const { return consecutive_5xx_; }
308
1259
  uint64_t consecutiveGatewayFailure() const { return consecutive_gateway_failure_; }
309
630
  uint64_t maxEjectionPercent() const { return max_ejection_percent_; }
310
629
  bool alwaysEjectOneHost() const { return always_eject_one_host_; }
311
271429
  uint64_t successRateMinimumHosts() const { return success_rate_minimum_hosts_; }
312
271429
  uint64_t successRateRequestVolume() const { return success_rate_request_volume_; }
313
4
  uint64_t successRateStdevFactor() const { return success_rate_stdev_factor_; }
314
10
  uint64_t failurePercentageThreshold() const { return failure_percentage_threshold_; }
315
271429
  uint64_t failurePercentageMinimumHosts() const { return failure_percentage_minimum_hosts_; }
316
271429
  uint64_t failurePercentageRequestVolume() const { return failure_percentage_request_volume_; }
317
346
  uint64_t enforcingConsecutive5xx() const { return enforcing_consecutive_5xx_; }
318
176
  uint64_t enforcingConsecutiveGatewayFailure() const {
319
176
    return enforcing_consecutive_gateway_failure_;
320
176
  }
321
2
  uint64_t enforcingSuccessRate() const { return enforcing_success_rate_; }
322
5
  uint64_t enforcingFailurePercentage() const { return enforcing_failure_percentage_; }
323
2
  uint64_t enforcingFailurePercentageLocalOrigin() const {
324
2
    return enforcing_failure_percentage_local_origin_;
325
2
  }
326
114
  bool splitExternalLocalOriginErrors() const { return split_external_local_origin_errors_; }
327
495
  uint64_t consecutiveLocalOriginFailure() const { return consecutive_local_origin_failure_; }
328
94
  uint64_t enforcingConsecutiveLocalOriginFailure() const {
329
94
    return enforcing_consecutive_local_origin_failure_;
330
94
  }
331
2
  uint64_t enforcingLocalOriginSuccessRate() const { return enforcing_local_origin_success_rate_; }
332
1815
  uint64_t maxEjectionTimeMs() const { return max_ejection_time_ms_; }
333
155
  uint64_t maxEjectionTimeJitterMs() const { return max_ejection_time_jitter_ms_; }
334
100
  bool successfulActiveHealthCheckUnejectHost() const {
335
100
    return successful_active_health_check_uneject_host_;
336
100
  }
337
135800
  bool detectDegraded() const { return detect_degraded_; }
338

            
339
private:
340
  const uint64_t interval_ms_;
341
  const uint64_t base_ejection_time_ms_;
342
  const uint64_t consecutive_5xx_;
343
  const uint64_t consecutive_gateway_failure_;
344
  const uint64_t max_ejection_percent_;
345
  const bool always_eject_one_host_;
346
  const uint64_t success_rate_minimum_hosts_;
347
  const uint64_t success_rate_request_volume_;
348
  const uint64_t success_rate_stdev_factor_;
349
  const uint64_t failure_percentage_threshold_;
350
  const uint64_t failure_percentage_minimum_hosts_;
351
  const uint64_t failure_percentage_request_volume_;
352
  const uint64_t enforcing_consecutive_5xx_;
353
  const uint64_t enforcing_consecutive_gateway_failure_;
354
  const uint64_t enforcing_success_rate_;
355
  const uint64_t enforcing_failure_percentage_;
356
  const uint64_t enforcing_failure_percentage_local_origin_;
357
  const bool split_external_local_origin_errors_;
358
  const uint64_t consecutive_local_origin_failure_;
359
  const uint64_t enforcing_consecutive_local_origin_failure_;
360
  const uint64_t enforcing_local_origin_success_rate_;
361
  const uint64_t max_ejection_time_ms_;
362
  const uint64_t max_ejection_time_jitter_ms_;
363
  const bool successful_active_health_check_uneject_host_;
364
  const bool detect_degraded_;
365

            
366
  static constexpr uint64_t DEFAULT_INTERVAL_MS = 10000;
367
  static constexpr uint64_t DEFAULT_BASE_EJECTION_TIME_MS = 30000;
368
  static constexpr uint64_t DEFAULT_CONSECUTIVE_5XX = 5;
369
  static constexpr uint64_t DEFAULT_CONSECUTIVE_GATEWAY_FAILURE = 5;
370
  static constexpr uint64_t DEFAULT_MAX_EJECTION_PERCENT = 10;
371
  static constexpr uint64_t DEFAULT_SUCCESS_RATE_MINIMUM_HOSTS = 5;
372
  static constexpr uint64_t DEFAULT_SUCCESS_RATE_REQUEST_VOLUME = 100;
373
  static constexpr uint64_t DEFAULT_SUCCESS_RATE_STDEV_FACTOR = 1900;
374
  static constexpr uint64_t DEFAULT_FAILURE_PERCENTAGE_THRESHOLD = 85;
375
  static constexpr uint64_t DEFAULT_FAILURE_PERCENTAGE_MINIMUM_HOSTS = 5;
376
  static constexpr uint64_t DEFAULT_FAILURE_PERCENTAGE_REQUEST_VOLUME = 50;
377
  static constexpr uint64_t DEFAULT_ENFORCING_CONSECUTIVE_5XX = 100;
378
  static constexpr uint64_t DEFAULT_ENFORCING_CONSECUTIVE_GATEWAY_FAILURE = 0;
379
  static constexpr uint64_t DEFAULT_ENFORCING_SUCCESS_RATE = 100;
380
  static constexpr uint64_t DEFAULT_ENFORCING_FAILURE_PERCENTAGE = 0;
381
  static constexpr uint64_t DEFAULT_ENFORCING_FAILURE_PERCENTAGE_LOCAL_ORIGIN = 0;
382
  static constexpr uint64_t DEFAULT_CONSECUTIVE_LOCAL_ORIGIN_FAILURE = 5;
383
  static constexpr uint64_t DEFAULT_ENFORCING_CONSECUTIVE_LOCAL_ORIGIN_FAILURE = 100;
384
  static constexpr uint64_t DEFAULT_ENFORCING_LOCAL_ORIGIN_SUCCESS_RATE = 100;
385
  static constexpr uint64_t DEFAULT_MAX_EJECTION_TIME_MS = 10 * DEFAULT_BASE_EJECTION_TIME_MS;
386
  static constexpr uint64_t DEFAULT_MAX_EJECTION_TIME_JITTER_MS = 0;
387
};
388

            
389
/**
390
 * An implementation of an outlier detector. In the future we may support multiple outlier detection
391
 * implementations with different configuration. For now, as we iterate everything is contained
392
 * within this implementation.
393
 */
394
class DetectorImpl : public Detector, public std::enable_shared_from_this<DetectorImpl> {
395
public:
396
  static absl::StatusOr<std::shared_ptr<DetectorImpl>>
397
  create(Cluster& cluster, const envoy::config::cluster::v3::OutlierDetection& config,
398
         Event::Dispatcher& dispatcher, Runtime::Loader& runtime, TimeSource& time_source,
399
         EventLoggerSharedPtr event_logger, Random::RandomGenerator& random);
400
  ~DetectorImpl() override;
401

            
402
  void onConsecutive5xx(HostSharedPtr host);
403
  void onConsecutiveGatewayFailure(HostSharedPtr host);
404
  void onConsecutiveLocalOriginFailure(HostSharedPtr host);
405
4105
  Runtime::Loader& runtime() { return runtime_; }
406
4460
  DetectorConfig& config() { return config_; }
407
  void unejectHost(HostSharedPtr host);
408
  void setHostDegraded(HostSharedPtr host);
409

            
410
  // Upstream::Outlier::Detector
411
140
  void addChangedStateCb(ChangeStateCb cb) override { callbacks_.push_back(cb); }
412
  double
413
11
  successRateAverage(DetectorHostMonitor::SuccessRateMonitorType monitor_type) const override {
414
11
    return getSRNums(monitor_type).success_rate_average_;
415
11
  }
416
  double successRateEjectionThreshold(
417
11
      DetectorHostMonitor::SuccessRateMonitorType monitor_type) const override {
418
11
    return getSRNums(monitor_type).ejection_threshold_;
419
11
  }
420

            
421
  /**
422
   * This function returns pair of double values for success rate outlier detection. The pair
423
   * contains the average success rate of all valid hosts in the cluster and the ejection threshold.
424
   * If a host's success rate is under this threshold, the host is an outlier.
425
   * @param success_rate_sum is the sum of the data in the success_rate_data vector.
426
   * @param valid_success_rate_hosts is the vector containing the individual success rate data
427
   *        points.
428
   * @return EjectionPair
429
   */
430
  struct EjectionPair {
431
    double success_rate_average_; // average success rate of all valid hosts in the cluster
432
    double ejection_threshold_;   // ejection threshold for the cluster
433
  };
434
  static EjectionPair
435
  successRateEjectionThreshold(double success_rate_sum,
436
                               const std::vector<HostSuccessRatePair>& valid_success_rate_hosts,
437
                               double success_rate_stdev_factor);
438

            
439
2
  const absl::node_hash_map<HostSharedPtr, DetectorHostMonitorImpl*>& getHostMonitors() {
440
2
    return host_monitors_;
441
2
  }
442

            
443
private:
444
  DetectorImpl(const Cluster& cluster, const envoy::config::cluster::v3::OutlierDetection& config,
445
               Event::Dispatcher& dispatcher, Runtime::Loader& runtime, TimeSource& time_source,
446
               EventLoggerSharedPtr event_logger, Random::RandomGenerator& random);
447

            
448
  void addHostMonitor(HostSharedPtr host);
449
  void armIntervalTimer();
450
  void checkHostForUneject(HostSharedPtr host, DetectorHostMonitorImpl* monitor, MonotonicTime now);
451
  void checkHostForUndegrade(HostSharedPtr host, DetectorHostMonitorImpl* monitor,
452
                             MonotonicTime now);
453
  void ejectHost(HostSharedPtr host, envoy::data::cluster::v3::OutlierEjectionType type);
454
  static DetectionStats generateStats(Stats::Scope& scope);
455
  void initialize(Cluster& cluster);
456
  void onConsecutiveErrorWorker(HostSharedPtr host,
457
                                envoy::data::cluster::v3::OutlierEjectionType type);
458
  void notifyMainThreadConsecutiveError(HostSharedPtr host,
459
                                        envoy::data::cluster::v3::OutlierEjectionType type);
460
  void notifyMainThreadHostDegraded(HostSharedPtr host);
461
  void setHostDegradedMainThread(HostSharedPtr host);
462
  void onIntervalTimer();
463
  void runCallbacks(HostSharedPtr host);
464
  bool enforceEjection(envoy::data::cluster::v3::OutlierEjectionType type);
465
  void updateEnforcedEjectionStats(envoy::data::cluster::v3::OutlierEjectionType type);
466
  void updateDetectedEjectionStats(envoy::data::cluster::v3::OutlierEjectionType type);
467
  void processSuccessRateEjections(DetectorHostMonitor::SuccessRateMonitorType monitor_type);
468

            
469
  // The helper to double write value and gauge. The gauge could be null value since because any
470
  // stat might be deactivated.
471
  class EjectionsActiveHelper {
472
  public:
473
101
    EjectionsActiveHelper(Envoy::Stats::Gauge& gauge) : ejections_active_ref_(gauge) {}
474
143
    void inc() {
475
143
      ejections_active_ref_.inc();
476
143
      ++ejections_active_value_;
477
143
    }
478
143
    void dec() {
479
143
      ejections_active_ref_.dec();
480
143
      --ejections_active_value_;
481
143
    }
482
632
    uint64_t value() { return ejections_active_value_.load(); }
483
    Envoy::Stats::Gauge& ejections_active_ref_;
484
    std::atomic<uint64_t> ejections_active_value_{0};
485
  };
486
  DetectorConfig config_;
487
  Event::Dispatcher& dispatcher_;
488
  Runtime::Loader& runtime_;
489
  TimeSource& time_source_;
490
  DetectionStats stats_;
491
  EjectionsActiveHelper ejections_active_helper_{stats_.ejections_active_};
492
  Event::TimerPtr interval_timer_;
493
  std::list<ChangeStateCb> callbacks_;
494
  absl::node_hash_map<HostSharedPtr, DetectorHostMonitorImpl*> host_monitors_;
495
  EventLoggerSharedPtr event_logger_;
496
  Common::CallbackHandlePtr member_update_cb_;
497
  Random::RandomGenerator& random_generator_;
498

            
499
  // EjectionPair for external and local origin events.
500
  // When external/local origin events are not split, external_origin_sr_num_ are used for
501
  // both types of events: external and local. local_origin_sr_num_ is not used.
502
  // When external/local origin events are split, external_origin_sr_num_ are used only
503
  // for external events and local_origin_sr_num_ is used for local origin events.
504
  EjectionPair external_origin_sr_num_;
505
  EjectionPair local_origin_sr_num_;
506

            
507
271456
  const EjectionPair& getSRNums(DetectorHostMonitor::SuccessRateMonitorType monitor_type) const {
508
271456
    return (DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin == monitor_type)
509
271456
               ? external_origin_sr_num_
510
271456
               : local_origin_sr_num_;
511
271456
  }
512
271434
  EjectionPair& getSRNums(DetectorHostMonitor::SuccessRateMonitorType monitor_type) {
513
271434
    return const_cast<EjectionPair&>(
514
271434
        static_cast<const DetectorImpl&>(*this).getSRNums(monitor_type));
515
271434
  }
516
};
517

            
518
class EventLoggerImpl : public EventLogger {
519
public:
520
  static absl::StatusOr<std::unique_ptr<EventLoggerImpl>>
521
  create(AccessLog::AccessLogManager& log_manager, const std::string& file_name,
522
4
         TimeSource& time_source) {
523
4
    auto file_or_error = log_manager.createAccessLog(
524
4
        Filesystem::FilePathAndType{Filesystem::DestinationType::File, file_name});
525
4
    RETURN_IF_NOT_OK_REF(file_or_error.status());
526
4
    return std::unique_ptr<EventLoggerImpl>(
527
4
        new EventLoggerImpl(std::move(*file_or_error), time_source));
528
4
  }
529
  // Upstream::Outlier::EventLogger
530
  void logEject(const HostDescriptionConstSharedPtr& host, Detector& detector,
531
                envoy::data::cluster::v3::OutlierEjectionType type, bool enforced) override;
532

            
533
  void logUneject(const HostDescriptionConstSharedPtr& host) override;
534

            
535
protected:
536
  EventLoggerImpl(AccessLog::AccessLogFileSharedPtr&& file, TimeSource& time_source)
537
4
      : file_(std::move(file)), time_source_(time_source) {}
538

            
539
private:
540
  void setCommonEventParams(envoy::data::cluster::v3::OutlierDetectionEvent& event,
541
                            const HostDescriptionConstSharedPtr& host,
542
                            absl::optional<MonotonicTime> time);
543

            
544
  AccessLog::AccessLogFileSharedPtr file_;
545
  TimeSource& time_source_;
546
};
547

            
548
} // namespace Outlier
549
} // namespace Upstream
550
} // namespace Envoy