1
#include "source/extensions/load_balancing_policies/maglev/maglev_lb.h"
2

            
3
#include "envoy/config/cluster/v3/cluster.pb.h"
4

            
5
#include "source/common/runtime/runtime_features.h"
6

            
7
namespace Envoy {
8
namespace Upstream {
9
namespace {
10
491
bool shouldUseCompactTable(size_t num_hosts, uint64_t table_size) {
11
  // Don't use compact maglev on 32-bit platforms.
12
  if constexpr (!(ENVOY_BIT_ARRAY_SUPPORTED)) {
13
    return false;
14
  }
15

            
16
16
#ifdef MAGLEV_LB_FORCE_ORIGINAL_IMPL
17
16
  return false;
18
#endif
19

            
20
475
  if (num_hosts > MaglevTable::MaxNumberOfHostsForCompactMaglev) {
21
    return false;
22
  }
23

            
24
475
  constexpr size_t shared_ptr_size = sizeof(HostConstSharedPtr);
25
475
  const uint64_t original_maglev_cost = shared_ptr_size * table_size;
26
  // We might be off by a byte e.g. due to rounding down when going from bits to
27
  // bytes.
28
475
  const uint64_t compact_maglev_cost =
29
475
      shared_ptr_size * num_hosts + ((absl::bit_width(num_hosts) * table_size) / 8);
30
475
  return compact_maglev_cost < original_maglev_cost;
31
475
}
32

            
33
/**
34
 * Factory for creating the optimal Maglev table instance for the given parameters.
35
 */
36
class MaglevFactory : private Logger::Loggable<Logger::Id::upstream> {
37
public:
38
  static MaglevTableSharedPtr
39
  createMaglevTable(const NormalizedHostWeightVector& normalized_host_weights,
40
                    double max_normalized_weight, uint64_t table_size,
41
491
                    bool use_hostname_for_hashing, MaglevLoadBalancerStats& stats) {
42

            
43
491
    MaglevTableSharedPtr maglev_table;
44
491
    if (shouldUseCompactTable(normalized_host_weights.size(), table_size)) {
45
475
      maglev_table =
46
475
          std::make_shared<CompactMaglevTable>(normalized_host_weights, max_normalized_weight,
47
475
                                               table_size, use_hostname_for_hashing, stats);
48
475
      ENVOY_LOG(debug, "creating compact maglev table given table size {} and number of hosts {}",
49
475
                table_size, normalized_host_weights.size());
50
491
    } else {
51
16
      maglev_table =
52
16
          std::make_shared<OriginalMaglevTable>(normalized_host_weights, max_normalized_weight,
53
16
                                                table_size, use_hostname_for_hashing, stats);
54
16
      ENVOY_LOG(debug, "creating original maglev table given table size {} and number of hosts {}",
55
16
                table_size, normalized_host_weights.size());
56
16
    }
57

            
58
491
    return maglev_table;
59
491
  }
60
};
61

            
62
} // namespace
63

            
64
TypedMaglevLbConfig::TypedMaglevLbConfig(const CommonLbConfigProto& common_lb_config,
65
464
                                         const LegacyMaglevLbProto& lb_config) {
66
464
  LoadBalancerConfigHelper::convertHashLbConfigTo(common_lb_config, lb_config_);
67
464
  if (common_lb_config.has_locality_weighted_lb_config()) {
68
2
    lb_config_.mutable_locality_weighted_lb_config();
69
2
  }
70

            
71
464
  if (lb_config.has_table_size()) {
72
2
    *lb_config_.mutable_table_size() = lb_config.table_size();
73
2
  }
74
464
}
75

            
76
TypedMaglevLbConfig::TypedMaglevLbConfig(const MaglevLbProto& lb_config,
77
                                         Regex::Engine& regex_engine, absl::Status& creation_status)
78
42
    : TypedHashLbConfigBase(lb_config.consistent_hashing_lb_config().hash_policy(), regex_engine,
79
42
                            creation_status),
80
42
      lb_config_(lb_config) {}
81

            
82
ThreadAwareLoadBalancerBase::HashingLoadBalancerSharedPtr
83
MaglevLoadBalancer::createLoadBalancer(const NormalizedHostWeightVector& normalized_host_weights,
84
                                       double /* min_normalized_weight */,
85
491
                                       double max_normalized_weight) {
86
491
  HashingLoadBalancerSharedPtr maglev_lb =
87
491
      MaglevFactory::createMaglevTable(normalized_host_weights, max_normalized_weight, table_size_,
88
491
                                       use_hostname_for_hashing_, stats_);
89

            
90
491
  if (hash_balance_factor_ == 0) {
91
489
    return maglev_lb;
92
489
  }
93

            
94
2
  return std::make_shared<BoundedLoadHashingLoadBalancer>(
95
2
      maglev_lb, std::move(normalized_host_weights), hash_balance_factor_);
96
491
}
97

            
98
void MaglevTable::constructMaglevTableInternal(
99
    const NormalizedHostWeightVector& normalized_host_weights, double max_normalized_weight,
100
495
    bool use_hostname_for_hashing) {
101
  // We can't do anything sensible with no hosts.
102
495
  if (normalized_host_weights.empty()) {
103
26
    ENVOY_LOG(debug, "maglev: normalized hosts weights is empty, skipping building table");
104
26
    return;
105
26
  }
106

            
107
  // Prepare stable (sorted) vector of host_weight.
108
  // Maglev requires stable order of table_build_entries because the hash table will be filled in
109
  // the order. Unstable table_build_entries results the change of backend assignment.
110
469
  std::vector<std::tuple<absl::string_view, HostConstSharedPtr, double>> sorted_host_weights;
111
469
  sorted_host_weights.reserve(normalized_host_weights.size());
112
2614
  for (const auto& host_weight : normalized_host_weights) {
113
2614
    const auto& host = host_weight.first;
114
2614
    const absl::string_view key_to_hash = hashKey(host, use_hostname_for_hashing);
115
2614
    ASSERT(!key_to_hash.empty());
116

            
117
2614
    sorted_host_weights.emplace_back(key_to_hash, host_weight.first, host_weight.second);
118
2614
  }
119

            
120
469
  std::sort(sorted_host_weights.begin(), sorted_host_weights.end());
121

            
122
  // Implementation of pseudocode listing 1 in the paper (see header file for more info).
123
469
  std::vector<TableBuildEntry> table_build_entries;
124
469
  table_build_entries.reserve(normalized_host_weights.size());
125
2614
  for (const auto& sorted_host_weight : sorted_host_weights) {
126
2614
    const auto& key_to_hash = std::get<0>(sorted_host_weight);
127
2614
    const auto& host = std::get<1>(sorted_host_weight);
128
2614
    const auto& weight = std::get<2>(sorted_host_weight);
129

            
130
2614
    table_build_entries.emplace_back(host, HashUtil::xxHash64(key_to_hash) % table_size_,
131
2614
                                     (HashUtil::xxHash64(key_to_hash, 1) % (table_size_ - 1)) + 1,
132
2614
                                     weight);
133
2614
  }
134

            
135
469
  constructImplementationInternals(table_build_entries, max_normalized_weight);
136

            
137
  // Update Stats
138
469
  uint64_t min_entries_per_host = table_size_;
139
469
  uint64_t max_entries_per_host = 0;
140
2614
  for (const auto& entry : table_build_entries) {
141
2614
    min_entries_per_host = std::min(entry.count_, min_entries_per_host);
142
2614
    max_entries_per_host = std::max(entry.count_, max_entries_per_host);
143
2614
  }
144
469
  stats_.min_entries_per_host_.set(min_entries_per_host);
145
469
  stats_.max_entries_per_host_.set(max_entries_per_host);
146

            
147
469
  if (ENVOY_LOG_CHECK_LEVEL(trace)) {
148
    logMaglevTable(use_hostname_for_hashing);
149
  }
150
469
}
151

            
152
void OriginalMaglevTable::constructImplementationInternals(
153
13
    std::vector<TableBuildEntry>& table_build_entries, double max_normalized_weight) {
154
  // Size internal representation for maglev table correctly.
155
13
  table_.resize(table_size_);
156

            
157
  // Iterate through the table build entries as many times as it takes to fill up the table.
158
13
  uint64_t table_index = 0;
159
64585
  for (uint32_t iteration = 1; table_index < table_size_; ++iteration) {
160
66126022
    for (uint64_t i = 0; i < table_build_entries.size() && table_index < table_size_; i++) {
161
66061450
      TableBuildEntry& entry = table_build_entries[i];
162
      // To understand how target_weight_ and weight_ are used below, consider a host with weight
163
      // equal to max_normalized_weight. This would be picked on every single iteration. If it had
164
      // weight equal to max_normalized_weight / 3, then it would only be picked every 3 iterations,
165
      // etc.
166
66061450
      if (iteration * entry.weight_ < entry.target_weight_) {
167
65995789
        continue;
168
65995789
      }
169
65661
      entry.target_weight_ += max_normalized_weight;
170
65661
      uint64_t c = permutation(entry);
171
66787
      while (table_[c] != nullptr) {
172
1126
        entry.next_++;
173
1126
        c = permutation(entry);
174
1126
      }
175

            
176
65661
      table_[c] = entry.host_;
177
65661
      entry.next_++;
178
65661
      entry.count_++;
179
65661
      table_index++;
180
65661
    }
181
64572
  }
182
13
}
183
CompactMaglevTable::CompactMaglevTable(const NormalizedHostWeightVector& normalized_host_weights,
184
                                       double max_normalized_weight, uint64_t table_size,
185
                                       bool use_hostname_for_hashing,
186
                                       MaglevLoadBalancerStats& stats)
187
479
    : MaglevTable(table_size, stats),
188
479
      table_(absl::bit_width(normalized_host_weights.size()), table_size) {
189
479
  constructMaglevTableInternal(normalized_host_weights, max_normalized_weight,
190
479
                               use_hostname_for_hashing);
191
479
}
192

            
193
void CompactMaglevTable::constructImplementationInternals(
194
456
    std::vector<TableBuildEntry>& table_build_entries, double max_normalized_weight) {
195
  // Populate the host table. Index into table_build_entries[i] will align with
196
  // the host here.
197
456
  host_table_.reserve(table_build_entries.size());
198
1534
  for (const auto& entry : table_build_entries) {
199
1534
    host_table_.emplace_back(entry.host_);
200
1534
  }
201
456
  host_table_.shrink_to_fit();
202

            
203
  // Vector to track whether or not a given fixed width bit is set in the
204
  // BitArray used as the maglev table.
205
456
  std::vector<bool> occupied(table_size_, false);
206

            
207
  // Iterate through the table build entries as many times as it takes to fill up the table.
208
456
  uint64_t table_index = 0;
209
28540866
  for (uint32_t iteration = 1; table_index < table_size_; ++iteration) {
210
123372611
    for (uint32_t i = 0; i < table_build_entries.size() && table_index < table_size_; i++) {
211
94832201
      TableBuildEntry& entry = table_build_entries[i];
212
      // To understand how target_weight_ and weight_ are used below, consider a host with weight
213
      // equal to max_normalized_weight. This would be picked on every single iteration. If it had
214
      // weight equal to max_normalized_weight / 3, then it would only be picked every 3 iterations,
215
      // etc.
216
94832201
      if (iteration * entry.weight_ < entry.target_weight_) {
217
65995789
        continue;
218
65995789
      }
219
28836412
      entry.target_weight_ += max_normalized_weight;
220
      // As we're using the compact implementation, our table size is limited to
221
      // 32-bit, hence static_cast here should be safe.
222
28836412
      uint32_t c = static_cast<uint32_t>(permutation(entry));
223
29542860
      while (occupied[c]) {
224
706448
        entry.next_++;
225
706448
        c = static_cast<uint32_t>(permutation(entry));
226
706448
      }
227

            
228
      // Record the index of the given host.
229
28836412
      table_.set(c, i);
230
28836412
      occupied[c] = true;
231

            
232
28836412
      entry.next_++;
233
28836412
      entry.count_++;
234
28836412
      table_index++;
235
28836412
    }
236
28540410
  }
237
456
}
238

            
239
void OriginalMaglevTable::logMaglevTable(bool use_hostname_for_hashing) const {
240
  for (uint64_t i = 0; i < table_.size(); ++i) {
241
    const absl::string_view key_to_hash = hashKey(table_[i], use_hostname_for_hashing);
242
    ENVOY_LOG(trace, "maglev: i={} address={} host={}", i, table_[i]->address()->asString(),
243
              key_to_hash);
244
  }
245
}
246

            
247
4
void CompactMaglevTable::logMaglevTable(bool use_hostname_for_hashing) const {
248
12
  for (uint64_t i = 0; i < table_size_; ++i) {
249

            
250
8
    const uint32_t index = table_.get(i);
251
8
    ASSERT(index < host_table_.size(), "Compact MaglevTable index into host table out of range");
252
8
    const auto& host = host_table_[index];
253

            
254
8
    const absl::string_view key_to_hash = hashKey(host, use_hostname_for_hashing);
255
8
    ENVOY_LOG(trace, "maglev: i={} address={} host={}", i, host->address()->asString(),
256
8
              key_to_hash);
257
8
  }
258
4
}
259

            
260
MaglevTable::MaglevTable(uint64_t table_size, MaglevLoadBalancerStats& stats)
261
495
    : table_size_(table_size), stats_(stats) {}
262

            
263
65884
HostSelectionResponse OriginalMaglevTable::chooseHost(uint64_t hash, uint32_t attempt) const {
264
65884
  if (table_.empty()) {
265
2
    return {nullptr};
266
2
  }
267

            
268
65882
  if (attempt > 0) {
269
    // If a retry host predicate is being applied, mutate the hash to choose an alternate host.
270
    // By using value with most bits set for the retry attempts, we achieve a larger change in
271
    // the hash, thereby reducing the likelihood that all retries are directed to a single host.
272
3
    hash ^= ~0ULL - attempt + 1;
273
3
  }
274

            
275
65882
  return {table_[hash % table_size_]};
276
65884
}
277

            
278
66111
HostSelectionResponse CompactMaglevTable::chooseHost(uint64_t hash, uint32_t attempt) const {
279
66111
  if (host_table_.empty()) {
280
2
    return {nullptr};
281
2
  }
282

            
283
66109
  if (attempt > 0) {
284
    // If a retry host predicate is being applied, mutate the hash to choose an alternate host.
285
    // By using value with most bits set for the retry attempts, we achieve a larger change in
286
    // the hash, thereby reducing the likelihood that all retries are directed to a single host.
287
3
    hash ^= ~0ULL - attempt + 1;
288
3
  }
289

            
290
66109
  const uint32_t index = table_.get(hash % table_size_);
291
66109
  ASSERT(index < host_table_.size(), "Compact MaglevTable index into host table out of range");
292
66109
  return {host_table_[index]};
293
66111
}
294

            
295
29609647
uint64_t MaglevTable::permutation(const TableBuildEntry& entry) {
296
29609647
  return (entry.offset_ + (entry.skip_ * entry.next_)) % table_size_;
297
29609647
}
298

            
299
MaglevLoadBalancer::MaglevLoadBalancer(const PrioritySet& priority_set, ClusterLbStats& stats,
300
                                       Stats::Scope& scope, Runtime::Loader& runtime,
301
                                       Random::RandomGenerator& random,
302
                                       uint32_t healthy_panic_threshold,
303
                                       const MaglevLbProto& config, HashPolicySharedPtr hash_policy)
304
466
    : ThreadAwareLoadBalancerBase(priority_set, stats, runtime, random, healthy_panic_threshold,
305
466
                                  config.has_locality_weighted_lb_config(), std::move(hash_policy)),
306
466
      scope_(scope.createScope("maglev_lb.")), stats_(generateStats(*scope_)),
307
      table_size_(
308
466
          PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, table_size, MaglevTable::DefaultTableSize)),
309
      use_hostname_for_hashing_(
310
466
          config.has_consistent_hashing_lb_config()
311
466
              ? config.consistent_hashing_lb_config().use_hostname_for_hashing()
312
466
              : false),
313
466
      hash_balance_factor_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(config.consistent_hashing_lb_config(),
314
466
                                                           hash_balance_factor, 0)) {
315
466
  ENVOY_LOG(debug, "maglev table size: {}", table_size_);
316
  // The table size must be prime number.
317
466
  if (!Primes::isPrime(table_size_)) {
318
3
    throw EnvoyException("The table size of maglev must be prime number");
319
3
  }
320
466
}
321

            
322
468
MaglevLoadBalancerStats MaglevLoadBalancer::generateStats(Stats::Scope& scope) {
323
468
  return {ALL_MAGLEV_LOAD_BALANCER_STATS(POOL_GAUGE(scope))};
324
468
}
325

            
326
} // namespace Upstream
327
} // namespace Envoy