Coverage Report

Created: 2025-10-26 07:13

/src/rocksdb/memtable/hash_linklist_rep.cc
Note: every instrumented line in this file reports an execution count of 0; the file is entirely uncovered in this run.
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//

#include <algorithm>
#include <atomic>

#include "db/memtable.h"
#include "memory/arena.h"
#include "memtable/skiplist.h"
#include "monitoring/histogram.h"
#include "port/port.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/utilities/options_type.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {
namespace {

using Key = const char*;
using MemtableSkipList = SkipList<Key, const MemTableRep::KeyComparator&>;
using Pointer = std::atomic<void*>;

// A data structure used as the header of a link list of a hash bucket.
struct BucketHeader {
  Pointer next;
  std::atomic<uint32_t> num_entries;

  explicit BucketHeader(void* n, uint32_t count)
      : next(n), num_entries(count) {}

  bool IsSkipListBucket() {
    return next.load(std::memory_order_relaxed) == this;
  }

  uint32_t GetNumEntries() const {
    return num_entries.load(std::memory_order_relaxed);
  }

  // REQUIRES: called from single-threaded Insert()
  void IncNumEntries() {
    // Only one thread writes at a time, so there is no need for an atomic
    // increment; a relaxed load and store suffice.
    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
  }
};

// A data structure used as the header of a skip list of a hash bucket.
struct SkipListBucketHeader {
  BucketHeader Counting_header;
  MemtableSkipList skip_list;

  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
                                Allocator* allocator, uint32_t count)
      : Counting_header(this,  // Pointing to itself to indicate header type.
                        count),
        skip_list(cmp, allocator) {}
};

struct Node {
  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next() {
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return next_.load(std::memory_order_acquire);
  }
  void SetNext(Node* x) {
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_.store(x, std::memory_order_release);
  }
  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() { return next_.load(std::memory_order_relaxed); }

  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }

  // Needed for the placement new below, which is fine.
  Node() = default;

 private:
  std::atomic<Node*> next_;

  // Prohibit copying due to the variable-length 'key' field below.
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;

 public:
  char key[1];
};
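// --- Illustrative sketch (not part of the original file) ---
// The publication pattern the barriers above enable: a writer fully
// initializes a node's key bytes, links it with a relaxed store (not yet
// visible to readers), then publishes it through a release store; a reader
// that reaches the node via the acquire load in Next() is guaranteed to see
// the initialized key. 'PublishNodeAfter' is a hypothetical helper.
inline void PublishNodeAfter(Node* prev, Node* fresh, Node* tail) {
  fresh->NoBarrier_SetNext(tail);  // link first; no reader can see it yet
  prev->SetNext(fresh);            // release store makes 'fresh' visible
}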

// Memory structure of the mem table:
// It is a hash table in which each bucket points to one entry, a linked
// list, or a skip list. To track the total number of records in a bucket,
// so that we can decide when to switch it to a skip list, a header is
// added that records the number of entries in the bucket.
//
//
//          +-----> NULL    Case 1. Empty bucket
//          |
//          |
//          | +---> +-------+
//          | |     | Next  +--> NULL
//          | |     +-------+
//  +-----+ | |     |       |  Case 2. One Entry in bucket.
//  |     +-+ |     | Data  |          next pointer points to
//  +-----+   |     |       |          NULL. All other cases
//  |     |   |     |       |          next pointer is not NULL.
//  +-----+   |     +-------+
//  |     +---+
//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
//  +-----+     |   +-------+     +-------+      +-------+
//  |     +-----+   | Count |     |       |      |       |
//  +-----+         +-------+     | Data  |      | Data  |
//  |     |                       |       |      |       |
//  +-----+          Case 3.      |       |      |       |
//  |     |          A header     +-------+      +-------+
//  +-----+          points to
//  |     |          a linked list. Count indicates the total number
//  +-----+          of rows in this bucket.
//  |     |
//  +-----+    +-> +-------+ <--+
//  |     |    |   | Next  +----+
//  +-----+    |   +-------+   Case 4. A header points to a skip
//  |     +----+   | Count |           list and the next pointer points to
//  +-----+        +-------+           itself, to distinguish case 3 from 4.
//  |     |        |       |           Count is still kept, indicating the
//  +-----+        | Skip +-->         total number of entries in the bucket,
//  |     |        | List  |   Data    for debugging purposes.
//  |     |        |      +-->
//  +-----+        |       |
//  |     |        +-------+
//  +-----+
//
// We don't have data races when changing cases because:
// (1) When changing from case 2->3, we create a new bucket header, put the
//     single node there first without changing the original node, and do a
//     release store when changing the bucket pointer. A reader who sees a
//     stale value of the bucket pointer will still read a consistent node,
//     while a reader who sees the new value sees the fully initialized
//     header because of the release store.
// (2) When changing from case 3->4, a new header is created with the skip
//     list pointing to the data, before doing a release store to change the
//     bucket pointer. The old header and nodes are never changed, so any
//     reader that sees any of those existing pointers is guaranteed to be
//     able to iterate to the end of the linked list.
// (3) The header's next pointer in case 3 might change, but it is never
//     equal to the header itself, so no matter whether a reader sees a
//     stale or a newer value, it can correctly distinguish case 3 from
//     case 4.
//
// The reason we use case 2 is to keep the format efficient when bucket
// utilization is relatively low. If we used case 3 for single-entry
// buckets, we would waste 12 bytes for every entry, which can be a
// significant decrease in memory utilization.
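// --- Illustrative sketch (not part of the original file) ---
// How a reader distinguishes the four cases above using the two loads the
// comment describes. This mirrors the logic of GetLinkListFirstNode()
// further down; 'BucketCase' and 'ClassifyBucket' are hypothetical names.
enum class BucketCase { kEmpty, kSingleNode, kLinkedList, kSkipList };

inline BucketCase ClassifyBucket(Pointer& bucket) {
  void* first = bucket.load(std::memory_order_acquire);
  if (first == nullptr) {
    return BucketCase::kEmpty;  // Case 1: bucket pointer is NULL
  }
  auto* first_next = static_cast<Pointer*>(first);
  if (first_next->load(std::memory_order_relaxed) == nullptr) {
    return BucketCase::kSingleNode;  // Case 2: a Node whose next is NULL
  }
  // A non-NULL next means the first element is a BucketHeader; a header
  // whose next points to itself marks a skip-list bucket (case 4). (The
  // real code re-reads the bucket pointer here to handle the 2->3 race.)
  auto* header = reinterpret_cast<BucketHeader*>(first_next);
  return header->IsSkipListBucket() ? BucketCase::kSkipList
                                    : BucketCase::kLinkedList;
}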
class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(const MemTableRep::KeyComparator& compare,
                  Allocator* allocator, const SliceTransform* transform,
                  size_t bucket_size, uint32_t threshold_use_skiplist,
                  size_t huge_page_tlb_size, Logger* logger,
                  int bucket_entries_logging_threshold,
                  bool if_log_bucket_dist_when_flash);

  KeyHandle Allocate(const size_t len, char** buf) override;

  void Insert(KeyHandle handle) override;

  bool Contains(const char* key) const override;

  size_t ApproximateMemoryUsage() override;

  void Get(const LookupKey& k, void* callback_args,
           bool (*callback_func)(void* arg, const char* entry)) override;

  ~HashLinkListRep() override;

  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;

  MemTableRep::Iterator* GetDynamicPrefixIterator(
      Arena* arena = nullptr) override;

 private:
  friend class DynamicIterator;

  size_t bucket_size_;

  // Maps slices (which are transformed user keys) to buckets of keys sharing
  // the same transform.
  Pointer* buckets_;

  const uint32_t threshold_use_skiplist_;

  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

  const MemTableRep::KeyComparator& compare_;

  Logger* logger_;
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;

  bool LinkListContains(Node* head, const Slice& key) const;

  bool IsEmptyBucket(Pointer& bucket_pointer) const {
    return bucket_pointer.load(std::memory_order_acquire) == nullptr;
  }

  // Precondition: GetLinkListFirstNode() must have been called first and
  // returned nullptr, which implies the bucket holds a skip list.
  SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;

  // Returning nullptr indicates it is a skip list bucket.
  Node* GetLinkListFirstNode(Pointer& bucket_pointer) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
  }

  size_t GetHash(const Slice& slice) const {
    return GetSliceRangedNPHash(slice, bucket_size_);
  }

  Pointer& GetBucket(size_t i) const { return buckets_[i]; }

  Pointer& GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

  bool Equal(const Slice& a, const Key& b) const {
    return (compare_(b, a) == 0);
  }

  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
  }

  bool KeyIsAfterNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) < 0);
  }

  bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) <= 0);
  }

  bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) <= 0);
  }

  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
  Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const;

  class FullListIterator : public MemTableRep::Iterator {
   public:
    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
        : iter_(list), full_list_(list), allocator_(allocator) {}

    ~FullListIterator() override = default;

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return iter_.Valid(); }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key, const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.Seek(encoded_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& internal_key,
                     const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.SeekForPrev(encoded_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override { iter_.SeekToFirst(); }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override { iter_.SeekToLast(); }

   private:
    MemtableSkipList::Iterator iter_;
    // To destruct with the iterator.
    std::unique_ptr<MemtableSkipList> full_list_;
    std::unique_ptr<Allocator> allocator_;
    std::string tmp_;  // For passing to EncodeKey
  };

  class LinkListIterator : public MemTableRep::Iterator {
   public:
    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
                              Node* head)
        : hash_link_list_rep_(hash_link_list_rep),
          head_(head),
          node_(nullptr) {}

    ~LinkListIterator() override = default;

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return node_ != nullptr; }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return node_->key;
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      node_ = node_->Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to an invalid state.
      Reset(nullptr);
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key,
              const char* /*memtable_key*/) override {
      node_ =
          hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& /*internal_key*/,
                     const char* /*memtable_key*/) override {
      // Since we do not support Prev(),
      // we simply do not support SeekForPrev.
      Reset(nullptr);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to an invalid state.
      Reset(nullptr);
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to an invalid state.
      Reset(nullptr);
    }

   protected:
    void Reset(Node* head) {
      head_ = head;
      node_ = nullptr;
    }

   private:
    friend class HashLinkListRep;
    const HashLinkListRep* const hash_link_list_rep_;
    Node* head_;
    Node* node_;

    virtual void SeekToHead() { node_ = head_; }
  };

  class DynamicIterator : public HashLinkListRep::LinkListIterator {
   public:
    explicit DynamicIterator(HashLinkListRep& memtable_rep)
        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
          memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    void Seek(const Slice& k, const char* memtable_key) override {
      auto transformed = memtable_rep_.GetPrefix(k);
      Pointer& bucket = memtable_rep_.GetBucket(transformed);

      if (memtable_rep_.IsEmptyBucket(bucket)) {
        skip_list_iter_.reset();
        Reset(nullptr);
      } else {
        Node* first_linked_list_node =
            memtable_rep_.GetLinkListFirstNode(bucket);
        if (first_linked_list_node != nullptr) {
          // The bucket is organized as a linked list
          skip_list_iter_.reset();
          Reset(first_linked_list_node);
          HashLinkListRep::LinkListIterator::Seek(k, memtable_key);

        } else {
          SkipListBucketHeader* skip_list_header =
              memtable_rep_.GetSkipListBucketHeader(bucket);
          assert(skip_list_header != nullptr);
          // The bucket is organized as a skip list
          if (!skip_list_iter_) {
            skip_list_iter_.reset(
                new MemtableSkipList::Iterator(&skip_list_header->skip_list));
          } else {
            skip_list_iter_->SetList(&skip_list_header->skip_list);
          }
          if (memtable_key != nullptr) {
            skip_list_iter_->Seek(memtable_key);
          } else {
            IterKey encoded_key;
            encoded_key.EncodeLengthPrefixedKey(k);
            skip_list_iter_->Seek(encoded_key.GetUserKey().data());
          }
        }
      }
    }

    bool Valid() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->Valid();
      }
      return HashLinkListRep::LinkListIterator::Valid();
    }

    const char* key() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->key();
      }
      return HashLinkListRep::LinkListIterator::key();
    }

    void Next() override {
      if (skip_list_iter_) {
        skip_list_iter_->Next();
      } else {
        HashLinkListRep::LinkListIterator::Next();
      }
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
  };

  class EmptyIterator : public MemTableRep::Iterator {
    // This is used when there wasn't a bucket. It is cheaper than
    // instantiating an empty bucket over which to iterate.
   public:
    EmptyIterator() = default;
    bool Valid() const override { return false; }
    const char* key() const override {
      assert(false);
      return nullptr;
    }
    void Next() override {}
    void Prev() override {}
    void Seek(const Slice& /*user_key*/,
              const char* /*memtable_key*/) override {}
    void SeekForPrev(const Slice& /*user_key*/,
                     const char* /*memtable_key*/) override {}
    void SeekToFirst() override {}
    void SeekToLast() override {}

   private:
  };
};

HashLinkListRep::HashLinkListRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, size_t bucket_size,
    uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash)
    : MemTableRep(allocator),
      bucket_size_(bucket_size),
      // A skip-list threshold below 3 doesn't make sense, so we force a
      // minimum of 3 to simplify the implementation.
      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
      transform_(transform),
      compare_(compare),
      logger_(logger),
      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
                                          huge_page_tlb_size, logger);

  buckets_ = new (mem) Pointer[bucket_size];

  for (size_t i = 0; i < bucket_size_; ++i) {
    buckets_[i].store(nullptr, std::memory_order_relaxed);
  }
}

HashLinkListRep::~HashLinkListRep() = default;

KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
  Node* x = new (mem) Node();
  *buf = x->key;
  return static_cast<void*>(x);
}

SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
    Pointer& bucket_pointer) const {
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
  assert(first_next_pointer != nullptr);
  assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr);

  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  assert(header->IsSkipListBucket());
  assert(header->GetNumEntries() > threshold_use_skiplist_);
  auto* skip_list_bucket_header =
      reinterpret_cast<SkipListBucketHeader*>(header);
  assert(skip_list_bucket_header->Counting_header.next.load(
             std::memory_order_relaxed) == header);
  return skip_list_bucket_header;
}

Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const {
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
  assert(first_next_pointer != nullptr);
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return reinterpret_cast<Node*>(first_next_pointer);
  }

  // It is possible that, after we fetched first_next_pointer, it was
  // modified so that its next is no longer null. In that case the bucket
  // must have been converted to one with a counting header, so we reload
  // first_next_pointer to make sure we see the update.
  first_next_pointer =
      static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (!header->IsSkipListBucket()) {
    assert(header->GetNumEntries() <= threshold_use_skiplist_);
    return reinterpret_cast<Node*>(
        header->next.load(std::memory_order_acquire));
  }
  assert(header->GetNumEntries() > threshold_use_skiplist_);
  return nullptr;
}

void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));

  if (first_next_pointer == nullptr) {
    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" via the bucket pointer.
    x->NoBarrier_SetNext(nullptr);
    bucket.store(x, std::memory_order_release);
    return;
  }

  BucketHeader* header = nullptr;
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Case 2. only one entry in the bucket.
    // Need to convert it to a counting bucket, i.e. case 3.
    Node* first = reinterpret_cast<Node*>(first_next_pointer);
    // Need to add a bucket header.
    // We have to convert it to a bucket with a header before inserting the
    // new node. Otherwise, we might need to change the next pointer of
    // first. In that case, a reader might see a non-NULL next pointer and
    // wrongly interpret the node as a bucket header.
    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
    header = new (mem) BucketHeader(first, 1);
    bucket.store(header, std::memory_order_release);
  } else {
    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
    if (header->IsSkipListBucket()) {
      // Case 4. Bucket is already a skip list
      assert(header->GetNumEntries() > threshold_use_skiplist_);
      auto* skip_list_bucket_header =
          reinterpret_cast<SkipListBucketHeader*>(header);
      // Only one thread can execute Insert() at a time, so there is no need
      // for an atomic increment.
      skip_list_bucket_header->Counting_header.IncNumEntries();
      skip_list_bucket_header->skip_list.Insert(x->key);
      return;
    }
  }

  if (bucket_entries_logging_threshold_ > 0 &&
      header->GetNumEntries() ==
          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
    Info(logger_,
         "HashLinkedList bucket %" ROCKSDB_PRIszt
         " has more than %d "
         "entries. Key to insert: %s",
         GetHash(transformed), header->GetNumEntries(),
         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
  }

  if (header->GetNumEntries() == threshold_use_skiplist_) {
    // Case 3. number of entries reaches the threshold so we need to convert
    // to a skip list.
    LinkListIterator bucket_iter(
        this, reinterpret_cast<Node*>(
                  first_next_pointer->load(std::memory_order_relaxed)));
    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
    SkipListBucketHeader* new_skip_list_header = new (mem)
        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
    auto& skip_list = new_skip_list_header->skip_list;

    // Add all current entries to the skip list
    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
      skip_list.Insert(bucket_iter.key());
    }

    // insert the new entry
    skip_list.Insert(x->key);
    // Set the bucket
    bucket.store(new_skip_list_header, std::memory_order_release);
  } else {
    // Case 5. Need to insert to the sorted linked list without changing the
    // header.
    Node* first =
        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
    assert(first != nullptr);
    // Advance the counter, unless the bucket is being converted to a skip
    // list; in that case the previous count must never exceed
    // threshold_use_skiplist_, so readers do not cast the header to the
    // wrong format.
    header->IncNumEntries();

    Node* cur = first;
    Node* prev = nullptr;
    while (true) {
      if (cur == nullptr) {
        break;
      }
      Node* next = cur->Next();
      // Make sure the list is sorted.
      // If cur is the first node or next is nullptr, this is trivially
      // satisfied.
      assert((cur == first) || (next == nullptr) ||
             KeyIsAfterNode(next->key, cur));
      if (KeyIsAfterNode(internal_key, cur)) {
        // Keep searching in this list
        prev = cur;
        cur = next;
      } else {
        break;
      }
    }

    // Our data structure does not allow duplicate insertion
    assert(cur == nullptr || !Equal(x->key, cur->key));

    // NoBarrier_SetNext() suffices since we will add a barrier when we
    // publish a pointer to "x" via prev->SetNext() or the header's next
    // pointer.
    x->NoBarrier_SetNext(cur);

    if (prev) {
      prev->SetNext(x);
    } else {
      header->next.store(static_cast<void*>(x), std::memory_order_release);
    }
  }
}

bool HashLinkListRep::Contains(const char* key) const {
  Slice internal_key = GetLengthPrefixedSlice(key);

  auto transformed = GetPrefix(internal_key);
  Pointer& bucket = GetBucket(transformed);
  if (IsEmptyBucket(bucket)) {
    return false;
  }

  Node* linked_list_node = GetLinkListFirstNode(bucket);
  if (linked_list_node != nullptr) {
    return LinkListContains(linked_list_node, internal_key);
  }

  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    return skip_list_header->skip_list.Contains(key);
  }
  return false;
}

size_t HashLinkListRep::ApproximateMemoryUsage() {
  // Memory is always allocated from the allocator.
  return 0;
}

void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                          bool (*callback_func)(void* arg, const char* entry)) {
  auto transformed = transform_->Transform(k.user_key());
  Pointer& bucket = GetBucket(transformed);

  if (IsEmptyBucket(bucket)) {
    return;
  }

  auto* link_list_head = GetLinkListFirstNode(bucket);
  if (link_list_head != nullptr) {
    LinkListIterator iter(this, link_list_head);
    for (iter.Seek(k.internal_key(), nullptr);
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
  } else {
    auto* skip_list_header = GetSkipListBucketHeader(bucket);
    if (skip_list_header != nullptr) {
      // Is a skip list
      MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
      for (iter.Seek(k.memtable_key().data());
           iter.Valid() && callback_func(callback_args, iter.key());
           iter.Next()) {
      }
    }
  }
}
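
// --- Illustrative sketch (not part of the original file) ---
// Shape of a Get() callback: it is invoked once per entry, starting at the
// first entry >= the lookup key within the bucket, and iteration stops as
// soon as it returns false. 'CountFirstNArgs' and 'CountFirstN' are
// hypothetical example names.
struct CountFirstNArgs {
  int remaining;
};

static bool CountFirstN(void* arg, const char* /*entry*/) {
  auto* args = static_cast<CountFirstNArgs*>(arg);
  return --args->remaining > 0;  // keep iterating while budget remains
}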

MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
  // allocate a new arena of similar size to the one currently in use
  Arena* new_arena = new Arena(allocator_->BlockSize());
  auto list = new MemtableSkipList(compare_, new_arena);
  HistogramImpl keys_per_bucket_hist;

  for (size_t i = 0; i < bucket_size_; ++i) {
    int count = 0;
    Pointer& bucket = GetBucket(i);
    if (!IsEmptyBucket(bucket)) {
      auto* link_list_head = GetLinkListFirstNode(bucket);
      if (link_list_head != nullptr) {
        LinkListIterator itr(this, link_list_head);
        for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
          list->Insert(itr.key());
          count++;
        }
      } else {
        auto* skip_list_header = GetSkipListBucketHeader(bucket);
        assert(skip_list_header != nullptr);
        // Is a skip list
        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
          list->Insert(itr.key());
          count++;
        }
      }
    }
    if (if_log_bucket_dist_when_flash_) {
      keys_per_bucket_hist.Add(count);
    }
  }
  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
    Info(logger_, "hashLinkedList Entry distribution among buckets: %s",
         keys_per_bucket_hist.ToString().c_str());
  }

  if (alloc_arena == nullptr) {
    return new FullListIterator(list, new_arena);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
    return new (mem) FullListIterator(list, new_arena);
  }
}

MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
    Arena* alloc_arena) {
  if (alloc_arena == nullptr) {
    return new DynamicIterator(*this);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
    return new (mem) DynamicIterator(*this);
  }
}

bool HashLinkListRep::LinkListContains(Node* head,
                                       const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
}

Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
                                                  const Slice& key) const {
  Node* x = head;
  while (true) {
    if (x == nullptr) {
      return x;
    }
    Node* next = x->Next();
    // Make sure the list is sorted.
    // If x is head or next is nullptr, this is trivially satisfied.
    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, x)) {
      // Keep searching in this list
      x = next;
    } else {
      break;
    }
  }
  return x;
}

struct HashLinkListRepOptions {
  static const char* kName() { return "HashLinkListRepFactoryOptions"; }
  size_t bucket_count;
  uint32_t threshold_use_skiplist;
  size_t huge_page_tlb_size;
  int bucket_entries_logging_threshold;
  bool if_log_bucket_dist_when_flash;
};

static std::unordered_map<std::string, OptionTypeInfo> hash_linklist_info = {
    {"bucket_count",
     {offsetof(struct HashLinkListRepOptions, bucket_count), OptionType::kSizeT,
      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
    {"threshold",
     {offsetof(struct HashLinkListRepOptions, threshold_use_skiplist),
      OptionType::kUInt32T, OptionVerificationType::kNormal,
      OptionTypeFlags::kNone}},
    {"huge_page_size",
     {offsetof(struct HashLinkListRepOptions, huge_page_tlb_size),
      OptionType::kSizeT, OptionVerificationType::kNormal,
      OptionTypeFlags::kNone}},
    {"logging_threshold",
     {offsetof(struct HashLinkListRepOptions, bucket_entries_logging_threshold),
      OptionType::kInt, OptionVerificationType::kNormal,
      OptionTypeFlags::kNone}},
    {"log_when_flash",
     {offsetof(struct HashLinkListRepOptions, if_log_bucket_dist_when_flash),
      OptionType::kBoolean, OptionVerificationType::kNormal,
      OptionTypeFlags::kNone}},
};
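
// --- Illustrative sketch (not part of the original file) ---
// The option map above lets the factory be configured from a string. A
// hedged usage example, assuming MemTableRepFactory::CreateFromString()
// from rocksdb/memtablerep.h; the exact option string is an assumption
// based on the map keys above, not taken from this file:
//
//   std::unique_ptr<MemTableRepFactory> factory;
//   Status s = MemTableRepFactory::CreateFromString(
//       config_options,
//       "id=hash_linkedlist;bucket_count=50000;threshold=256", &factory);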

class HashLinkListRepFactory : public MemTableRepFactory {
 public:
  explicit HashLinkListRepFactory(size_t bucket_count,
                                  uint32_t threshold_use_skiplist,
                                  size_t huge_page_tlb_size,
                                  int bucket_entries_logging_threshold,
                                  bool if_log_bucket_dist_when_flash) {
    options_.bucket_count = bucket_count;
    options_.threshold_use_skiplist = threshold_use_skiplist;
    options_.huge_page_tlb_size = huge_page_tlb_size;
    options_.bucket_entries_logging_threshold =
        bucket_entries_logging_threshold;
    options_.if_log_bucket_dist_when_flash = if_log_bucket_dist_when_flash;
    RegisterOptions(&options_, &hash_linklist_info);
  }

  using MemTableRepFactory::CreateMemTableRep;
  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& compare,
                                 Allocator* allocator,
                                 const SliceTransform* transform,
                                 Logger* logger) override;

  static const char* kClassName() { return "HashLinkListRepFactory"; }
  static const char* kNickName() { return "hash_linkedlist"; }
  const char* Name() const override { return kClassName(); }
  const char* NickName() const override { return kNickName(); }

 private:
  HashLinkListRepOptions options_;
};

}  // namespace

MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, Logger* logger) {
  return new HashLinkListRep(
      compare, allocator, transform, options_.bucket_count,
      options_.threshold_use_skiplist, options_.huge_page_tlb_size, logger,
      options_.bucket_entries_logging_threshold,
      options_.if_log_bucket_dist_when_flash);
}

MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count, size_t huge_page_tlb_size,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
    uint32_t threshold_use_skiplist) {
  return new HashLinkListRepFactory(
      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
}
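
// --- Illustrative usage sketch (not part of the original file) ---
// Constructing the factory directly; the parameter values below are
// arbitrary examples, not recommendations. In application code the result
// is typically installed as ColumnFamilyOptions::memtable_factory together
// with a prefix extractor matching the transform used by the rep.
// 'MakeExampleHashLinkListFactory' is a hypothetical helper.
inline MemTableRepFactory* MakeExampleHashLinkListFactory() {
  return NewHashLinkListRepFactory(
      /*bucket_count=*/50000,
      /*huge_page_tlb_size=*/0,
      /*bucket_entries_logging_threshold=*/4096,
      /*if_log_bucket_dist_when_flash=*/true,
      /*threshold_use_skiplist=*/256);
}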
}  // namespace ROCKSDB_NAMESPACE