Coverage Report

Created: 2026-02-14 06:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/memtable/inlineskiplist.h
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.  Use of
7
// this source code is governed by a BSD-style license that can be found
8
// in the LICENSE file. See the AUTHORS file for names of contributors.
9
//
10
// InlineSkipList is derived from SkipList (skiplist.h), but it optimizes
11
// the memory layout by requiring that the key storage be allocated through
12
// the skip list instance.  For the common case of SkipList<const char*,
13
// Cmp> this saves 1 pointer per skip list node and gives better cache
14
// locality, at the expense of wasted padding from using AllocateAligned
15
// instead of Allocate for the keys.  The unused padding will be from
16
// 0 to sizeof(void*)-1 bytes, and the space savings are sizeof(void*)
17
// bytes, so despite the padding the space used is always less than
18
// SkipList<const char*, ..>.
19
//
20
// Thread safety -------------
21
//
22
// Writes via Insert require external synchronization, most likely a mutex.
23
// InsertConcurrently can be safely called concurrently with reads and
24
// with other concurrent inserts.  Reads require a guarantee that the
25
// InlineSkipList will not be destroyed while the read is in progress.
26
// Apart from that, reads progress without any internal locking or
27
// synchronization.
28
//
29
// Invariants:
30
//
31
// (1) Allocated nodes are never deleted until the InlineSkipList is
32
// destroyed.  This is trivially guaranteed by the code since we never
33
// delete any skip list nodes.
34
//
35
// (2) The contents of a Node except for the next/prev pointers are
36
// immutable after the Node has been linked into the InlineSkipList.
37
// Only Insert() modifies the list, and it is careful to initialize a
38
// node and use release-stores to publish the nodes in one or more lists.
39
//
40
// ... prev vs. next pointer ordering ...
41
//
42
43
#pragma once
44
#include <assert.h>
45
#include <stdlib.h>
46
47
#include <type_traits>
48
49
#include "memory/allocator.h"
50
#include "port/likely.h"
51
#include "port/port.h"
52
#include "rocksdb/slice.h"
53
#include "test_util/sync_point.h"
54
#include "util/atomic.h"
55
#include "util/random.h"
56
57
namespace ROCKSDB_NAMESPACE {
58
59
template <class Comparator>
60
class InlineSkipList {
61
 private:
62
  struct Node;
63
  struct Splice;
64
65
 public:
66
  using DecodedKey =
67
      typename std::remove_reference<Comparator>::type::DecodedType;
68
69
  static const uint16_t kMaxPossibleHeight = 32;
70
71
  // Create a new InlineSkipList object that will use "cmp" for comparing
72
  // keys, and will allocate memory using "*allocator".  Objects allocated
73
  // in the allocator must remain allocated for the lifetime of the
74
  // skiplist object.
75
  explicit InlineSkipList(Comparator cmp, Allocator* allocator,
76
                          int32_t max_height = 12,
77
                          int32_t branching_factor = 4);
78
  // No copying allowed
79
  InlineSkipList(const InlineSkipList&) = delete;
80
  InlineSkipList& operator=(const InlineSkipList&) = delete;
81
82
  // Allocates a key and a skip-list node, returning a pointer to the key
83
  // portion of the node.  This method is thread-safe if the allocator
84
  // is thread-safe.
85
  char* AllocateKey(size_t key_size);
86
87
  // Allocate a splice using allocator.
88
  Splice* AllocateSplice();
89
90
  // Allocate a splice on heap.
91
  Splice* AllocateSpliceOnHeap();
92
93
  // Inserts a key allocated by AllocateKey, after the actual key value
94
  // has been filled in.
95
  //
96
  // REQUIRES: nothing that compares equal to key is currently in the list.
97
  // REQUIRES: no concurrent calls to any of inserts.
98
  bool Insert(const char* key);
99
100
  // Inserts a key allocated by AllocateKey with a hint of last insert
101
  // position in the skip-list. If hint points to nullptr, a new hint will be
102
  // populated, which can be used in subsequent calls.
103
  //
104
  // It can be used to optimize the workload where there are multiple groups
105
  // of keys, and each key is likely to insert to a location close to the last
106
  // inserted key in the same group. One example is sequential inserts.
107
  //
108
  // REQUIRES: nothing that compares equal to key is currently in the list.
109
  // REQUIRES: no concurrent calls to any of inserts.
110
  bool InsertWithHint(const char* key, void** hint);
111
112
  // Like InsertConcurrently, but with a hint
113
  //
114
  // REQUIRES: nothing that compares equal to key is currently in the list.
115
  // REQUIRES: no concurrent calls that use same hint
116
  bool InsertWithHintConcurrently(const char* key, void** hint);
117
118
  // Like Insert, but external synchronization is not required.
119
  bool InsertConcurrently(const char* key);
120
121
  // Inserts a node into the skip list.  key must have been allocated by
122
  // AllocateKey and then filled in by the caller.  If UseCAS is true,
123
  // then external synchronization is not required, otherwise this method
124
  // may not be called concurrently with any other insertions.
125
  //
126
  // Regardless of whether UseCAS is true, the splice must be owned
127
  // exclusively by the current thread.  If allow_partial_splice_fix is
128
  // true, then the cost of insertion is amortized O(log D), where D is
129
  // the distance from the splice to the inserted key (measured as the
130
  // number of intervening nodes).  Note that this bound is very good for
131
  // sequential insertions!  If allow_partial_splice_fix is false then
132
  // the existing splice will be ignored unless the current key is being
133
  // inserted immediately after the splice.  allow_partial_splice_fix ==
134
  // false has worse running time for the non-sequential case O(log N),
135
  // but a better constant factor.
136
  template <bool UseCAS>
137
  bool Insert(const char* key, Splice* splice, bool allow_partial_splice_fix);
138
139
  // Returns true iff an entry that compares equal to key is in the list.
140
  bool Contains(const char* key) const;
141
142
  // Return estimated number of entries from `start_ikey` to `end_ikey`.
143
  uint64_t ApproximateNumEntries(const Slice& start_ikey,
144
                                 const Slice& end_ikey) const;
145
146
  // Validate correctness of the skip-list.
147
  void TEST_Validate() const;
148
149
  // Iteration over the contents of a skip list
150
  class Iterator {
151
   public:
152
    // Initialize an iterator over the specified list.
153
    // The returned iterator is not valid.
154
    explicit Iterator(const InlineSkipList* list);
155
156
    // Change the underlying skiplist used for this iterator
157
    // This enables us not changing the iterator without deallocating
158
    // an old one and then allocating a new one
159
    void SetList(const InlineSkipList* list);
160
161
    // Returns true iff the iterator is positioned at a valid node.
162
    bool Valid() const;
163
164
    // Returns the key at the current position.
165
    // REQUIRES: Valid()
166
    const char* key() const;
167
168
    // Advances to the next position.
169
    // REQUIRES: Valid()
170
    void Next();
171
172
    [[nodiscard]] Status NextAndValidate(bool allow_data_in_errors);
173
174
    // Advances to the previous position.
175
    // REQUIRES: Valid()
176
    void Prev();
177
178
    [[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors);
179
180
    // Advance to the first entry with a key >= target
181
    void Seek(const char* target);
182
183
    [[nodiscard]] Status SeekAndValidate(
184
        const char* target, bool allow_data_in_errors,
185
        bool detect_key_out_of_order,
186
        const std::function<Status(const char*, bool)>&
187
            key_validation_callback);
188
189
    // Retreat to the last entry with a key <= target
190
    void SeekForPrev(const char* target);
191
192
    // Advance to a random entry in the list.
193
    void RandomSeek();
194
195
    // Position at the first entry in list.
196
    // Final state of iterator is Valid() iff list is not empty.
197
    void SeekToFirst();
198
199
    // Position at the last entry in list.
200
    // Final state of iterator is Valid() iff list is not empty.
201
    void SeekToLast();
202
203
   private:
204
    const InlineSkipList* list_;
205
    Node* node_;
206
    // Intentionally copyable
207
  };
208
209
 private:
210
  const uint16_t kMaxHeight_;
211
  const uint16_t kBranching_;
212
  const uint32_t kScaledInverseBranching_;
213
214
  Allocator* const allocator_;  // Allocator used for allocations of nodes
215
  // Immutable after construction
216
  Comparator const compare_;
217
  Node* const head_;
218
219
  // Maximum height of any node in the list (or in the process of being added).
220
  //  Modified only by Insert().  Relaxed reads are always OK because starting
221
  // from higher levels only helps efficiency, not correctness.
222
  RelaxedAtomic<int> max_height_;
223
224
  // seq_splice_ is a Splice used for insertions in the non-concurrent
225
  // case.  It caches the prev and next found during the most recent
226
  // non-concurrent insertion.
227
  Splice* seq_splice_;
228
229
16.3k
  inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); }
230
231
  int RandomHeight();
232
233
  Node* AllocateNode(size_t key_size, int height);
234
235
0
  bool Equal(const char* a, const char* b) const {
236
0
    return (compare_(a, b) == 0);
237
0
  }
238
239
0
  bool LessThan(const char* a, const char* b) const {
240
0
    return (compare_(a, b) < 0);
241
0
  }
242
243
  // Return true if key is greater than the data stored in "n".  Null n
244
  // is considered infinite.  n should not be head_.
245
  bool KeyIsAfterNode(const char* key, Node* n) const;
246
  bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
247
248
  // Returns the earliest node with a key >= key.
249
  // Returns OK, if no corruption is found.
250
  // node is set to the found node, or to nullptr if no node is found.
251
  // Returns Corruption if a corruption is found.
252
  Status FindGreaterOrEqual(const char* key, Node** node,
253
                            bool detect_key_out_of_order,
254
                            bool allow_data_in_errors,
255
                            const std::function<Status(const char*, bool)>&
256
                                key_validation_callback) const;
257
258
  // Returns the latest node with a key < key.
259
  // Returns head_ if there is no such node.
260
  // Fills prev[level] with pointer to previous node at "level" for every
261
  // level in [0..max_height_-1], if prev is non-null.
262
  // @param corrupted_node If not null, will validate the order of visited
263
  // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
264
  // returned and *corrupted_node will be set to n2.
265
  Node* FindLessThan(const char* key, Node** corrupted_node) const;
266
267
  // Return the last node in the list.
268
  // Return head_ if list is empty.
269
  Node* FindLast() const;
270
271
  // Returns a random entry.
272
  Node* FindRandomEntry() const;
273
274
  // Traverses a single level of the list, setting *out_prev to the last
275
  // node before the key and *out_next to the first node after. Assumes
276
  // that the key is not present in the skip list. On entry, before should
277
  // point to a node that is before the key, and after should point to
278
  // a node that is after the key.  after should be nullptr if a good after
279
  // node isn't conveniently available.
280
  template <bool prefetch_before>
281
  void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after,
282
                          int level, Node** out_prev, Node** out_next);
283
284
  // Recomputes Splice levels from highest_level (inclusive) down to
285
  // lowest_level (inclusive).
286
  void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
287
                             int recompute_level);
288
289
  static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors);
290
};
291
292
// Implementation details follow
293
294
template <class Comparator>
295
struct InlineSkipList<Comparator>::Splice {
296
  // The invariant of a Splice is that prev_[i+1].key <= prev_[i].key <
297
  // next_[i].key <= next_[i+1].key for all i.  That means that if a
298
  // key is bracketed by prev_[i] and next_[i] then it is bracketed by
299
  // all higher levels.  It is _not_ required that prev_[i]->Next(i) ==
300
  // next_[i] (it probably did at some point in the past, but intervening
301
  // or concurrent operations might have inserted nodes in between).
302
  int height_ = 0;
303
  Node** prev_;
304
  Node** next_;
305
};
306
307
// The Node data type is more of a pointer into custom-managed memory than
308
// a traditional C++ struct.  The key is stored in the bytes immediately
309
// after the struct, and the next_ pointers for nodes with height > 1 are
310
// stored immediately _before_ the struct.  This avoids the need to include
311
// any pointer or sizing data, which reduces per-node memory overheads.
312
template <class Comparator>
313
struct InlineSkipList<Comparator>::Node {
314
  // Stores the height of the node in the memory location normally used for
315
  // next_[0].  This is used for passing data from AllocateKey to Insert.
316
130k
  void StashHeight(const int height) {
317
130k
    static_assert(sizeof(int) <= sizeof(next_[0]));
318
130k
    memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int));
319
130k
  }
320
321
  // Retrieves the value passed to StashHeight.  Undefined after a call
322
  // to SetNext or NoBarrier_SetNext.
323
29.0k
  int UnstashHeight() const {
324
29.0k
    int rv;
325
29.0k
    memcpy(&rv, &next_[0], sizeof(int));
326
29.0k
    return rv;
327
29.0k
  }
328
329
124k
  const char* Key() const { return reinterpret_cast<const char*>(&next_[1]); }
330
331
  // Accessors/mutators for links.  Wrapped in methods so we can add
332
  // the appropriate barriers as necessary, and perform the necessary
333
  // addressing trickery for storing links below the Node in memory.
334
166k
  Node* Next(int n) {
335
166k
    assert(n >= 0);
336
    // Use an 'acquire load' so that we observe a fully initialized
337
    // version of the returned Node.
338
166k
    return ((&next_[0] - n)->Load());
339
166k
  }
340
341
1.25M
  void SetNext(int n, Node* x) {
342
1.25M
    assert(n >= 0);
343
    // Use a 'release store' so that anybody who reads through this
344
    // pointer observes a fully initialized version of the inserted node.
345
1.25M
    (&next_[0] - n)->Store(x);
346
1.25M
  }
347
348
0
  bool CASNext(int n, Node* expected, Node* x) {
349
0
    assert(n >= 0);
350
0
    return (&next_[0] - n)->CasStrong(expected, x);
351
0
  }
352
353
  // No-barrier variants that can be safely used in a few locations.
354
  Node* NoBarrier_Next(int n) {
355
    assert(n >= 0);
356
    return (&next_[0] - n)->LoadRelaxed();
357
  }
358
359
38.5k
  void NoBarrier_SetNext(int n, Node* x) {
360
38.5k
    assert(n >= 0);
361
38.5k
    (&next_[0] - n)->StoreRelaxed(x);
362
38.5k
  }
363
364
  // Insert node after prev on specific level.
365
  void InsertAfter(Node* prev, int level) {
366
    // NoBarrier_SetNext() suffices since we will add a barrier when
367
    // we publish a pointer to "this" in prev.
368
    NoBarrier_SetNext(level, prev->NoBarrier_Next(level));
369
    prev->SetNext(level, this);
370
  }
371
372
 private:
373
  // next_[0] is the lowest level link (level 0).  Higher levels are
374
  // stored _earlier_, so level 1 is at next_[-1].
375
  Atomic<Node*> next_[1];
376
};
377
378
template <class Comparator>
379
inline InlineSkipList<Comparator>::Iterator::Iterator(
380
23.2k
    const InlineSkipList* list) {
381
23.2k
  SetList(list);
382
23.2k
}
383
384
template <class Comparator>
385
inline void InlineSkipList<Comparator>::Iterator::SetList(
386
23.2k
    const InlineSkipList* list) {
387
23.2k
  list_ = list;
388
23.2k
  node_ = nullptr;
389
23.2k
}
390
391
template <class Comparator>
392
36.7k
inline bool InlineSkipList<Comparator>::Iterator::Valid() const {
393
36.7k
  return node_ != nullptr;
394
36.7k
}
395
396
template <class Comparator>
397
40.9k
inline const char* InlineSkipList<Comparator>::Iterator::key() const {
398
40.9k
  assert(Valid());
399
40.9k
  return node_->Key();
400
40.9k
}
401
402
template <class Comparator>
403
11.7k
inline void InlineSkipList<Comparator>::Iterator::Next() {
404
11.7k
  assert(Valid());
405
406
  // Capture the key before move on to next node
407
11.7k
  TEST_SYNC_POINT_CALLBACK(
408
11.7k
      "InlineSkipList::Iterator::Next::key",
409
11.7k
      static_cast<void*>(const_cast<char*>((node_->Key()))));
410
411
11.7k
  node_ = node_->Next(0);
412
11.7k
}
413
414
template <class Comparator>
415
inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
416
0
    bool allow_data_in_errors) {
417
0
  assert(Valid());
418
419
  // Capture the key before move on to next node
420
0
  TEST_SYNC_POINT_CALLBACK(
421
0
      "InlineSkipList::Iterator::Next::key",
422
0
      static_cast<void*>(const_cast<char*>((node_->Key()))));
423
424
0
  Node* prev_node = node_;
425
0
  node_ = node_->Next(0);
426
  // Verify that keys are increasing.
427
0
  if (prev_node != list_->head_ && node_ != nullptr &&
428
0
      list_->compare_(prev_node->Key(), node_->Key()) >= 0) {
429
0
    Node* node = node_;
430
    // invalidates the iterator
431
0
    node_ = nullptr;
432
0
    return Corruption(prev_node, node, allow_data_in_errors);
433
0
  }
434
0
  return Status::OK();
435
0
}
436
437
template <class Comparator>
438
2.14k
inline void InlineSkipList<Comparator>::Iterator::Prev() {
439
  // Instead of using explicit "prev" links, we just search for the
440
  // last node that falls before key.
441
2.14k
  assert(Valid());
442
2.14k
  node_ = list_->FindLessThan(node_->Key(), nullptr);
443
2.14k
  if (node_ == list_->head_) {
444
743
    node_ = nullptr;
445
743
  }
446
2.14k
}
447
448
template <class Comparator>
449
inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
450
0
    const bool allow_data_in_errors) {
451
0
  assert(Valid());
452
  // Skip list validation is done in FindLessThan().
453
0
  Node* corrupted_node = nullptr;
454
0
  node_ = list_->FindLessThan(node_->Key(), &corrupted_node);
455
0
  if (corrupted_node) {
456
0
    Node* node = node_;
457
0
    node_ = nullptr;
458
0
    return Corruption(node, corrupted_node, allow_data_in_errors);
459
0
  }
460
0
  if (node_ == list_->head_) {
461
0
    node_ = nullptr;
462
0
  }
463
0
  return Status::OK();
464
0
}
465
466
template <class Comparator>
467
12.9k
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
468
12.9k
  auto status =
469
12.9k
      list_->FindGreaterOrEqual(target, &node_, false, false, nullptr);
470
12.9k
  assert(status.ok());
471
12.9k
}
472
473
template <class Comparator>
474
inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
475
    const char* target, const bool allow_data_in_errors,
476
    bool check_key_out_of_order,
477
0
    const std::function<Status(const char*, bool)>& key_validation_callback) {
478
0
  return list_->FindGreaterOrEqual(target, &node_, allow_data_in_errors,
479
0
                                   check_key_out_of_order,
480
0
                                   key_validation_callback);
481
0
}
482
483
template <class Comparator>
484
inline void InlineSkipList<Comparator>::Iterator::SeekForPrev(
485
0
    const char* target) {
486
0
  Seek(target);
487
0
  if (!Valid()) {
488
0
    SeekToLast();
489
0
  }
490
0
  while (Valid() && list_->LessThan(target, key())) {
491
0
    Prev();
492
0
  }
493
0
}
494
495
template <class Comparator>
496
0
inline void InlineSkipList<Comparator>::Iterator::RandomSeek() {
497
0
  node_ = list_->FindRandomEntry();
498
0
}
499
500
template <class Comparator>
501
8.68k
inline void InlineSkipList<Comparator>::Iterator::SeekToFirst() {
502
8.68k
  node_ = list_->head_->Next(0);
503
8.68k
}
504
505
template <class Comparator>
506
1.24k
inline void InlineSkipList<Comparator>::Iterator::SeekToLast() {
507
1.24k
  node_ = list_->FindLast();
508
1.24k
  if (node_ == list_->head_) {
509
772
    node_ = nullptr;
510
772
  }
511
1.24k
}
512
513
template <class Comparator>
514
29.0k
int InlineSkipList<Comparator>::RandomHeight() {
515
29.0k
  auto rnd = Random::GetTLSInstance();
516
517
  // Increase height with probability 1 in kBranching
518
29.0k
  int height = 1;
519
38.5k
  while (height < kMaxHeight_ && height < kMaxPossibleHeight &&
520
38.5k
         rnd->Next() < kScaledInverseBranching_) {
521
9.47k
    height++;
522
9.47k
  }
523
29.0k
  TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height);
524
29.0k
  assert(height > 0);
525
29.0k
  assert(height <= kMaxHeight_);
526
29.0k
  assert(height <= kMaxPossibleHeight);
527
29.0k
  return height;
528
29.0k
}
529
530
template <class Comparator>
531
bool InlineSkipList<Comparator>::KeyIsAfterNode(const char* key,
532
                                                Node* n) const {
533
  // nullptr n is considered infinite
534
  assert(n != head_);
535
  return (n != nullptr) && (compare_(n->Key(), key) < 0);
536
}
537
538
template <class Comparator>
539
bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
540
32.6k
                                                Node* n) const {
541
  // nullptr n is considered infinite
542
32.6k
  assert(n != head_);
543
32.6k
  return (n != nullptr) && (compare_(n->Key(), key) < 0);
544
32.6k
}
545
546
template <class Comparator>
547
Status InlineSkipList<Comparator>::FindGreaterOrEqual(
548
    const char* key, Node** node, bool allow_data_in_errors,
549
    bool detect_key_out_of_order,
550
    const std::function<Status(const char*, bool)>& key_validation_callback)
551
12.9k
    const {
552
  // Note: It looks like we could reduce duplication by implementing
553
  // this function as FindLessThan(key)->Next(0), but we wouldn't be able
554
  // to exit early on equality and the result wouldn't even be correct.
555
  // A concurrent insert might occur after FindLessThan(key) but before
556
  // we get a chance to call Next(0).
557
12.9k
  Node* x = head_;
558
12.9k
  *node = nullptr;
559
12.9k
  int level = GetMaxHeight() - 1;
560
12.9k
  Node* last_bigger = nullptr;
561
12.9k
  const DecodedKey key_decoded = compare_.decode_key(key);
562
16.1k
  while (true) {
563
16.1k
    Node* next = x->Next(level);
564
16.1k
    if (next != nullptr) {
565
13.4k
      PREFETCH(next->Next(level), 0, 1);
566
13.4k
      if (detect_key_out_of_order && x != head_ &&
567
0
          compare_(x->Key(), next->Key()) >= 0) {
568
0
        return Corruption(x, next, allow_data_in_errors);
569
0
      }
570
13.4k
      if (key_validation_callback != nullptr) {
571
0
        auto status =
572
0
            key_validation_callback(next->Key(), allow_data_in_errors);
573
0
        if (!status.ok()) {
574
0
          return status;
575
0
        }
576
0
      }
577
13.4k
    }
578
    // Make sure the lists are sorted
579
16.1k
    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
580
    // Make sure we haven't overshot during our search
581
16.1k
    assert(x == head_ || KeyIsAfterNode(key_decoded, x));
582
16.1k
    int cmp = (next == nullptr || next == last_bigger)
583
16.1k
                  ? 1
584
16.1k
                  : compare_(next->Key(), key_decoded);
585
16.1k
    if (cmp == 0 || (cmp > 0 && level == 0)) {
586
12.9k
      *node = next;
587
12.9k
      return Status::OK();
588
12.9k
    } else if (cmp < 0) {
589
      // Keep searching in this list
590
1.71k
      x = next;
591
1.71k
    } else {
592
      // Switch to next list, reuse compare_() result
593
1.49k
      last_bigger = next;
594
1.49k
      level--;
595
1.49k
    }
596
16.1k
  }
597
12.9k
}
598
599
template <class Comparator>
600
typename InlineSkipList<Comparator>::Node*
601
InlineSkipList<Comparator>::FindLessThan(const char* key,
602
2.14k
                                         Node** const out_of_order_node) const {
603
2.14k
  int level = GetMaxHeight() - 1;
604
2.14k
  assert(level >= 0);
605
2.14k
  Node* x = head_;
606
  // KeyIsAfter(key, last_not_after) is definitely false
607
2.14k
  Node* last_not_after = nullptr;
608
2.14k
  const DecodedKey key_decoded = compare_.decode_key(key);
609
7.84k
  while (true) {
610
7.84k
    assert(x != nullptr);
611
7.84k
    Node* next = x->Next(level);
612
7.84k
    if (next != nullptr) {
613
7.17k
      PREFETCH(next->Next(level), 0, 1);
614
7.17k
      if (out_of_order_node && x != head_ &&
615
0
          compare_(x->Key(), next->Key()) >= 0) {
616
0
        *out_of_order_node = next;
617
0
        return x;
618
0
      }
619
7.17k
    }
620
7.84k
    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
621
7.84k
    assert(x == head_ || KeyIsAfterNode(key_decoded, x));
622
7.84k
    if (next != last_not_after && KeyIsAfterNode(key_decoded, next)) {
623
      // Keep searching in this list
624
3.64k
      assert(next != nullptr);
625
3.64k
      x = next;
626
4.20k
    } else {
627
4.20k
      if (level == 0) {
628
2.14k
        return x;
629
2.14k
      } else {
630
        // Switch to next list, reuse KeyIsAfterNode() result
631
2.05k
        last_not_after = next;
632
2.05k
        level--;
633
2.05k
      }
634
4.20k
    }
635
7.84k
  }
636
2.14k
}
637
638
template <class Comparator>
639
typename InlineSkipList<Comparator>::Node*
640
1.24k
InlineSkipList<Comparator>::FindLast() const {
641
1.24k
  Node* x = head_;
642
1.24k
  int level = GetMaxHeight() - 1;
643
2.60k
  while (true) {
644
2.60k
    Node* next = x->Next(level);
645
2.60k
    if (next == nullptr) {
646
1.53k
      if (level == 0) {
647
1.24k
        return x;
648
1.24k
      } else {
649
        // Switch to next list
650
287
        level--;
651
287
      }
652
1.53k
    } else {
653
1.06k
      x = next;
654
1.06k
    }
655
2.60k
  }
656
1.24k
}
657
658
template <class Comparator>
659
typename InlineSkipList<Comparator>::Node*
660
0
InlineSkipList<Comparator>::FindRandomEntry() const {
661
  // TODO(bjlemaire): consider adding PREFETCH calls.
662
0
  Node *x = head_, *scan_node = nullptr, *limit_node = nullptr;
663
664
  // We start at the max level.
665
  // FOr each level, we look at all the nodes at the level, and
666
  // we randomly pick one of them. Then decrement the level
667
  // and reiterate the process.
668
  // eg: assume GetMaxHeight()=5, and there are #100 elements (nodes).
669
  // level 4 nodes: lvl_nodes={#1, #15, #67, #84}. Randomly pick #15.
670
  // We will consider all the nodes between #15 (inclusive) and #67
671
  // (exclusive). #67 is called 'limit_node' here.
672
  // level 3 nodes: lvl_nodes={#15, #21, #45, #51}. Randomly choose
673
  // #51. #67 remains 'limit_node'.
674
  // [...]
675
  // level 0 nodes: lvl_nodes={#56,#57,#58,#59}. Randomly pick $57.
676
  // Return Node #57.
677
0
  std::vector<Node*> lvl_nodes;
678
0
  Random* rnd = Random::GetTLSInstance();
679
0
  int level = GetMaxHeight() - 1;
680
681
0
  while (level >= 0) {
682
0
    lvl_nodes.clear();
683
0
    scan_node = x;
684
0
    while (scan_node != limit_node) {
685
0
      lvl_nodes.push_back(scan_node);
686
0
      scan_node = scan_node->Next(level);
687
0
    }
688
0
    uint32_t rnd_idx = rnd->Next() % lvl_nodes.size();
689
0
    x = lvl_nodes[rnd_idx];
690
0
    if (rnd_idx + 1 < lvl_nodes.size()) {
691
0
      limit_node = lvl_nodes[rnd_idx + 1];
692
0
    }
693
0
    level--;
694
0
  }
695
  // There is a special case where x could still be the head_
696
  // (note that the head_ contains no key).
697
0
  return x == head_ && head_ != nullptr ? head_->Next(0) : x;
698
0
}
699
700
template <class Comparator>
701
uint64_t InlineSkipList<Comparator>::ApproximateNumEntries(
702
0
    const Slice& start_ikey, const Slice& end_ikey) const {
703
  // The number of entries at a given level for the given range, in terms of
704
  // the actual number of entries in that range (level 0), follows a binomial
705
  // distribution, which is very well approximated by the Poisson distribution.
706
  // That has stddev sqrt(x) where x is the expected number of entries (mean)
707
  // at this level, and the best predictor of x is the number of observed
708
  // entries (at this level). To predict the number of entries on level 0 we use
709
  // x * kBranchinng ^ level. From the standard deviation, the P99+ relative
710
  // error is roughly 3 * sqrt(x) / x. Thus, a reasonable approach would be to
711
  // find the smallest level with at least some moderate constant number entries
712
  // in range. E.g. with at least ~40 entries, we expect P99+ relative error
713
  // (approximation accuracy) of ~ 50% = 3 * sqrt(40) / 40; P95 error of
714
  // ~30%; P75 error of < 20%.
715
  //
716
  // However, there are two issues with this approach, and an observation:
717
  // * Pointer chasing on the larger (bottom) levels is much slower because of
718
  // cache hierarchy effects, so when the result is smaller, getting the result
719
  // will be substantially slower, despite traversing a similar number of
720
  // entries. (We could be clever about pipelining our pointer chasing but
721
  // that's complicated.)
722
  // * The larger (bottom) levels also have lower variance because there's a
723
  // chance (or certainty) that we reach level 0 and return the exact answer.
724
  // * For applications in query planning, we can also tolerate more variance on
725
  // small results because the impact of misestimating is likely smaller.
726
  //
727
  // These factors point us to an approach in which we have a higher minimum
728
  // threshold number of samples for higher levels and lower for lower levels
729
  // (see sufficient_samples below). This seems to yield roughly consistent
730
  // relative error (stddev around 20%, less for large results) and roughly
731
  // consistent query time around the time of two memtable point queries.
732
  //
733
  // Engineering observation: it is tempting to think that taking into account
734
  // what we already found in how many entries occur on higher levels, not just
735
  // the first iterated level with a sufficient number of samples, would yield
736
  // a more accurate estimate. But that doesn't work because of the particular
737
  // correlations and independences of the data: each level higher is just an
738
  // independently probabilistic filtering of the level below it. That
739
  // filtering from level l to l+1 has no more information about levels
740
  // 0 .. l-1 than we can get from level l. The structure of RandomHeight() is
741
  // a clue to these correlations and independences.
742
743
0
  Node* lb = head_;
744
0
  Node* ub = nullptr;
745
0
  uint64_t count = 0;
746
0
  for (int level = GetMaxHeight() - 1; level >= 0; level--) {
747
0
    auto sufficient_samples = static_cast<uint64_t>(level) * kBranching_ + 10U;
748
0
    if (count >= sufficient_samples) {
749
      // No more counting; apply powers of kBranching and avoid floating point
750
0
      count *= kBranching_;
751
0
      continue;
752
0
    }
753
0
    count = 0;
754
0
    Node* next;
755
    // Get a more precise lower bound (for start key)
756
0
    for (;;) {
757
0
      next = lb->Next(level);
758
0
      if (next == ub) {
759
0
        break;
760
0
      }
761
0
      assert(next != nullptr);
762
0
      if (compare_(next->Key(), start_ikey) >= 0) {
763
0
        break;
764
0
      }
765
0
      lb = next;
766
0
    }
767
    // Count entries on this level until upper bound (for end key)
768
0
    for (;;) {
769
0
      if (next == ub) {
770
0
        break;
771
0
      }
772
0
      assert(next != nullptr);
773
0
      if (compare_(next->Key(), end_ikey) >= 0) {
774
        // Save refined upper bound to potentially save key comparison
775
0
        ub = next;
776
0
        break;
777
0
      }
778
0
      count++;
779
0
      next = next->Next(level);
780
0
    }
781
0
  }
782
0
  return count;
783
0
}
784
785
template <class Comparator>
786
InlineSkipList<Comparator>::InlineSkipList(const Comparator cmp,
787
                                           Allocator* allocator,
788
                                           int32_t max_height,
789
                                           int32_t branching_factor)
790
101k
    : kMaxHeight_(static_cast<uint16_t>(max_height)),
791
101k
      kBranching_(static_cast<uint16_t>(branching_factor)),
792
101k
      kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_),
793
101k
      allocator_(allocator),
794
101k
      compare_(cmp),
795
101k
      head_(AllocateNode(0, max_height)),
796
101k
      max_height_(1),
797
101k
      seq_splice_(AllocateSplice()) {
798
101k
  assert(max_height > 0 && kMaxHeight_ == static_cast<uint32_t>(max_height));
799
101k
  assert(branching_factor > 1 &&
800
101k
         kBranching_ == static_cast<uint32_t>(branching_factor));
801
101k
  assert(kScaledInverseBranching_ > 0);
802
803
1.31M
  for (int i = 0; i < kMaxHeight_; ++i) {
804
1.21M
    head_->SetNext(i, nullptr);
805
1.21M
  }
806
101k
}
807
808
template <class Comparator>
809
29.0k
char* InlineSkipList<Comparator>::AllocateKey(size_t key_size) {
810
29.0k
  return const_cast<char*>(AllocateNode(key_size, RandomHeight())->Key());
811
29.0k
}
812
813
template <class Comparator>
814
typename InlineSkipList<Comparator>::Node*
815
130k
InlineSkipList<Comparator>::AllocateNode(size_t key_size, int height) {
816
130k
  auto prefix = sizeof(Atomic<Node*>) * (height - 1);
817
818
  // prefix is space for the height - 1 pointers that we store before
819
  // the Node instance (next_[-(height - 1) .. -1]).  Node starts at
820
  // raw + prefix, and holds the bottom-mode (level 0) skip list pointer
821
  // next_[0].  key_size is the bytes for the key, which comes just after
822
  // the Node.
823
130k
  char* raw = allocator_->AllocateAligned(prefix + sizeof(Node) + key_size);
824
130k
  Node* x = reinterpret_cast<Node*>(raw + prefix);
825
826
  // Once we've linked the node into the skip list we don't actually need
827
  // to know its height, because we can implicitly use the fact that we
828
  // traversed into a node at level h to known that h is a valid level
829
  // for that node.  We need to convey the height to the Insert step,
830
  // however, so that it can perform the proper links.  Since we're not
831
  // using the pointers at the moment, StashHeight temporarily borrow
832
  // storage from next_[0] for that purpose.
833
130k
  x->StashHeight(height);
834
130k
  return x;
835
130k
}
836
837
template <class Comparator>
838
typename InlineSkipList<Comparator>::Splice*
839
101k
InlineSkipList<Comparator>::AllocateSplice() {
840
  // size of prev_ and next_
841
101k
  size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1);
842
101k
  char* raw = allocator_->AllocateAligned(sizeof(Splice) + array_size * 2);
843
101k
  Splice* splice = reinterpret_cast<Splice*>(raw);
844
101k
  splice->height_ = 0;
845
101k
  splice->prev_ = reinterpret_cast<Node**>(raw + sizeof(Splice));
846
101k
  splice->next_ = reinterpret_cast<Node**>(raw + sizeof(Splice) + array_size);
847
101k
  return splice;
848
101k
}
849
850
template <class Comparator>
851
typename InlineSkipList<Comparator>::Splice*
852
0
InlineSkipList<Comparator>::AllocateSpliceOnHeap() {
853
0
  size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1);
854
0
  char* raw = new char[sizeof(Splice) + array_size * 2];
855
0
  Splice* splice = reinterpret_cast<Splice*>(raw);
856
0
  splice->height_ = 0;
857
0
  splice->prev_ = reinterpret_cast<Node**>(raw + sizeof(Splice));
858
0
  splice->next_ = reinterpret_cast<Node**>(raw + sizeof(Splice) + array_size);
859
0
  return splice;
860
0
}
861
862
template <class Comparator>
863
29.0k
bool InlineSkipList<Comparator>::Insert(const char* key) {
864
29.0k
  return Insert<false>(key, seq_splice_, false);
865
29.0k
}
866
867
template <class Comparator>
868
0
bool InlineSkipList<Comparator>::InsertConcurrently(const char* key) {
869
0
  Node* prev[kMaxPossibleHeight];
870
0
  Node* next[kMaxPossibleHeight];
871
0
  Splice splice;
872
0
  splice.prev_ = prev;
873
0
  splice.next_ = next;
874
0
  return Insert<true>(key, &splice, false);
875
0
}
876
877
template <class Comparator>
878
0
bool InlineSkipList<Comparator>::InsertWithHint(const char* key, void** hint) {
879
0
  assert(hint != nullptr);
880
0
  Splice* splice = reinterpret_cast<Splice*>(*hint);
881
0
  if (splice == nullptr) {
882
0
    splice = AllocateSplice();
883
0
    *hint = splice;
884
0
  }
885
0
  return Insert<false>(key, splice, true);
886
0
}
887
888
template <class Comparator>
889
bool InlineSkipList<Comparator>::InsertWithHintConcurrently(const char* key,
890
0
                                                            void** hint) {
891
0
  assert(hint != nullptr);
892
0
  Splice* splice = reinterpret_cast<Splice*>(*hint);
893
0
  if (splice == nullptr) {
894
0
    splice = AllocateSpliceOnHeap();
895
0
    *hint = splice;
896
0
  }
897
0
  return Insert<true>(key, splice, true);
898
0
}
899
900
template <class Comparator>
901
template <bool prefetch_before>
902
void InlineSkipList<Comparator>::FindSpliceForLevel(const DecodedKey& key,
903
                                                    Node* before, Node* after,
904
                                                    int level, Node** out_prev,
905
44.7k
                                                    Node** out_next) {
906
55.5k
  while (true) {
907
55.5k
    Node* next = before->Next(level);
908
55.5k
    if (next != nullptr) {
909
23.1k
      PREFETCH(next->Next(level), 0, 1);
910
23.1k
    }
911
55.5k
    if (prefetch_before == true) {
912
55.5k
      if (next != nullptr && level > 0) {
913
14.3k
        PREFETCH(next->Next(level - 1), 0, 1);
914
14.3k
      }
915
55.5k
    }
916
55.5k
    assert(before == head_ || next == nullptr ||
917
55.5k
           KeyIsAfterNode(next->Key(), before));
918
55.5k
    assert(before == head_ || KeyIsAfterNode(key, before));
919
55.5k
    if (next == after || !KeyIsAfterNode(key, next)) {
920
      // found it
921
44.7k
      *out_prev = before;
922
44.7k
      *out_next = next;
923
44.7k
      return;
924
44.7k
    }
925
10.7k
    before = next;
926
10.7k
  }
927
44.7k
}
void rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::FindSpliceForLevel<true>(rocksdb::Slice const&, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node*, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node*, int, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node**, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node**)
Line
Count
Source
905
44.7k
                                                    Node** out_next) {
906
55.5k
  while (true) {
907
55.5k
    Node* next = before->Next(level);
908
55.5k
    if (next != nullptr) {
909
23.1k
      PREFETCH(next->Next(level), 0, 1);
910
23.1k
    }
911
55.5k
    if (prefetch_before == true) {
912
55.5k
      if (next != nullptr && level > 0) {
913
14.3k
        PREFETCH(next->Next(level - 1), 0, 1);
914
14.3k
      }
915
55.5k
    }
916
55.5k
    assert(before == head_ || next == nullptr ||
917
55.5k
           KeyIsAfterNode(next->Key(), before));
918
55.5k
    assert(before == head_ || KeyIsAfterNode(key, before));
919
55.5k
    if (next == after || !KeyIsAfterNode(key, next)) {
920
      // found it
921
44.7k
      *out_prev = before;
922
44.7k
      *out_next = next;
923
44.7k
      return;
924
44.7k
    }
925
10.7k
    before = next;
926
10.7k
  }
927
44.7k
}
Unexecuted instantiation: void rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::FindSpliceForLevel<false>(rocksdb::Slice const&, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node*, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node*, int, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node**, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Node**)
928
929
template <class Comparator>
930
void InlineSkipList<Comparator>::RecomputeSpliceLevels(const DecodedKey& key,
931
                                                       Splice* splice,
932
28.8k
                                                       int recompute_level) {
933
28.8k
  assert(recompute_level > 0);
934
28.8k
  assert(recompute_level <= splice->height_);
935
73.6k
  for (int i = recompute_level - 1; i >= 0; --i) {
936
44.7k
    FindSpliceForLevel<true>(key, splice->prev_[i + 1], splice->next_[i + 1], i,
937
44.7k
                             &splice->prev_[i], &splice->next_[i]);
938
44.7k
  }
939
28.8k
}
940
941
template <class Comparator>
942
template <bool UseCAS>
943
bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
944
29.0k
                                        bool allow_partial_splice_fix) {
945
29.0k
  Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1;
946
29.0k
  const DecodedKey key_decoded = compare_.decode_key(key);
947
29.0k
  int height = x->UnstashHeight();
948
29.0k
  assert(height >= 1 && height <= kMaxHeight_);
949
950
29.0k
  int max_height = max_height_.LoadRelaxed();
951
29.0k
  while (height > max_height) {
952
6.38k
    if (max_height_.CasWeakRelaxed(max_height, height)) {
953
      // successfully updated it
954
6.38k
      max_height = height;
955
6.38k
      break;
956
6.38k
    }
957
    // else retry, possibly exiting the loop because somebody else
958
    // increased it
959
6.38k
  }
960
29.0k
  assert(max_height <= kMaxPossibleHeight);
961
962
29.0k
  int recompute_height = 0;
963
29.0k
  if (splice->height_ < max_height) {
964
    // Either splice has never been used or max_height has grown since
965
    // last use.  We could potentially fix it in the latter case, but
966
    // that is tricky.
967
23.6k
    splice->prev_[max_height] = head_;
968
23.6k
    splice->next_[max_height] = nullptr;
969
23.6k
    splice->height_ = max_height;
970
23.6k
    recompute_height = max_height;
971
23.6k
  } else {
972
    // Splice is a valid proper-height splice that brackets some
973
    // key, but does it bracket this one?  We need to validate it and
974
    // recompute a portion of the splice (levels 0..recompute_height-1)
975
    // that is a superset of all levels that don't bracket the new key.
976
    // Several choices are reasonable, because we have to balance the work
977
    // saved against the extra comparisons required to validate the Splice.
978
    //
979
    // One strategy is just to recompute all of orig_splice_height if the
980
    // bottom level isn't bracketing.  This pessimistically assumes that
981
    // we will either get a perfect Splice hit (increasing sequential
982
    // inserts) or have no locality.
983
    //
984
    // Another strategy is to walk up the Splice's levels until we find
985
    // a level that brackets the key.  This strategy lets the Splice
986
    // hint help for other cases: it turns insertion from O(log N) into
987
    // O(log D), where D is the number of nodes in between the key that
988
    // produced the Splice and the current insert (insertion is aided
989
    // whether the new key is before or after the splice).  If you have
990
    // a way of using a prefix of the key to map directly to the closest
991
    // Splice out of O(sqrt(N)) Splices and we make it so that splices
992
    // can also be used as hints during read, then we end up with Oshman's
993
    // and Shavit's SkipTrie, which has O(log log N) lookup and insertion
994
    // (compare to O(log N) for skip list).
995
    //
996
    // We control the pessimistic strategy with allow_partial_splice_fix.
997
    // A good strategy is probably to be pessimistic for seq_splice_,
998
    // optimistic if the caller actually went to the work of providing
999
    // a Splice.
1000
10.6k
    while (recompute_height < max_height) {
1001
5.41k
      if (splice->prev_[recompute_height]->Next(recompute_height) !=
1002
5.41k
          splice->next_[recompute_height]) {
1003
        // splice isn't tight at this level, there must have been some inserts
1004
        // to this
1005
        // location that didn't update the splice.  We might only be a little
1006
        // stale, but if
1007
        // the splice is very stale it would be O(N) to fix it.  We haven't used
1008
        // up any of
1009
        // our budget of comparisons, so always move up even if we are
1010
        // pessimistic about
1011
        // our chances of success.
1012
0
        ++recompute_height;
1013
5.41k
      } else if (splice->prev_[recompute_height] != head_ &&
1014
5.41k
                 !KeyIsAfterNode(key_decoded,
1015
5.41k
                                 splice->prev_[recompute_height])) {
1016
        // key is from before splice
1017
4.64k
        if (allow_partial_splice_fix) {
1018
          // skip all levels with the same node without more comparisons
1019
0
          Node* bad = splice->prev_[recompute_height];
1020
0
          while (splice->prev_[recompute_height] == bad) {
1021
0
            ++recompute_height;
1022
0
          }
1023
4.64k
        } else {
1024
          // we're pessimistic, recompute everything
1025
4.64k
          recompute_height = max_height;
1026
4.64k
        }
1027
4.64k
      } else if (KeyIsAfterNode(key_decoded, splice->next_[recompute_height])) {
1028
        // key is from after splice
1029
564
        if (allow_partial_splice_fix) {
1030
0
          Node* bad = splice->next_[recompute_height];
1031
0
          while (splice->next_[recompute_height] == bad) {
1032
0
            ++recompute_height;
1033
0
          }
1034
564
        } else {
1035
564
          recompute_height = max_height;
1036
564
        }
1037
564
      } else {
1038
        // this level brackets the key, we won!
1039
213
        break;
1040
213
      }
1041
5.41k
    }
1042
5.41k
  }
1043
29.0k
  assert(recompute_height <= max_height);
1044
29.0k
  if (recompute_height > 0) {
1045
28.8k
    RecomputeSpliceLevels(key_decoded, splice, recompute_height);
1046
28.8k
  }
1047
1048
29.0k
  bool splice_is_valid = true;
1049
29.0k
  if (UseCAS) {
1050
0
    for (int i = 0; i < height; ++i) {
1051
0
      while (true) {
1052
        // Checking for duplicate keys on the level 0 is sufficient
1053
0
        if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
1054
0
                     compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
1055
          // duplicate key
1056
0
          return false;
1057
0
        }
1058
0
        if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
1059
0
                     compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
1060
          // duplicate key
1061
0
          return false;
1062
0
        }
1063
0
        assert(splice->next_[i] == nullptr ||
1064
0
               compare_(x->Key(), splice->next_[i]->Key()) < 0);
1065
0
        assert(splice->prev_[i] == head_ ||
1066
0
               compare_(splice->prev_[i]->Key(), x->Key()) < 0);
1067
0
        x->NoBarrier_SetNext(i, splice->next_[i]);
1068
0
        if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) {
1069
          // success
1070
0
          break;
1071
0
        }
1072
        // CAS failed, we need to recompute prev and next. It is unlikely
1073
        // to be helpful to try to use a different level as we redo the
1074
        // search, because it should be unlikely that lots of nodes have
1075
        // been inserted between prev[i] and next[i]. No point in using
1076
        // next[i] as the after hint, because we know it is stale.
1077
0
        FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
1078
0
                                  &splice->prev_[i], &splice->next_[i]);
1079
1080
        // Since we've narrowed the bracket for level i, we might have
1081
        // violated the Splice constraint between i and i-1.  Make sure
1082
        // we recompute the whole thing next time.
1083
0
        if (i > 0) {
1084
0
          splice_is_valid = false;
1085
0
        }
1086
0
      }
1087
0
    }
1088
29.0k
  } else {
1089
67.6k
    for (int i = 0; i < height; ++i) {
1090
38.5k
      if (i >= recompute_height &&
1091
225
          splice->prev_[i]->Next(i) != splice->next_[i]) {
1092
0
        FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
1093
0
                                  &splice->prev_[i], &splice->next_[i]);
1094
0
      }
1095
      // Checking for duplicate keys on the level 0 is sufficient
1096
38.5k
      if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
1097
38.5k
                   compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
1098
        // duplicate key
1099
0
        return false;
1100
0
      }
1101
38.5k
      if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
1102
38.5k
                   compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
1103
        // duplicate key
1104
0
        return false;
1105
0
      }
1106
38.5k
      assert(splice->next_[i] == nullptr ||
1107
38.5k
             compare_(x->Key(), splice->next_[i]->Key()) < 0);
1108
38.5k
      assert(splice->prev_[i] == head_ ||
1109
38.5k
             compare_(splice->prev_[i]->Key(), x->Key()) < 0);
1110
38.5k
      assert(splice->prev_[i]->Next(i) == splice->next_[i]);
1111
38.5k
      x->NoBarrier_SetNext(i, splice->next_[i]);
1112
38.5k
      splice->prev_[i]->SetNext(i, x);
1113
38.5k
    }
1114
29.0k
  }
1115
29.0k
  if (splice_is_valid) {
1116
67.6k
    for (int i = 0; i < height; ++i) {
1117
38.5k
      splice->prev_[i] = x;
1118
38.5k
    }
1119
29.0k
    assert(splice->prev_[splice->height_] == head_);
1120
29.0k
    assert(splice->next_[splice->height_] == nullptr);
1121
74.1k
    for (int i = 0; i < splice->height_; ++i) {
1122
45.1k
      assert(splice->next_[i] == nullptr ||
1123
45.1k
             compare_(key, splice->next_[i]->Key()) < 0);
1124
45.1k
      assert(splice->prev_[i] == head_ ||
1125
45.1k
             compare_(splice->prev_[i]->Key(), key) <= 0);
1126
45.1k
      assert(splice->prev_[i + 1] == splice->prev_[i] ||
1127
45.1k
             splice->prev_[i + 1] == head_ ||
1128
45.1k
             compare_(splice->prev_[i + 1]->Key(), splice->prev_[i]->Key()) <
1129
45.1k
                 0);
1130
45.1k
      assert(splice->next_[i + 1] == splice->next_[i] ||
1131
45.1k
             splice->next_[i + 1] == nullptr ||
1132
45.1k
             compare_(splice->next_[i]->Key(), splice->next_[i + 1]->Key()) <
1133
45.1k
                 0);
1134
45.1k
    }
1135
29.0k
  } else {
1136
0
    splice->height_ = 0;
1137
0
  }
1138
29.0k
  return true;
1139
29.0k
}
bool rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Insert<false>(char const*, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Splice*, bool)
Line
Count
Source
944
29.0k
                                        bool allow_partial_splice_fix) {
945
29.0k
  Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1;
946
29.0k
  const DecodedKey key_decoded = compare_.decode_key(key);
947
29.0k
  int height = x->UnstashHeight();
948
29.0k
  assert(height >= 1 && height <= kMaxHeight_);
949
950
29.0k
  int max_height = max_height_.LoadRelaxed();
951
29.0k
  while (height > max_height) {
952
6.38k
    if (max_height_.CasWeakRelaxed(max_height, height)) {
953
      // successfully updated it
954
6.38k
      max_height = height;
955
6.38k
      break;
956
6.38k
    }
957
    // else retry, possibly exiting the loop because somebody else
958
    // increased it
959
6.38k
  }
960
29.0k
  assert(max_height <= kMaxPossibleHeight);
961
962
29.0k
  int recompute_height = 0;
963
29.0k
  if (splice->height_ < max_height) {
964
    // Either splice has never been used or max_height has grown since
965
    // last use.  We could potentially fix it in the latter case, but
966
    // that is tricky.
967
23.6k
    splice->prev_[max_height] = head_;
968
23.6k
    splice->next_[max_height] = nullptr;
969
23.6k
    splice->height_ = max_height;
970
23.6k
    recompute_height = max_height;
971
23.6k
  } else {
972
    // Splice is a valid proper-height splice that brackets some
973
    // key, but does it bracket this one?  We need to validate it and
974
    // recompute a portion of the splice (levels 0..recompute_height-1)
975
    // that is a superset of all levels that don't bracket the new key.
976
    // Several choices are reasonable, because we have to balance the work
977
    // saved against the extra comparisons required to validate the Splice.
978
    //
979
    // One strategy is just to recompute all of orig_splice_height if the
980
    // bottom level isn't bracketing.  This pessimistically assumes that
981
    // we will either get a perfect Splice hit (increasing sequential
982
    // inserts) or have no locality.
983
    //
984
    // Another strategy is to walk up the Splice's levels until we find
985
    // a level that brackets the key.  This strategy lets the Splice
986
    // hint help for other cases: it turns insertion from O(log N) into
987
    // O(log D), where D is the number of nodes in between the key that
988
    // produced the Splice and the current insert (insertion is aided
989
    // whether the new key is before or after the splice).  If you have
990
    // a way of using a prefix of the key to map directly to the closest
991
    // Splice out of O(sqrt(N)) Splices and we make it so that splices
992
    // can also be used as hints during read, then we end up with Oshman's
993
    // and Shavit's SkipTrie, which has O(log log N) lookup and insertion
994
    // (compare to O(log N) for skip list).
995
    //
996
    // We control the pessimistic strategy with allow_partial_splice_fix.
997
    // A good strategy is probably to be pessimistic for seq_splice_,
998
    // optimistic if the caller actually went to the work of providing
999
    // a Splice.
1000
10.6k
    while (recompute_height < max_height) {
1001
5.41k
      if (splice->prev_[recompute_height]->Next(recompute_height) !=
1002
5.41k
          splice->next_[recompute_height]) {
1003
        // splice isn't tight at this level, there must have been some inserts
1004
        // to this
1005
        // location that didn't update the splice.  We might only be a little
1006
        // stale, but if
1007
        // the splice is very stale it would be O(N) to fix it.  We haven't used
1008
        // up any of
1009
        // our budget of comparisons, so always move up even if we are
1010
        // pessimistic about
1011
        // our chances of success.
1012
0
        ++recompute_height;
1013
5.41k
      } else if (splice->prev_[recompute_height] != head_ &&
1014
5.41k
                 !KeyIsAfterNode(key_decoded,
1015
5.41k
                                 splice->prev_[recompute_height])) {
1016
        // key is from before splice
1017
4.64k
        if (allow_partial_splice_fix) {
1018
          // skip all levels with the same node without more comparisons
1019
0
          Node* bad = splice->prev_[recompute_height];
1020
0
          while (splice->prev_[recompute_height] == bad) {
1021
0
            ++recompute_height;
1022
0
          }
1023
4.64k
        } else {
1024
          // we're pessimistic, recompute everything
1025
4.64k
          recompute_height = max_height;
1026
4.64k
        }
1027
4.64k
      } else if (KeyIsAfterNode(key_decoded, splice->next_[recompute_height])) {
1028
        // key is from after splice
1029
564
        if (allow_partial_splice_fix) {
1030
0
          Node* bad = splice->next_[recompute_height];
1031
0
          while (splice->next_[recompute_height] == bad) {
1032
0
            ++recompute_height;
1033
0
          }
1034
564
        } else {
1035
564
          recompute_height = max_height;
1036
564
        }
1037
564
      } else {
1038
        // this level brackets the key, we won!
1039
213
        break;
1040
213
      }
1041
5.41k
    }
1042
5.41k
  }
1043
29.0k
  assert(recompute_height <= max_height);
1044
29.0k
  if (recompute_height > 0) {
1045
28.8k
    RecomputeSpliceLevels(key_decoded, splice, recompute_height);
1046
28.8k
  }
1047
1048
29.0k
  bool splice_is_valid = true;
1049
29.0k
  if (UseCAS) {
1050
0
    for (int i = 0; i < height; ++i) {
1051
0
      while (true) {
1052
        // Checking for duplicate keys on the level 0 is sufficient
1053
0
        if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
1054
0
                     compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
1055
          // duplicate key
1056
0
          return false;
1057
0
        }
1058
0
        if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
1059
0
                     compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
1060
          // duplicate key
1061
0
          return false;
1062
0
        }
1063
0
        assert(splice->next_[i] == nullptr ||
1064
0
               compare_(x->Key(), splice->next_[i]->Key()) < 0);
1065
0
        assert(splice->prev_[i] == head_ ||
1066
0
               compare_(splice->prev_[i]->Key(), x->Key()) < 0);
1067
0
        x->NoBarrier_SetNext(i, splice->next_[i]);
1068
0
        if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) {
1069
          // success
1070
0
          break;
1071
0
        }
1072
        // CAS failed, we need to recompute prev and next. It is unlikely
1073
        // to be helpful to try to use a different level as we redo the
1074
        // search, because it should be unlikely that lots of nodes have
1075
        // been inserted between prev[i] and next[i]. No point in using
1076
        // next[i] as the after hint, because we know it is stale.
1077
0
        FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
1078
0
                                  &splice->prev_[i], &splice->next_[i]);
1079
1080
        // Since we've narrowed the bracket for level i, we might have
1081
        // violated the Splice constraint between i and i-1.  Make sure
1082
        // we recompute the whole thing next time.
1083
0
        if (i > 0) {
1084
0
          splice_is_valid = false;
1085
0
        }
1086
0
      }
1087
0
    }
1088
29.0k
  } else {
1089
67.6k
    for (int i = 0; i < height; ++i) {
1090
38.5k
      if (i >= recompute_height &&
1091
225
          splice->prev_[i]->Next(i) != splice->next_[i]) {
1092
0
        FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
1093
0
                                  &splice->prev_[i], &splice->next_[i]);
1094
0
      }
1095
      // Checking for duplicate keys on the level 0 is sufficient
1096
38.5k
      if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
1097
38.5k
                   compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
1098
        // duplicate key
1099
0
        return false;
1100
0
      }
1101
38.5k
      if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
1102
38.5k
                   compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
1103
        // duplicate key
1104
0
        return false;
1105
0
      }
1106
38.5k
      assert(splice->next_[i] == nullptr ||
1107
38.5k
             compare_(x->Key(), splice->next_[i]->Key()) < 0);
1108
38.5k
      assert(splice->prev_[i] == head_ ||
1109
38.5k
             compare_(splice->prev_[i]->Key(), x->Key()) < 0);
1110
38.5k
      assert(splice->prev_[i]->Next(i) == splice->next_[i]);
1111
38.5k
      x->NoBarrier_SetNext(i, splice->next_[i]);
1112
38.5k
      splice->prev_[i]->SetNext(i, x);
1113
38.5k
    }
1114
29.0k
  }
1115
29.0k
  if (splice_is_valid) {
1116
67.6k
    for (int i = 0; i < height; ++i) {
1117
38.5k
      splice->prev_[i] = x;
1118
38.5k
    }
1119
29.0k
    assert(splice->prev_[splice->height_] == head_);
1120
29.0k
    assert(splice->next_[splice->height_] == nullptr);
1121
74.1k
    for (int i = 0; i < splice->height_; ++i) {
1122
45.1k
      assert(splice->next_[i] == nullptr ||
1123
45.1k
             compare_(key, splice->next_[i]->Key()) < 0);
1124
45.1k
      assert(splice->prev_[i] == head_ ||
1125
45.1k
             compare_(splice->prev_[i]->Key(), key) <= 0);
1126
45.1k
      assert(splice->prev_[i + 1] == splice->prev_[i] ||
1127
45.1k
             splice->prev_[i + 1] == head_ ||
1128
45.1k
             compare_(splice->prev_[i + 1]->Key(), splice->prev_[i]->Key()) <
1129
45.1k
                 0);
1130
45.1k
      assert(splice->next_[i + 1] == splice->next_[i] ||
1131
45.1k
             splice->next_[i + 1] == nullptr ||
1132
45.1k
             compare_(splice->next_[i]->Key(), splice->next_[i + 1]->Key()) <
1133
45.1k
                 0);
1134
45.1k
    }
1135
29.0k
  } else {
1136
0
    splice->height_ = 0;
1137
0
  }
1138
29.0k
  return true;
1139
29.0k
}
Unexecuted instantiation: bool rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Insert<true>(char const*, rocksdb::InlineSkipList<rocksdb::MemTableRep::KeyComparator const&>::Splice*, bool)
1140
1141
template <class Comparator>
1142
0
bool InlineSkipList<Comparator>::Contains(const char* key) const {
1143
0
  Node* x = nullptr;
1144
0
  auto status = FindGreaterOrEqual(key, &x, false, false, nullptr);
1145
0
  assert(status.ok());
1146
0
  if (x != nullptr && Equal(key, x->Key())) {
1147
0
    return true;
1148
0
  } else {
1149
0
    return false;
1150
0
  }
1151
0
}
1152
1153
template <class Comparator>
1154
void InlineSkipList<Comparator>::TEST_Validate() const {
1155
  // Interate over all levels at the same time, and verify nodes appear in
1156
  // the right order, and nodes appear in upper level also appear in lower
1157
  // levels.
1158
  Node* nodes[kMaxPossibleHeight];
1159
  int max_height = GetMaxHeight();
1160
  assert(max_height > 0);
1161
  for (int i = 0; i < max_height; i++) {
1162
    nodes[i] = head_;
1163
  }
1164
  while (nodes[0] != nullptr) {
1165
    Node* l0_next = nodes[0]->Next(0);
1166
    if (l0_next == nullptr) {
1167
      break;
1168
    }
1169
    assert(nodes[0] == head_ || compare_(nodes[0]->Key(), l0_next->Key()) < 0);
1170
    nodes[0] = l0_next;
1171
1172
    int i = 1;
1173
    while (i < max_height) {
1174
      Node* next = nodes[i]->Next(i);
1175
      if (next == nullptr) {
1176
        break;
1177
      }
1178
      auto cmp = compare_(nodes[0]->Key(), next->Key());
1179
      assert(cmp <= 0);
1180
      if (cmp == 0) {
1181
        assert(next == nodes[0]);
1182
        nodes[i] = next;
1183
      } else {
1184
        break;
1185
      }
1186
      i++;
1187
    }
1188
  }
1189
  for (int i = 1; i < max_height; i++) {
1190
    assert(nodes[i] != nullptr && nodes[i]->Next(i) == nullptr);
1191
  }
1192
}
1193
1194
template <class Comparator>
1195
Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next,
1196
0
                                              bool allow_data_in_errors) {
1197
0
  std::string msg = "Out-of-order keys found in skiplist.";
1198
0
  if (allow_data_in_errors) {
1199
0
    msg.append(" prev key: " + Slice(prev->Key()).ToString(true));
1200
0
    msg.append(" next key: " + Slice(next->Key()).ToString(true));
1201
0
  }
1202
0
  return Status::Corruption(msg);
1203
0
}
1204
}  // namespace ROCKSDB_NAMESPACE