/src/rocksdb/memtable/skiplistrep.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | #include <random> |
7 | | |
8 | | #include "db/memtable.h" |
9 | | #include "memory/arena.h" |
10 | | #include "memtable/inlineskiplist.h" |
11 | | #include "rocksdb/memtablerep.h" |
12 | | #include "rocksdb/utilities/options_type.h" |
13 | | #include "util/string_util.h" |
14 | | |
15 | | namespace ROCKSDB_NAMESPACE { |
16 | | namespace { |
// MemTableRep backed by an InlineSkipList. Keys are allocated from the
// memtable allocator via Allocate() and inserted as opaque length-prefixed
// buffers; ordering is delegated entirely to the supplied KeyComparator.
// Supports exclusive, hinted, and concurrent insert variants by forwarding
// to the corresponding InlineSkipList operations.
class SkipListRep : public MemTableRep {
  InlineSkipList<const MemTableRep::KeyComparator&> skip_list_;
  const MemTableRep::KeyComparator& cmp_;
  // Prefix extractor; only consulted by LookaheadIterator::Next() to decide
  // whether prev_ may advance. May be nullptr.
  const SliceTransform* transform_;
  // Number of entries LookaheadIterator::Seek() scans linearly from the
  // previously visited position before falling back to an ordinary
  // O(log n) skip-list seek. A value of 0 selects the plain Iterator.
  const size_t lookahead_;

  friend class LookaheadIterator;

 public:
  explicit SkipListRep(const MemTableRep::KeyComparator& compare,
                       Allocator* allocator, const SliceTransform* transform,
                       const size_t lookahead)
      : MemTableRep(allocator),
        skip_list_(compare, allocator),
        cmp_(compare),
        transform_(transform),
        lookahead_(lookahead) {}

  // Allocates a buffer of `len` bytes from the skip list's allocator.
  // The returned handle doubles as the raw buffer pointer (*buf).
  KeyHandle Allocate(const size_t len, char** buf) override {
    *buf = skip_list_.AllocateKey(len);
    return static_cast<KeyHandle>(*buf);
  }

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
  void Insert(KeyHandle handle) override {
    skip_list_.Insert(static_cast<char*>(handle));
  }

  // Like Insert(), but returns whether the key was actually inserted
  // (false if an equal key was already present).
  bool InsertKey(KeyHandle handle) override {
    return skip_list_.Insert(static_cast<char*>(handle));
  }

  // Insert with a caller-maintained position hint; speeds up insertion of
  // nearly-sorted key sequences. Not safe for concurrent writers.
  void InsertWithHint(KeyHandle handle, void** hint) override {
    skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
  }

  bool InsertKeyWithHint(KeyHandle handle, void** hint) override {
    return skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
  }

  // Hinted insert that is safe alongside other concurrent inserters.
  void InsertWithHintConcurrently(KeyHandle handle, void** hint) override {
    skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle), hint);
  }

  bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override {
    return skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle),
                                                 hint);
  }

  // Insert that may run concurrently with other inserts and readers.
  void InsertConcurrently(KeyHandle handle) override {
    skip_list_.InsertConcurrently(static_cast<char*>(handle));
  }

  bool InsertKeyConcurrently(KeyHandle handle) override {
    return skip_list_.InsertConcurrently(static_cast<char*>(handle));
  }

  // Returns true iff an entry that compares equal to key is in the list.
  bool Contains(const char* key) const override {
    return skip_list_.Contains(key);
  }

  size_t ApproximateMemoryUsage() override {
    // All memory is allocated through allocator; nothing to report here
    return 0;
  }

  // Positions an iterator at the first entry >= k and invokes callback_func
  // on successive entries until it returns false or the list is exhausted.
  void Get(const LookupKey& k, void* callback_args,
           bool (*callback_func)(void* arg, const char* entry)) override {
    SkipListRep::Iterator iter(&skip_list_);
    Slice dummy_slice;
    for (iter.Seek(dummy_slice, k.memtable_key().data());
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
  }

  // Same traversal as Get(), but each step goes through the *AndValidate
  // iterator variants so corruption (e.g. out-of-order keys) surfaces as a
  // non-OK Status instead of silently wrong results. Iteration stops on the
  // first non-OK status or when callback_func returns false.
  Status GetAndValidate(const LookupKey& k, void* callback_args,
                        bool (*callback_func)(void* arg, const char* entry),
                        bool allow_data_in_errors, bool detect_key_out_of_order,
                        const std::function<Status(const char*, bool)>&
                            key_validation_callback) override {
    SkipListRep::Iterator iter(&skip_list_);
    Slice dummy_slice;
    Status status = iter.SeekAndValidate(
        dummy_slice, k.memtable_key().data(), allow_data_in_errors,
        detect_key_out_of_order, key_validation_callback);
    for (; iter.Valid() && status.ok() &&
           callback_func(callback_args, iter.key());
         status = iter.NextAndValidate(allow_data_in_errors)) {
    }
    return status;
  }

  uint64_t ApproximateNumEntries(const Slice& start_ikey,
                                 const Slice& end_ikey) override {
    return skip_list_.ApproximateNumEntries(start_ikey, end_ikey);
  }

  // Fills `entries` with roughly target_sample_size distinct entry pointers
  // chosen uniformly at random from the list's num_entries entries.
  void UniqueRandomSample(const uint64_t num_entries,
                          const uint64_t target_sample_size,
                          std::unordered_set<const char*>* entries) override {
    entries->clear();
    // Avoid divide-by-0.
    assert(target_sample_size > 0);
    assert(num_entries > 0);
    // NOTE: the size of entries is not enforced to be exactly
    // target_sample_size at the end of this function, it might be slightly
    // greater or smaller.
    SkipListRep::Iterator iter(&skip_list_);
    // There are two methods to create the subset of samples (size m)
    // from the table containing N elements:
    // 1-Iterate linearly through the N memtable entries. For each entry i,
    //   add it to the sample set with a probability
    //   (target_sample_size - entries.size() ) / (N-i).
    //
    // 2-Pick m random elements without repetition.
    // We pick Option 2 when m<sqrt(N) and
    // Option 1 when m > sqrt(N).
    if (target_sample_size >
        static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
      Random* rnd = Random::GetTLSInstance();
      iter.SeekToFirst();
      uint64_t counter = 0, num_samples_left = target_sample_size;
      for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
        // Add entry to sample set with probability
        // num_samples_left/(num_entries - counter).
        if (rnd->Next() % (num_entries - counter) < num_samples_left) {
          entries->insert(iter.key());
          num_samples_left--;
        }
      }
    } else {
      // Option 2: pick m random elements with no duplicates.
      // If Option 2 is picked, then target_sample_size<sqrt(N)
      // Using a set spares the need to check for duplicates.
      for (uint64_t i = 0; i < target_sample_size; i++) {
        // We give it 5 attempts to find a non-duplicate
        // With 5 attempts, the chances of returning `entries` set
        // of size target_sample_size is:
        // PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
        // which is monotonically increasing with N in the worse case
        // of target_sample_size=sqrt(N), and is always >99.9% for N>4.
        // At worst, for the final pick , when m=sqrt(N) there is
        // a probability of p= 1/sqrt(N) chances to find a duplicate.
        for (uint64_t j = 0; j < 5; j++) {
          iter.RandomSeek();
          // unordered_set::insert returns pair<iterator, bool>.
          // The second element is true if an insert successfully happened.
          // If element is already in the set, this bool will be false, and
          // true otherwise.
          if ((entries->insert(iter.key())).second) {
            break;
          }
        }
      }
    }
  }

  ~SkipListRep() override = default;

  // Iteration over the contents of a skip list
  class Iterator : public MemTableRep::Iterator {
    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;

   public:
    // Initialize an iterator over the specified list.
    // The returned iterator is not valid.
    explicit Iterator(
        const InlineSkipList<const MemTableRep::KeyComparator&>* list)
        : iter_(list) {}

    ~Iterator() override = default;

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return iter_.Valid(); }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target.
    // If memtable_key is non-null it is used directly (it is already in the
    // skip list's encoded form); otherwise user_key is encoded into tmp_.
    void Seek(const Slice& user_key, const char* memtable_key) override {
      if (memtable_key != nullptr) {
        iter_.Seek(memtable_key);
      } else {
        iter_.Seek(EncodeKey(&tmp_, user_key));
      }
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& user_key, const char* memtable_key) override {
      if (memtable_key != nullptr) {
        iter_.SeekForPrev(memtable_key);
      } else {
        iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
      }
    }

    // Position at a uniformly random entry; used by UniqueRandomSample().
    void RandomSeek() override { iter_.RandomSeek(); }

    // Position at the first entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToFirst() override { iter_.SeekToFirst(); }

    // Position at the last entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToLast() override { iter_.SeekToLast(); }

    // Validating counterparts of Next()/Seek()/Prev(); they return a non-OK
    // Status on detected corruption instead of asserting.
    Status NextAndValidate(bool allow_data_in_errors) override {
      assert(Valid());
      return iter_.NextAndValidate(allow_data_in_errors);
    }

    Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
                           bool allow_data_in_errors,
                           bool detect_key_out_of_order,
                           const std::function<Status(const char*, bool)>&
                               key_validation_callback) override {
      if (memtable_key != nullptr) {
        return iter_.SeekAndValidate(memtable_key, allow_data_in_errors,
                                     detect_key_out_of_order,
                                     key_validation_callback);
      } else {
        return iter_.SeekAndValidate(
            EncodeKey(&tmp_, user_key), allow_data_in_errors,
            detect_key_out_of_order, key_validation_callback);
      }
    }

    Status PrevAndValidate(bool allow_data_in_error) override {
      assert(Valid());
      return iter_.PrevAndValidate(allow_data_in_error);
    }

   protected:
    std::string tmp_;  // For passing to EncodeKey
  };

  // Iterator over the contents of a skip list which also keeps track of the
  // previously visited node. In Seek(), it examines a few nodes after it
  // first, falling back to O(log n) search from the head of the list only if
  // the target key hasn't been found.
  class LookaheadIterator : public MemTableRep::Iterator {
   public:
    explicit LookaheadIterator(const SkipListRep& rep)
        : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}

    ~LookaheadIterator() override = default;

    bool Valid() const override { return iter_.Valid(); }

    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances iter_, dragging prev_ behind it. prev_ is only allowed to
    // advance when doing so keeps it useful as a Seek() starting point:
    // it stays put while iter_ walks versions of the same user key, and
    // (when a prefix extractor is configured) while iter_ remains within
    // the same prefix.
    void Next() override {
      assert(Valid());

      bool advance_prev = true;
      if (prev_.Valid()) {
        auto k1 = rep_.UserKey(prev_.key());
        auto k2 = rep_.UserKey(iter_.key());

        if (k1.compare(k2) == 0) {
          // same user key, don't move prev_
          advance_prev = false;
        } else if (rep_.transform_) {
          // only advance prev_ if it has the same prefix as iter_
          auto t1 = rep_.transform_->Transform(k1);
          auto t2 = rep_.transform_->Transform(k2);
          advance_prev = t1.compare(t2) == 0;
        }
      }

      if (advance_prev) {
        prev_ = iter_;
      }
      iter_.Next();
    }

    void Prev() override {
      assert(Valid());
      iter_.Prev();
      prev_ = iter_;
    }

    void Seek(const Slice& internal_key, const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);

      if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
        // prev_.key() is smaller or equal to our target key; do a quick
        // linear search (at most lookahead_ steps) starting from prev_
        iter_ = prev_;

        size_t cur = 0;
        while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
          if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
            return;
          }
          Next();
        }
      }

      // Lookahead window exhausted (or prev_ unusable): fall back to a
      // regular skip-list seek from the head.
      iter_.Seek(encoded_key);
      prev_ = iter_;
    }

    void SeekForPrev(const Slice& internal_key,
                     const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.SeekForPrev(encoded_key);
      prev_ = iter_;
    }

    void SeekToFirst() override {
      iter_.SeekToFirst();
      prev_ = iter_;
    }

    void SeekToLast() override {
      iter_.SeekToLast();
      prev_ = iter_;
    }

   protected:
    std::string tmp_;  // For passing to EncodeKey

   private:
    const SkipListRep& rep_;
    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator prev_;
  };

  // Returns a LookaheadIterator when lookahead_ > 0, otherwise a plain
  // Iterator. Placement-new into `arena` when one is supplied (the caller
  // must then destroy the iterator without operator delete), else heap.
  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
    if (lookahead_ > 0) {
      void* mem =
          arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
                : operator new(sizeof(SkipListRep::LookaheadIterator));
      return new (mem) SkipListRep::LookaheadIterator(*this);
    } else {
      void* mem = arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
                        : operator new(sizeof(SkipListRep::Iterator));
      return new (mem) SkipListRep::Iterator(&skip_list_);
    }
  }
};
387 | | } // namespace |
388 | | |
// Option schema for SkipListFactory. "lookahead" maps directly onto the
// factory's lookahead_ member (offset 0 of the registered struct). It is
// kDontSerialize because the lookahead value is encoded into the factory
// id string by GetId() instead of the serialized options.
static std::unordered_map<std::string, OptionTypeInfo> skiplist_factory_info = {
    {"lookahead",
     {0, OptionType::kSizeT, OptionVerificationType::kNormal,
      OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}},
};
394 | | |
// Constructs the factory and registers lookahead_ with the options
// framework so it can be set/compared via the Customizable machinery.
SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) {
  RegisterOptions("SkipListFactoryOptions", &lookahead_,
                  &skiplist_factory_info);
}
399 | | |
400 | 220k | std::string SkipListFactory::GetId() const { |
401 | 220k | std::string id = Name(); |
402 | 220k | if (lookahead_ > 0) { |
403 | 0 | id.append(":").append(std::to_string(lookahead_)); |
404 | 0 | } |
405 | 220k | return id; |
406 | 220k | } |
407 | | |
// Creates a SkipListRep for a new memtable. Ownership of the returned
// object passes to the caller; `allocator` and `transform` must outlive it.
MemTableRep* SkipListFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, Logger* /*logger*/) {
  return new SkipListRep(compare, allocator, transform, lookahead_);
}
413 | | |
414 | | } // namespace ROCKSDB_NAMESPACE |