/src/rocksdb/memtable/hash_linklist_rep.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | |
7 | | #include <algorithm> |
8 | | #include <atomic> |
9 | | |
10 | | #include "db/memtable.h" |
11 | | #include "memory/arena.h" |
12 | | #include "memtable/skiplist.h" |
13 | | #include "monitoring/histogram.h" |
14 | | #include "port/port.h" |
15 | | #include "rocksdb/memtablerep.h" |
16 | | #include "rocksdb/slice.h" |
17 | | #include "rocksdb/slice_transform.h" |
18 | | #include "rocksdb/utilities/options_type.h" |
19 | | #include "util/hash.h" |
20 | | |
21 | | namespace ROCKSDB_NAMESPACE { |
22 | | namespace { |
23 | | |
24 | | using Key = const char*; |
25 | | using MemtableSkipList = SkipList<Key, const MemTableRep::KeyComparator&>; |
26 | | using Pointer = std::atomic<void*>; |
27 | | |
28 | | // A data structure used as the header of a link list of a hash bucket. |
29 | | struct BucketHeader { |
30 | | Pointer next; |
31 | | std::atomic<uint32_t> num_entries; |
32 | | |
33 | | explicit BucketHeader(void* n, uint32_t count) |
34 | 0 | : next(n), num_entries(count) {} |
35 | | |
36 | 0 | bool IsSkipListBucket() { |
37 | 0 | return next.load(std::memory_order_relaxed) == this; |
38 | 0 | } |
39 | | |
40 | 0 | uint32_t GetNumEntries() const { |
41 | 0 | return num_entries.load(std::memory_order_relaxed); |
42 | 0 | } |
43 | | |
44 | | // REQUIRES: called from single-threaded Insert() |
45 | 0 | void IncNumEntries() { |
46 | | // Only one thread can write at a time, so there is no need for an
47 | | // atomic increment. Update it with a relaxed load and store.
48 | 0 | num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed); |
49 | 0 | } |
50 | | }; |
51 | | |
52 | | // A data structure used as the header of a skip list of a hash bucket. |
53 | | struct SkipListBucketHeader { |
54 | | BucketHeader Counting_header; |
55 | | MemtableSkipList skip_list; |
56 | | |
57 | | explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp, |
58 | | Allocator* allocator, uint32_t count) |
59 | 0 | : Counting_header(this, // Pointing to itself to indicate header type. |
60 | 0 | count), |
61 | 0 | skip_list(cmp, allocator) {} |
62 | | }; |
63 | | |
64 | | struct Node { |
65 | | // Accessors/mutators for links. Wrapped in methods so we can |
66 | | // add the appropriate barriers as necessary. |
67 | 0 | Node* Next() { |
68 | | // Use an 'acquire load' so that we observe a fully initialized |
69 | | // version of the returned Node. |
70 | 0 | return next_.load(std::memory_order_acquire); |
71 | 0 | } |
72 | 0 | void SetNext(Node* x) { |
73 | | // Use a 'release store' so that anybody who reads through this |
74 | | // pointer observes a fully initialized version of the inserted node. |
75 | 0 | next_.store(x, std::memory_order_release); |
76 | 0 | } |
77 | | // No-barrier variants that can be safely used in a few locations. |
78 | 0 | Node* NoBarrier_Next() { return next_.load(std::memory_order_relaxed); } |
79 | | |
80 | 0 | void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); } |
81 | | |
82 | | // Needed for the placement new below, which is fine
83 | 0 | Node() = default; |
84 | | |
85 | | private: |
86 | | std::atomic<Node*> next_; |
87 | | |
88 | | // Prohibit copying because of the variable-length key[] member below
89 | | Node(const Node&) = delete; |
90 | | Node& operator=(const Node&) = delete; |
91 | | |
92 | | public: |
93 | | char key[1]; |
94 | | }; |
95 | | |
96 | | // Memory structure of the memtable:
97 | | // It is a hash table; each bucket points to one entry, a linked list, or a
98 | | // skip list. To track the total number of records in a bucket (used to
99 | | // decide whether to switch to a skip list), a header is added to record
100 | | // the number of entries in the bucket.
101 | | // |
102 | | // |
103 | | // +-----> NULL Case 1. Empty bucket |
104 | | // | |
105 | | // | |
106 | | // | +---> +-------+ |
107 | | // | | | Next +--> NULL |
108 | | // | | +-------+ |
109 | | // +-----+ | | | | Case 2. One Entry in bucket. |
110 | | // | +-+ | | Data | next pointer points to |
111 | | // +-----+ | | | NULL. All other cases |
112 | | // | | | | | next pointer is not NULL. |
113 | | // +-----+ | +-------+ |
114 | | // | +---+ |
115 | | // +-----+ +-> +-------+ +> +-------+ +-> +-------+ |
116 | | // | | | | Next +--+ | Next +--+ | Next +-->NULL |
117 | | // +-----+ | +-------+ +-------+ +-------+ |
118 | | // | +-----+ | Count | | | | | |
119 | | // +-----+ +-------+ | Data | | Data | |
120 | | // | | | | | | |
121 | | // +-----+ Case 3. | | | | |
122 | | // | | A header +-------+ +-------+ |
123 | | // +-----+ points to |
124 | | // | | a linked list. Count indicates total number |
125 | | // +-----+ of rows in this bucket. |
126 | | // | | |
127 | | // +-----+ +-> +-------+ <--+ |
128 | | // | | | | Next +----+ |
129 | | // +-----+ | +-------+ Case 4. A header points to a skip |
130 | | // | +----+ | Count | list and next pointer points to |
131 | | // +-----+ +-------+ itself, to distinguish case 3 or 4. |
132 | | // | | | | Count still is kept to indicates total |
133 | | // +-----+ | Skip +--> of entries in the bucket for debugging |
134 | | // | | | List | Data purpose. |
135 | | // | | | +--> |
136 | | // +-----+ | | |
137 | | // | | +-------+ |
138 | | // +-----+ |
139 | | // |
140 | | // There is no data race when changing cases because:
141 | | // (1) When changing from case 2->3, we create a new bucket header, put the
142 | | // single node there first without changing the original node, and do a
143 | | // release store when changing the bucket pointer. A reader who sees a
144 | | // stale value of the bucket pointer still reads that single node, while
145 | | // a reader who sees the new value reads a fully initialized header
146 | | // because of the release store.
147 | | // (2) When changing case 3->4, a new header is created with the skip list
148 | | // pointing to the data, before doing a release store to change the bucket
149 | | // pointer. The old header and nodes are never changed, so any reader that
150 | | // sees any of those existing pointers is guaranteed to be able to iterate
151 | | // to the end of the linked list.
152 | | // (3) The header's next pointer in case 3 might change, but it is never
153 | | // equal to the header itself, so whether a reader sees a stale or a newer
154 | | // value, it can correctly distinguish case 3 from case 4.
155 | | //
156 | | // The reason we use case 2 is that we want the format to be efficient when
157 | | // bucket utilization is relatively low. Using case 3 for single-entry
158 | | // buckets would waste 12 bytes per entry, which can significantly decrease memory utilization.
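
A minimal stand-alone sketch of how a reader classifies a bucket from the raw pointer stored in buckets_[i], mirroring the four cases above. It assumes the BucketHeader defined earlier in this file; the names BucketKind and ClassifyBucket are hypothetical and not part of the file, and the reload-on-race step performed by GetLinkListFirstNode() is omitted for brevity.

// Illustrative sketch only; BucketKind and ClassifyBucket are hypothetical names.
enum class BucketKind { kEmpty, kSingleNode, kLinkedList, kSkipList };

BucketKind ClassifyBucket(std::atomic<void*>& bucket) {
  void* p = bucket.load(std::memory_order_acquire);
  if (p == nullptr) {
    return BucketKind::kEmpty;  // Case 1: empty bucket.
  }
  // Whatever the bucket points to, its first word is a next pointer: a Node
  // starts with next_, and a BucketHeader starts with next.
  auto* first_next = static_cast<std::atomic<void*>*>(p);
  if (first_next->load(std::memory_order_relaxed) == nullptr) {
    return BucketKind::kSingleNode;  // Case 2: a single Node whose next is NULL.
  }
  auto* header = reinterpret_cast<BucketHeader*>(first_next);
  // Case 4 is marked by the header's next pointer pointing back to itself.
  return header->IsSkipListBucket() ? BucketKind::kSkipList     // Case 4
                                    : BucketKind::kLinkedList;  // Case 3
}
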
159 | | class HashLinkListRep : public MemTableRep { |
160 | | public: |
161 | | HashLinkListRep(const MemTableRep::KeyComparator& compare, |
162 | | Allocator* allocator, const SliceTransform* transform, |
163 | | size_t bucket_size, uint32_t threshold_use_skiplist, |
164 | | size_t huge_page_tlb_size, Logger* logger, |
165 | | int bucket_entries_logging_threshold, |
166 | | bool if_log_bucket_dist_when_flash); |
167 | | |
168 | | KeyHandle Allocate(const size_t len, char** buf) override; |
169 | | |
170 | | void Insert(KeyHandle handle) override; |
171 | | |
172 | | bool Contains(const char* key) const override; |
173 | | |
174 | | size_t ApproximateMemoryUsage() override; |
175 | | |
176 | | void Get(const LookupKey& k, void* callback_args, |
177 | | bool (*callback_func)(void* arg, const char* entry)) override; |
178 | | |
179 | | ~HashLinkListRep() override; |
180 | | |
181 | | MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; |
182 | | |
183 | | MemTableRep::Iterator* GetDynamicPrefixIterator( |
184 | | Arena* arena = nullptr) override; |
185 | | |
186 | | private: |
187 | | friend class DynamicIterator; |
188 | | |
189 | | size_t bucket_size_; |
190 | | |
191 | | // Maps slices (which are transformed user keys) to buckets of keys that
192 | | // share the same transformed prefix.
193 | | Pointer* buckets_; |
194 | | |
195 | | const uint32_t threshold_use_skiplist_; |
196 | | |
197 | | // The user-supplied transform whose domain is the user keys. |
198 | | const SliceTransform* transform_; |
199 | | |
200 | | const MemTableRep::KeyComparator& compare_; |
201 | | |
202 | | Logger* logger_; |
203 | | int bucket_entries_logging_threshold_; |
204 | | bool if_log_bucket_dist_when_flash_; |
205 | | |
206 | | bool LinkListContains(Node* head, const Slice& key) const; |
207 | | |
208 | 0 | bool IsEmptyBucket(Pointer& bucket_pointer) const { |
209 | 0 | return bucket_pointer.load(std::memory_order_acquire) == nullptr; |
210 | 0 | } |
211 | | |
212 | | // Precondition: GetLinkListFirstNode() must have been called first and have
213 | | // returned null, so the bucket must be a skip list bucket.
214 | | SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const; |
215 | | |
216 | | // Returning nullptr indicates it is a skip list bucket. |
217 | | Node* GetLinkListFirstNode(Pointer& bucket_pointer) const; |
218 | | |
219 | 0 | Slice GetPrefix(const Slice& internal_key) const { |
220 | 0 | return transform_->Transform(ExtractUserKey(internal_key)); |
221 | 0 | } |
222 | | |
223 | 0 | size_t GetHash(const Slice& slice) const { |
224 | 0 | return GetSliceRangedNPHash(slice, bucket_size_); |
225 | 0 | } |
226 | | |
227 | 0 | Pointer& GetBucket(size_t i) const { return buckets_[i]; } |
228 | | |
229 | 0 | Pointer& GetBucket(const Slice& slice) const { |
230 | 0 | return GetBucket(GetHash(slice)); |
231 | 0 | } |
232 | | |
233 | 0 | bool Equal(const Slice& a, const Key& b) const { |
234 | 0 | return (compare_(b, a) == 0); |
235 | 0 | } |
236 | | |
237 | 0 | bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } |
238 | | |
239 | 0 | bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { |
240 | | // nullptr n is considered infinite |
241 | 0 | return (n != nullptr) && (compare_(n->key, internal_key) < 0); |
242 | 0 | } |
243 | | |
244 | 0 | bool KeyIsAfterNode(const Key& key, const Node* n) const { |
245 | 0 | // nullptr n is considered infinite |
246 | 0 | return (n != nullptr) && (compare_(n->key, key) < 0); |
247 | 0 | } |
248 | | |
249 | 0 | bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const { |
250 | 0 | // nullptr n is considered infinite |
251 | 0 | return (n != nullptr) && (compare_(n->key, internal_key) <= 0); |
252 | 0 | } |
253 | | |
254 | 0 | bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const { |
255 | 0 | // nullptr n is considered infinite |
256 | 0 | return (n != nullptr) && (compare_(n->key, key) <= 0); |
257 | 0 | } |
258 | | |
259 | | Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; |
260 | | Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const; |
261 | | |
262 | | class FullListIterator : public MemTableRep::Iterator { |
263 | | public: |
264 | | explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) |
265 | 0 | : iter_(list), full_list_(list), allocator_(allocator) {} |
266 | | |
267 | 0 | ~FullListIterator() override = default; |
268 | | |
269 | | // Returns true iff the iterator is positioned at a valid node. |
270 | 0 | bool Valid() const override { return iter_.Valid(); } |
271 | | |
272 | | // Returns the key at the current position. |
273 | | // REQUIRES: Valid() |
274 | 0 | const char* key() const override { |
275 | 0 | assert(Valid()); |
276 | 0 | return iter_.key(); |
277 | 0 | } |
278 | | |
279 | | // Advances to the next position. |
280 | | // REQUIRES: Valid() |
281 | 0 | void Next() override { |
282 | 0 | assert(Valid()); |
283 | 0 | iter_.Next(); |
284 | 0 | } |
285 | | |
286 | | // Advances to the previous position. |
287 | | // REQUIRES: Valid() |
288 | 0 | void Prev() override { |
289 | 0 | assert(Valid()); |
290 | 0 | iter_.Prev(); |
291 | 0 | } |
292 | | |
293 | | // Advance to the first entry with a key >= target |
294 | 0 | void Seek(const Slice& internal_key, const char* memtable_key) override { |
295 | 0 | const char* encoded_key = (memtable_key != nullptr) |
296 | 0 | ? memtable_key |
297 | 0 | : EncodeKey(&tmp_, internal_key); |
298 | 0 | iter_.Seek(encoded_key); |
299 | 0 | } |
300 | | |
301 | | // Retreat to the last entry with a key <= target |
302 | | void SeekForPrev(const Slice& internal_key, |
303 | 0 | const char* memtable_key) override { |
304 | 0 | const char* encoded_key = (memtable_key != nullptr) |
305 | 0 | ? memtable_key |
306 | 0 | : EncodeKey(&tmp_, internal_key); |
307 | 0 | iter_.SeekForPrev(encoded_key); |
308 | 0 | } |
309 | | |
310 | | // Position at the first entry in collection. |
311 | | // Final state of iterator is Valid() iff collection is not empty. |
312 | 0 | void SeekToFirst() override { iter_.SeekToFirst(); } |
313 | | |
314 | | // Position at the last entry in collection. |
315 | | // Final state of iterator is Valid() iff collection is not empty. |
316 | 0 | void SeekToLast() override { iter_.SeekToLast(); } |
317 | | |
318 | | private: |
319 | | MemtableSkipList::Iterator iter_; |
320 | | // Owned here so they are destroyed together with the iterator.
321 | | std::unique_ptr<MemtableSkipList> full_list_; |
322 | | std::unique_ptr<Allocator> allocator_; |
323 | | std::string tmp_; // For passing to EncodeKey |
324 | | }; |
325 | | |
326 | | class LinkListIterator : public MemTableRep::Iterator { |
327 | | public: |
328 | | explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep, |
329 | | Node* head) |
330 | 0 | : hash_link_list_rep_(hash_link_list_rep), |
331 | 0 | head_(head), |
332 | 0 | node_(nullptr) {} |
333 | | |
334 | | ~LinkListIterator() override = default; |
335 | | |
336 | | // Returns true iff the iterator is positioned at a valid node. |
337 | 0 | bool Valid() const override { return node_ != nullptr; } |
338 | | |
339 | | // Returns the key at the current position. |
340 | | // REQUIRES: Valid() |
341 | 0 | const char* key() const override { |
342 | 0 | assert(Valid()); |
343 | 0 | return node_->key; |
344 | 0 | } |
345 | | |
346 | | // Advances to the next position. |
347 | | // REQUIRES: Valid() |
348 | 0 | void Next() override { |
349 | 0 | assert(Valid()); |
350 | 0 | node_ = node_->Next(); |
351 | 0 | } |
352 | | |
353 | | // Advances to the previous position. |
354 | | // REQUIRES: Valid() |
355 | 0 | void Prev() override { |
356 | | // Prefix iterator does not support total order. |
357 | | // We simply set the iterator to invalid state |
358 | 0 | Reset(nullptr); |
359 | 0 | } |
360 | | |
361 | | // Advance to the first entry with a key >= target |
362 | | void Seek(const Slice& internal_key, |
363 | 0 | const char* /*memtable_key*/) override { |
364 | 0 | node_ = |
365 | 0 | hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key); |
366 | 0 | } |
367 | | |
368 | | // Retreat to the last entry with a key <= target |
369 | | void SeekForPrev(const Slice& /*internal_key*/, |
370 | 0 | const char* /*memtable_key*/) override { |
371 | | // Since we do not support Prev(),
372 | | // we simply do not support SeekForPrev() either.
373 | 0 | Reset(nullptr); |
374 | 0 | } |
375 | | |
376 | | // Position at the first entry in collection. |
377 | | // Final state of iterator is Valid() iff collection is not empty. |
378 | 0 | void SeekToFirst() override { |
379 | | // Prefix iterator does not support total order. |
380 | | // We simply set the iterator to invalid state |
381 | 0 | Reset(nullptr); |
382 | 0 | } |
383 | | |
384 | | // Position at the last entry in collection. |
385 | | // Final state of iterator is Valid() iff collection is not empty. |
386 | 0 | void SeekToLast() override { |
387 | | // Prefix iterator does not support total order. |
388 | | // We simply set the iterator to invalid state |
389 | 0 | Reset(nullptr); |
390 | 0 | } |
391 | | |
392 | | protected: |
393 | 0 | void Reset(Node* head) { |
394 | 0 | head_ = head; |
395 | 0 | node_ = nullptr; |
396 | 0 | } |
397 | | |
398 | | private: |
399 | | friend class HashLinkListRep; |
400 | | const HashLinkListRep* const hash_link_list_rep_; |
401 | | Node* head_; |
402 | | Node* node_; |
403 | | |
404 | 0 | virtual void SeekToHead() { node_ = head_; } |
405 | | }; |
406 | | |
407 | | class DynamicIterator : public HashLinkListRep::LinkListIterator { |
408 | | public: |
409 | | explicit DynamicIterator(HashLinkListRep& memtable_rep) |
410 | 0 | : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr), |
411 | 0 | memtable_rep_(memtable_rep) {} |
412 | | |
413 | | // Advance to the first entry with a key >= target |
414 | 0 | void Seek(const Slice& k, const char* memtable_key) override { |
415 | 0 | auto transformed = memtable_rep_.GetPrefix(k); |
416 | 0 | Pointer& bucket = memtable_rep_.GetBucket(transformed); |
417 | |
418 | 0 | if (memtable_rep_.IsEmptyBucket(bucket)) { |
419 | 0 | skip_list_iter_.reset(); |
420 | 0 | Reset(nullptr); |
421 | 0 | } else { |
422 | 0 | Node* first_linked_list_node = |
423 | 0 | memtable_rep_.GetLinkListFirstNode(bucket); |
424 | 0 | if (first_linked_list_node != nullptr) { |
425 | | // The bucket is organized as a linked list |
426 | 0 | skip_list_iter_.reset(); |
427 | 0 | Reset(first_linked_list_node); |
428 | 0 | HashLinkListRep::LinkListIterator::Seek(k, memtable_key); |
429 | |
430 | 0 | } else { |
431 | 0 | SkipListBucketHeader* skip_list_header = |
432 | 0 | memtable_rep_.GetSkipListBucketHeader(bucket); |
433 | 0 | assert(skip_list_header != nullptr); |
434 | | // The bucket is organized as a skip list |
435 | 0 | if (!skip_list_iter_) { |
436 | 0 | skip_list_iter_.reset( |
437 | 0 | new MemtableSkipList::Iterator(&skip_list_header->skip_list)); |
438 | 0 | } else { |
439 | 0 | skip_list_iter_->SetList(&skip_list_header->skip_list); |
440 | 0 | } |
441 | 0 | if (memtable_key != nullptr) { |
442 | 0 | skip_list_iter_->Seek(memtable_key); |
443 | 0 | } else { |
444 | 0 | IterKey encoded_key; |
445 | 0 | encoded_key.EncodeLengthPrefixedKey(k); |
446 | 0 | skip_list_iter_->Seek(encoded_key.GetUserKey().data()); |
447 | 0 | } |
448 | 0 | } |
449 | 0 | } |
450 | 0 | } |
451 | | |
452 | 0 | bool Valid() const override { |
453 | 0 | if (skip_list_iter_) { |
454 | 0 | return skip_list_iter_->Valid(); |
455 | 0 | } |
456 | 0 | return HashLinkListRep::LinkListIterator::Valid(); |
457 | 0 | } |
458 | | |
459 | 0 | const char* key() const override { |
460 | 0 | if (skip_list_iter_) { |
461 | 0 | return skip_list_iter_->key(); |
462 | 0 | } |
463 | 0 | return HashLinkListRep::LinkListIterator::key(); |
464 | 0 | } |
465 | | |
466 | 0 | void Next() override { |
467 | 0 | if (skip_list_iter_) { |
468 | 0 | skip_list_iter_->Next(); |
469 | 0 | } else { |
470 | 0 | HashLinkListRep::LinkListIterator::Next(); |
471 | 0 | } |
472 | 0 | } |
473 | | |
474 | | private: |
475 | | // the underlying memtable |
476 | | const HashLinkListRep& memtable_rep_; |
477 | | std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_; |
478 | | }; |
479 | | |
480 | | class EmptyIterator : public MemTableRep::Iterator { |
481 | | // This is used when there wasn't a bucket. It is cheaper than |
482 | | // instantiating an empty bucket over which to iterate. |
483 | | public: |
484 | | EmptyIterator() = default; |
485 | 0 | bool Valid() const override { return false; } |
486 | 0 | const char* key() const override { |
487 | 0 | assert(false); |
488 | 0 | return nullptr; |
489 | 0 | } |
490 | 0 | void Next() override {} |
491 | 0 | void Prev() override {} |
492 | | void Seek(const Slice& /*user_key*/, |
493 | 0 | const char* /*memtable_key*/) override {} |
494 | | void SeekForPrev(const Slice& /*user_key*/, |
495 | 0 | const char* /*memtable_key*/) override {} |
496 | 0 | void SeekToFirst() override {} |
497 | 0 | void SeekToLast() override {} |
498 | | |
499 | | private: |
500 | | }; |
501 | | }; |
502 | | |
503 | | HashLinkListRep::HashLinkListRep( |
504 | | const MemTableRep::KeyComparator& compare, Allocator* allocator, |
505 | | const SliceTransform* transform, size_t bucket_size, |
506 | | uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger, |
507 | | int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash) |
508 | 0 | : MemTableRep(allocator), |
509 | 0 | bucket_size_(bucket_size), |
510 | | // A threshold to use the skip list doesn't make sense if it is less than
511 | | // 3, so we force a minimum of 3 to simplify the implementation.
512 | 0 | threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)), |
513 | 0 | transform_(transform), |
514 | 0 | compare_(compare), |
515 | 0 | logger_(logger), |
516 | 0 | bucket_entries_logging_threshold_(bucket_entries_logging_threshold), |
517 | 0 | if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { |
518 | 0 | char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, |
519 | 0 | huge_page_tlb_size, logger); |
520 | |
521 | 0 | buckets_ = new (mem) Pointer[bucket_size]; |
522 | |
523 | 0 | for (size_t i = 0; i < bucket_size_; ++i) { |
524 | 0 | buckets_[i].store(nullptr, std::memory_order_relaxed); |
525 | 0 | } |
526 | 0 | } |
527 | | |
528 | | HashLinkListRep::~HashLinkListRep() = default; |
529 | | |
530 | 0 | KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { |
531 | 0 | char* mem = allocator_->AllocateAligned(sizeof(Node) + len); |
532 | 0 | Node* x = new (mem) Node(); |
533 | 0 | *buf = x->key; |
534 | 0 | return static_cast<void*>(x); |
535 | 0 | } |
536 | | |
537 | | SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( |
538 | 0 | Pointer& bucket_pointer) const { |
539 | 0 | Pointer* first_next_pointer = |
540 | 0 | static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire)); |
541 | 0 | assert(first_next_pointer != nullptr); |
542 | 0 | assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr); |
543 | | |
544 | | // Counting header |
545 | 0 | BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); |
546 | 0 | assert(header->IsSkipListBucket()); |
547 | 0 | assert(header->GetNumEntries() > threshold_use_skiplist_); |
548 | 0 | auto* skip_list_bucket_header = |
549 | 0 | reinterpret_cast<SkipListBucketHeader*>(header); |
550 | 0 | assert(skip_list_bucket_header->Counting_header.next.load( |
551 | 0 | std::memory_order_relaxed) == header); |
552 | 0 | return skip_list_bucket_header; |
553 | 0 | } |
554 | | |
555 | 0 | Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const { |
556 | 0 | Pointer* first_next_pointer = |
557 | 0 | static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire)); |
558 | 0 | assert(first_next_pointer != nullptr); |
559 | 0 | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { |
560 | | // Single entry bucket |
561 | 0 | return reinterpret_cast<Node*>(first_next_pointer); |
562 | 0 | } |
563 | | |
564 | | // It is possible that the bucket was modified after we fetched
565 | | // first_next_pointer, so that its next pointer is no longer null. In that
566 | | // case the bucket must have been converted to a counting header, so reload
567 | | // first_next_pointer to make sure we see the update.
568 | 0 | first_next_pointer = |
569 | 0 | static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire)); |
570 | | // Counting header |
571 | 0 | BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); |
572 | 0 | if (!header->IsSkipListBucket()) { |
573 | 0 | assert(header->GetNumEntries() <= threshold_use_skiplist_); |
574 | 0 | return reinterpret_cast<Node*>( |
575 | 0 | header->next.load(std::memory_order_acquire)); |
576 | 0 | } |
577 | 0 | assert(header->GetNumEntries() > threshold_use_skiplist_); |
578 | 0 | return nullptr; |
579 | 0 | } |
580 | | |
581 | 0 | void HashLinkListRep::Insert(KeyHandle handle) { |
582 | 0 | Node* x = static_cast<Node*>(handle); |
583 | 0 | assert(!Contains(x->key)); |
584 | 0 | Slice internal_key = GetLengthPrefixedSlice(x->key); |
585 | 0 | auto transformed = GetPrefix(internal_key); |
586 | 0 | auto& bucket = buckets_[GetHash(transformed)]; |
587 | 0 | Pointer* first_next_pointer = |
588 | 0 | static_cast<Pointer*>(bucket.load(std::memory_order_relaxed)); |
589 | |
590 | 0 | if (first_next_pointer == nullptr) { |
591 | | // Case 1. empty bucket |
592 | | // NoBarrier_SetNext() suffices since we add a barrier when we publish
593 | | // the pointer to "x" via the release store into the bucket below.
594 | 0 | x->NoBarrier_SetNext(nullptr); |
595 | 0 | bucket.store(x, std::memory_order_release); |
596 | 0 | return; |
597 | 0 | } |
598 | | |
599 | 0 | BucketHeader* header = nullptr; |
600 | 0 | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { |
601 | | // Case 2. Only one entry in the bucket.
602 | | // Need to convert it to a counting bucket, i.e. case 3.
603 | 0 | Node* first = reinterpret_cast<Node*>(first_next_pointer); |
604 | | // Need to add a bucket header.
605 | | // We have to convert the bucket to one with a header before inserting the
606 | | // new node. Otherwise, we would have to change the next pointer of first;
607 | | // a reader could then see a non-NULL next pointer and wrongly interpret
608 | | // the node as a bucket header.
609 | 0 | auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader)); |
610 | 0 | header = new (mem) BucketHeader(first, 1); |
611 | 0 | bucket.store(header, std::memory_order_release); |
612 | 0 | } else { |
613 | 0 | header = reinterpret_cast<BucketHeader*>(first_next_pointer); |
614 | 0 | if (header->IsSkipListBucket()) { |
615 | | // Case 4. Bucket is already a skip list |
616 | 0 | assert(header->GetNumEntries() > threshold_use_skiplist_); |
617 | 0 | auto* skip_list_bucket_header = |
618 | 0 | reinterpret_cast<SkipListBucketHeader*>(header); |
619 | | // Only one thread can execute Insert() at a time, so there is no need
620 | | // for an atomic increment.
621 | 0 | skip_list_bucket_header->Counting_header.IncNumEntries(); |
622 | 0 | skip_list_bucket_header->skip_list.Insert(x->key); |
623 | 0 | return; |
624 | 0 | } |
625 | 0 | } |
626 | | |
627 | 0 | if (bucket_entries_logging_threshold_ > 0 && |
628 | 0 | header->GetNumEntries() == |
629 | 0 | static_cast<uint32_t>(bucket_entries_logging_threshold_)) { |
630 | 0 | Info(logger_, |
631 | 0 | "HashLinkedList bucket %" ROCKSDB_PRIszt |
632 | 0 | " has more than %d " |
633 | 0 | "entries. Key to insert: %s", |
634 | 0 | GetHash(transformed), header->GetNumEntries(), |
635 | 0 | GetLengthPrefixedSlice(x->key).ToString(true).c_str()); |
636 | 0 | } |
637 | |
638 | 0 | if (header->GetNumEntries() == threshold_use_skiplist_) { |
639 | | // Case 3. The number of entries has reached the threshold, so convert the
640 | | // bucket to a skip list.
641 | 0 | LinkListIterator bucket_iter( |
642 | 0 | this, reinterpret_cast<Node*>( |
643 | 0 | first_next_pointer->load(std::memory_order_relaxed))); |
644 | 0 | auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader)); |
645 | 0 | SkipListBucketHeader* new_skip_list_header = new (mem) |
646 | 0 | SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1); |
647 | 0 | auto& skip_list = new_skip_list_header->skip_list; |
648 | | |
649 | | // Add all current entries to the skip list |
650 | 0 | for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) { |
651 | 0 | skip_list.Insert(bucket_iter.key()); |
652 | 0 | } |
653 | | |
654 | | // insert the new entry |
655 | 0 | skip_list.Insert(x->key); |
656 | | // Set the bucket |
657 | 0 | bucket.store(new_skip_list_header, std::memory_order_release); |
658 | 0 | } else { |
659 | | // Case 5. Insert into the sorted linked list without changing the
660 | | // header.
661 | 0 | Node* first = |
662 | 0 | reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed)); |
663 | 0 | assert(first != nullptr); |
664 | | // Advance the counter only when the bucket is not being converted to a
665 | | // skip list. This ensures the count of a linked-list header never exceeds
666 | | // threshold_use_skiplist_, which readers rely on to tell the formats apart.
667 | 0 | header->IncNumEntries(); |
668 | |
669 | 0 | Node* cur = first; |
670 | 0 | Node* prev = nullptr; |
671 | 0 | while (true) { |
672 | 0 | if (cur == nullptr) { |
673 | 0 | break; |
674 | 0 | } |
675 | 0 | Node* next = cur->Next(); |
676 | | // Make sure the lists are sorted. |
677 | | // If x points to head_ or next points nullptr, it is trivially satisfied. |
678 | 0 | assert((cur == first) || (next == nullptr) || |
679 | 0 | KeyIsAfterNode(next->key, cur)); |
680 | 0 | if (KeyIsAfterNode(internal_key, cur)) { |
681 | | // Keep searching in this list |
682 | 0 | prev = cur; |
683 | 0 | cur = next; |
684 | 0 | } else { |
685 | 0 | break; |
686 | 0 | } |
687 | 0 | } |
688 | | |
689 | | // Our data structure does not allow duplicate insertion |
690 | 0 | assert(cur == nullptr || !Equal(x->key, cur->key)); |
691 | | |
692 | | // NoBarrier_SetNext() suffices since we add a barrier when we publish the
693 | | // pointer to "x" through prev->SetNext() or the header's release store.
694 | 0 | x->NoBarrier_SetNext(cur); |
695 | |
696 | 0 | if (prev) { |
697 | 0 | prev->SetNext(x); |
698 | 0 | } else { |
699 | 0 | header->next.store(static_cast<void*>(x), std::memory_order_release); |
700 | 0 | } |
701 | 0 | } |
702 | 0 | } |
703 | | |
704 | 0 | bool HashLinkListRep::Contains(const char* key) const { |
705 | 0 | Slice internal_key = GetLengthPrefixedSlice(key); |
706 | |
707 | 0 | auto transformed = GetPrefix(internal_key); |
708 | 0 | Pointer& bucket = GetBucket(transformed); |
709 | 0 | if (IsEmptyBucket(bucket)) { |
710 | 0 | return false; |
711 | 0 | } |
712 | | |
713 | 0 | Node* linked_list_node = GetLinkListFirstNode(bucket); |
714 | 0 | if (linked_list_node != nullptr) { |
715 | 0 | return LinkListContains(linked_list_node, internal_key); |
716 | 0 | } |
717 | | |
718 | 0 | SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket); |
719 | 0 | if (skip_list_header != nullptr) { |
720 | 0 | return skip_list_header->skip_list.Contains(key); |
721 | 0 | } |
722 | 0 | return false; |
723 | 0 | } |
724 | | |
725 | 0 | size_t HashLinkListRep::ApproximateMemoryUsage() { |
726 | | // Memory is always allocated from the allocator. |
727 | 0 | return 0; |
728 | 0 | } |
729 | | |
730 | | void HashLinkListRep::Get(const LookupKey& k, void* callback_args, |
731 | 0 | bool (*callback_func)(void* arg, const char* entry)) { |
732 | 0 | auto transformed = transform_->Transform(k.user_key()); |
733 | 0 | Pointer& bucket = GetBucket(transformed); |
734 | |
735 | 0 | if (IsEmptyBucket(bucket)) { |
736 | 0 | return; |
737 | 0 | } |
738 | | |
739 | 0 | auto* link_list_head = GetLinkListFirstNode(bucket); |
740 | 0 | if (link_list_head != nullptr) { |
741 | 0 | LinkListIterator iter(this, link_list_head); |
742 | 0 | for (iter.Seek(k.internal_key(), nullptr); |
743 | 0 | iter.Valid() && callback_func(callback_args, iter.key()); |
744 | 0 | iter.Next()) { |
745 | 0 | } |
746 | 0 | } else { |
747 | 0 | auto* skip_list_header = GetSkipListBucketHeader(bucket); |
748 | 0 | if (skip_list_header != nullptr) { |
749 | | // Is a skip list |
750 | 0 | MemtableSkipList::Iterator iter(&skip_list_header->skip_list); |
751 | 0 | for (iter.Seek(k.memtable_key().data()); |
752 | 0 | iter.Valid() && callback_func(callback_args, iter.key()); |
753 | 0 | iter.Next()) { |
754 | 0 | } |
755 | 0 | } |
756 | 0 | } |
757 | 0 | } |
758 | | |
759 | 0 | MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { |
760 | | // allocate a new arena of similar size to the one currently in use |
761 | 0 | Arena* new_arena = new Arena(allocator_->BlockSize()); |
762 | 0 | auto list = new MemtableSkipList(compare_, new_arena); |
763 | 0 | HistogramImpl keys_per_bucket_hist; |
764 | |
765 | 0 | for (size_t i = 0; i < bucket_size_; ++i) { |
766 | 0 | int count = 0; |
767 | 0 | Pointer& bucket = GetBucket(i); |
768 | 0 | if (!IsEmptyBucket(bucket)) { |
769 | 0 | auto* link_list_head = GetLinkListFirstNode(bucket); |
770 | 0 | if (link_list_head != nullptr) { |
771 | 0 | LinkListIterator itr(this, link_list_head); |
772 | 0 | for (itr.SeekToHead(); itr.Valid(); itr.Next()) { |
773 | 0 | list->Insert(itr.key()); |
774 | 0 | count++; |
775 | 0 | } |
776 | 0 | } else { |
777 | 0 | auto* skip_list_header = GetSkipListBucketHeader(bucket); |
778 | 0 | assert(skip_list_header != nullptr); |
779 | | // Is a skip list |
780 | 0 | MemtableSkipList::Iterator itr(&skip_list_header->skip_list); |
781 | 0 | for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { |
782 | 0 | list->Insert(itr.key()); |
783 | 0 | count++; |
784 | 0 | } |
785 | 0 | } |
786 | 0 | } |
787 | 0 | if (if_log_bucket_dist_when_flash_) { |
788 | 0 | keys_per_bucket_hist.Add(count); |
789 | 0 | } |
790 | 0 | } |
791 | 0 | if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) { |
792 | 0 | Info(logger_, "hashLinkedList Entry distribution among buckets: %s", |
793 | 0 | keys_per_bucket_hist.ToString().c_str()); |
794 | 0 | } |
795 | |
796 | 0 | if (alloc_arena == nullptr) { |
797 | 0 | return new FullListIterator(list, new_arena); |
798 | 0 | } else { |
799 | 0 | auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator)); |
800 | 0 | return new (mem) FullListIterator(list, new_arena); |
801 | 0 | } |
802 | 0 | } |
803 | | |
804 | | MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator( |
805 | 0 | Arena* alloc_arena) { |
806 | 0 | if (alloc_arena == nullptr) { |
807 | 0 | return new DynamicIterator(*this); |
808 | 0 | } else { |
809 | 0 | auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator)); |
810 | 0 | return new (mem) DynamicIterator(*this); |
811 | 0 | } |
812 | 0 | } |
813 | | |
814 | | bool HashLinkListRep::LinkListContains(Node* head, |
815 | 0 | const Slice& user_key) const { |
816 | 0 | Node* x = FindGreaterOrEqualInBucket(head, user_key); |
817 | 0 | return (x != nullptr && Equal(user_key, x->key)); |
818 | 0 | } |
819 | | |
820 | | Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, |
821 | 0 | const Slice& key) const { |
822 | 0 | Node* x = head; |
823 | 0 | while (true) { |
824 | 0 | if (x == nullptr) { |
825 | 0 | return x; |
826 | 0 | } |
827 | 0 | Node* next = x->Next(); |
828 | | // Make sure the list is sorted.
829 | | // If x is the head or next is nullptr, this is trivially satisfied.
830 | 0 | assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); |
831 | 0 | if (KeyIsAfterNode(key, x)) { |
832 | | // Keep searching in this list |
833 | 0 | x = next; |
834 | 0 | } else { |
835 | 0 | break; |
836 | 0 | } |
837 | 0 | } |
838 | 0 | return x; |
839 | 0 | } |
840 | | |
841 | | struct HashLinkListRepOptions { |
842 | 0 | static const char* kName() { return "HashLinkListRepFactoryOptions"; } |
843 | | size_t bucket_count; |
844 | | uint32_t threshold_use_skiplist; |
845 | | size_t huge_page_tlb_size; |
846 | | int bucket_entries_logging_threshold; |
847 | | bool if_log_bucket_dist_when_flash; |
848 | | }; |
849 | | |
850 | | static std::unordered_map<std::string, OptionTypeInfo> hash_linklist_info = { |
851 | | {"bucket_count", |
852 | | {offsetof(struct HashLinkListRepOptions, bucket_count), OptionType::kSizeT, |
853 | | OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, |
854 | | {"threshold", |
855 | | {offsetof(struct HashLinkListRepOptions, threshold_use_skiplist), |
856 | | OptionType::kUInt32T, OptionVerificationType::kNormal, |
857 | | OptionTypeFlags::kNone}}, |
858 | | {"huge_page_size", |
859 | | {offsetof(struct HashLinkListRepOptions, huge_page_tlb_size), |
860 | | OptionType::kSizeT, OptionVerificationType::kNormal, |
861 | | OptionTypeFlags::kNone}}, |
862 | | {"logging_threshold", |
863 | | {offsetof(struct HashLinkListRepOptions, bucket_entries_logging_threshold), |
864 | | OptionType::kInt, OptionVerificationType::kNormal, |
865 | | OptionTypeFlags::kNone}}, |
866 | | {"log_when_flash", |
867 | | {offsetof(struct HashLinkListRepOptions, if_log_bucket_dist_when_flash), |
868 | | OptionType::kBoolean, OptionVerificationType::kNormal, |
869 | | OptionTypeFlags::kNone}}, |
870 | | }; |
871 | | |
872 | | class HashLinkListRepFactory : public MemTableRepFactory { |
873 | | public: |
874 | | explicit HashLinkListRepFactory(size_t bucket_count, |
875 | | uint32_t threshold_use_skiplist, |
876 | | size_t huge_page_tlb_size, |
877 | | int bucket_entries_logging_threshold, |
878 | 0 | bool if_log_bucket_dist_when_flash) { |
879 | 0 | options_.bucket_count = bucket_count; |
880 | 0 | options_.threshold_use_skiplist = threshold_use_skiplist; |
881 | 0 | options_.huge_page_tlb_size = huge_page_tlb_size; |
882 | 0 | options_.bucket_entries_logging_threshold = |
883 | 0 | bucket_entries_logging_threshold; |
884 | 0 | options_.if_log_bucket_dist_when_flash = if_log_bucket_dist_when_flash; |
885 | 0 | RegisterOptions(&options_, &hash_linklist_info); |
886 | 0 | } |
887 | | |
888 | | using MemTableRepFactory::CreateMemTableRep; |
889 | | MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& compare, |
890 | | Allocator* allocator, |
891 | | const SliceTransform* transform, |
892 | | Logger* logger) override; |
893 | | |
894 | 0 | static const char* kClassName() { return "HashLinkListRepFactory"; } |
895 | 0 | static const char* kNickName() { return "hash_linkedlist"; } |
896 | 0 | const char* Name() const override { return kClassName(); } |
897 | 0 | const char* NickName() const override { return kNickName(); } |
898 | | |
899 | | private: |
900 | | HashLinkListRepOptions options_; |
901 | | }; |
902 | | |
903 | | } // namespace |
904 | | |
905 | | MemTableRep* HashLinkListRepFactory::CreateMemTableRep( |
906 | | const MemTableRep::KeyComparator& compare, Allocator* allocator, |
907 | 0 | const SliceTransform* transform, Logger* logger) { |
908 | 0 | return new HashLinkListRep( |
909 | 0 | compare, allocator, transform, options_.bucket_count, |
910 | 0 | options_.threshold_use_skiplist, options_.huge_page_tlb_size, logger, |
911 | 0 | options_.bucket_entries_logging_threshold, |
912 | 0 | options_.if_log_bucket_dist_when_flash); |
913 | 0 | } |
914 | | |
915 | | MemTableRepFactory* NewHashLinkListRepFactory( |
916 | | size_t bucket_count, size_t huge_page_tlb_size, |
917 | | int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash, |
918 | 0 | uint32_t threshold_use_skiplist) { |
919 | 0 | return new HashLinkListRepFactory( |
920 | 0 | bucket_count, threshold_use_skiplist, huge_page_tlb_size, |
921 | 0 | bucket_entries_logging_threshold, if_log_bucket_dist_when_flash); |
922 | 0 | } |
923 | | |
924 | | } // namespace ROCKSDB_NAMESPACE |
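
For context, a hedged usage sketch of the factory exposed above. The hash-linklist memtable buckets keys by their transformed prefix, so a prefix extractor must be configured alongside it; the bucket count below is illustrative only, and the remaining factory arguments keep the defaults declared in rocksdb/memtablerep.h.

#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

ROCKSDB_NAMESPACE::Options MakeHashLinkListOptions() {
  ROCKSDB_NAMESPACE::Options options;
  // Buckets are keyed by the transformed user key, so a prefix extractor is
  // required for this memtable representation to be useful.
  options.prefix_extractor.reset(
      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(8));
  // Illustrative bucket count; other parameters use the header defaults.
  options.memtable_factory.reset(
      ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(50000));
  // Hash-based memtables do not support concurrent memtable writes.
  options.allow_concurrent_memtable_write = false;
  return options;
}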