/src/rocksdb/table/block_based/block.cc
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | // |
10 | | // Decodes the blocks generated by block_builder.cc. |
11 | | |
12 | | #include "table/block_based/block.h" |
13 | | |
14 | | #include <algorithm> |
15 | | #include <string> |
16 | | #include <unordered_map> |
17 | | #include <vector> |
18 | | |
19 | | #include "monitoring/perf_context_imp.h" |
20 | | #include "port/port.h" |
21 | | #include "port/stack_trace.h" |
22 | | #include "rocksdb/comparator.h" |
23 | | #include "table/block_based/block_prefix_index.h" |
24 | | #include "table/block_based/data_block_footer.h" |
25 | | #include "table/format.h" |
26 | | #include "util/coding.h" |
27 | | |
28 | | namespace ROCKSDB_NAMESPACE { |
29 | | |
30 | | // Helper routine: decode the next block entry starting at "p", |
31 | | // storing the number of shared key bytes, non_shared key bytes, |
32 | | // and the length of the value in "*shared", "*non_shared", and |
33 | | // "*value_length", respectively. Will not dereference past "limit". |
34 | | // |
35 | | // If any errors are detected, returns nullptr. Otherwise, returns a |
36 | | // pointer to the key delta (just past the three decoded values). |
37 | | struct DecodeEntry { |
38 | | inline const char* operator()(const char* p, const char* limit, |
39 | | uint32_t* shared, uint32_t* non_shared, |
40 | 284k | uint32_t* value_length) { |
41 | | // We need 2 bytes for shared and non_shared size. We also need one more |
42 | | // byte either for value size or the actual value in case of value delta |
43 | | // encoding. |
44 | 284k | assert(limit - p >= 3); |
45 | 284k | *shared = reinterpret_cast<const unsigned char*>(p)[0]; |
46 | 284k | *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; |
47 | 284k | *value_length = reinterpret_cast<const unsigned char*>(p)[2]; |
48 | 284k | if ((*shared | *non_shared | *value_length) < 128) { |
49 | | // Fast path: all three values are encoded in one byte each |
50 | 205k | p += 3; |
51 | 205k | } else { |
52 | 78.8k | if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) { |
53 | 0 | return nullptr; |
54 | 0 | } |
55 | 78.8k | if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) { |
56 | 0 | return nullptr; |
57 | 0 | } |
58 | 78.8k | if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { |
59 | 0 | return nullptr; |
60 | 0 | } |
61 | 78.8k | } |
62 | | |
63 | | // Using an assert in place of "return null" since we should not pay the |
64 | | // cost of checking for corruption on every single key decoding |
65 | 284k | assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length))); |
66 | 284k | return p; |
67 | 284k | } |
68 | | }; |
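The entry layout that DecodeEntry parses can be illustrated with a small standalone sketch. The EncodeVarint32/DecodeVarint32 helpers below are hypothetical stand-ins (not util/coding.h's PutVarint32/GetVarint32Ptr), so this is a minimal illustration of the layout only: three varint32 headers (shared, non_shared, value_length), then the non-shared key bytes and the value. When all three headers are below 128 they occupy one byte each, which is the fast path above.

// Minimal, self-contained sketch of the prefix-compressed entry layout.
#include <cassert>
#include <cstdint>
#include <string>

static void EncodeVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

static const char* DecodeVarint32(const char* p, const char* limit,
                                  uint32_t* v) {
  uint32_t result = 0;
  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*p++);
    result |= (byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *v = result;
      return p;
    }
  }
  return nullptr;  // truncated or malformed varint
}

int main() {
  // Previous key "apple1", current key "applepie", value "v".
  const std::string prev = "apple1", cur = "applepie", value = "v";
  uint32_t shared = 5;  // "apple" is shared with the previous key
  std::string entry;
  EncodeVarint32(&entry, shared);
  EncodeVarint32(&entry, static_cast<uint32_t>(cur.size()) - shared);
  EncodeVarint32(&entry, static_cast<uint32_t>(value.size()));
  entry.append(cur.data() + shared, cur.size() - shared);  // key delta
  entry.append(value);

  // Decode it back, mirroring DecodeEntry's general (varint) path.
  const char* p = entry.data();
  const char* limit = entry.data() + entry.size();
  uint32_t s, ns, vl;
  p = DecodeVarint32(p, limit, &s);
  assert(p != nullptr);
  p = DecodeVarint32(p, limit, &ns);
  assert(p != nullptr);
  p = DecodeVarint32(p, limit, &vl);
  assert(p != nullptr);
  assert(s == 5 && ns == 3 && vl == 1);
  std::string rebuilt = prev.substr(0, s) + std::string(p, ns);
  assert(rebuilt == "applepie");
  return 0;
}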
69 | | |
70 | | // Helper routine: similar to DecodeEntry but does not have assertions. |
71 | | // Instead, returns nullptr so that caller can detect and report failure. |
72 | | struct CheckAndDecodeEntry { |
73 | | inline const char* operator()(const char* p, const char* limit, |
74 | | uint32_t* shared, uint32_t* non_shared, |
75 | 278k | uint32_t* value_length) { |
76 | | // We need 2 bytes for shared and non_shared size. We also need one more |
77 | | // byte either for value size or the actual value in case of value delta |
78 | | // encoding. |
79 | 278k | if (limit - p < 3) { |
80 | 0 | return nullptr; |
81 | 0 | } |
82 | 278k | *shared = reinterpret_cast<const unsigned char*>(p)[0]; |
83 | 278k | *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; |
84 | 278k | *value_length = reinterpret_cast<const unsigned char*>(p)[2]; |
85 | 278k | if ((*shared | *non_shared | *value_length) < 128) { |
86 | | // Fast path: all three values are encoded in one byte each |
87 | 271k | p += 3; |
88 | 271k | } else { |
89 | 7.06k | if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) { |
90 | 0 | return nullptr; |
91 | 0 | } |
92 | 7.06k | if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) { |
93 | 0 | return nullptr; |
94 | 0 | } |
95 | 7.06k | if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { |
96 | 0 | return nullptr; |
97 | 0 | } |
98 | 7.06k | } |
99 | | |
100 | 278k | if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { |
101 | 0 | return nullptr; |
102 | 0 | } |
103 | 278k | return p; |
104 | 278k | } |
105 | | }; |
106 | | |
107 | | struct DecodeKey { |
108 | | inline const char* operator()(const char* p, const char* limit, |
109 | 49.7k | uint32_t* shared, uint32_t* non_shared) { |
110 | 49.7k | uint32_t value_length; |
111 | 49.7k | return DecodeEntry()(p, limit, shared, non_shared, &value_length); |
112 | 49.7k | } |
113 | | }; |
114 | | |
115 | | // In format_version 4, which is used by index blocks, the value size is not |
116 | | // encoded before the entry, as the value is known to be the handle with the |
117 | | // known size. |
118 | | struct DecodeKeyV4 { |
119 | | inline const char* operator()(const char* p, const char* limit, |
120 | 12.8k | uint32_t* shared, uint32_t* non_shared) { |
121 | | // We need 2 bytes for shared and non_shared size. We also need one more |
122 | | // byte either for value size or the actual value in case of value delta |
123 | | // encoding. |
124 | 12.8k | if (limit - p < 3) { |
125 | 0 | return nullptr; |
126 | 0 | } |
127 | 12.8k | *shared = reinterpret_cast<const unsigned char*>(p)[0]; |
128 | 12.8k | *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; |
129 | 12.8k | if ((*shared | *non_shared) < 128) { |
130 | | // Fast path: both values are encoded in one byte each |
131 | 9.64k | p += 2; |
132 | 9.64k | } else { |
133 | 3.16k | if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) { |
134 | 0 | return nullptr; |
135 | 0 | } |
136 | 3.16k | if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) { |
137 | 0 | return nullptr; |
138 | 0 | } |
139 | 3.16k | } |
140 | 12.8k | return p; |
141 | 12.8k | } |
142 | | }; |
143 | | |
144 | | struct DecodeEntryV4 { |
145 | | inline const char* operator()(const char* p, const char* limit, |
146 | | uint32_t* shared, uint32_t* non_shared, |
147 | 10.9k | uint32_t* value_length) { |
148 | 10.9k | assert(value_length); |
149 | | |
150 | 10.9k | *value_length = 0; |
151 | 10.9k | return DecodeKeyV4()(p, limit, shared, non_shared); |
152 | 10.9k | } |
153 | | }; |
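For comparison with DecodeEntry, the following minimal sketch shows the format_version 4 header handled by DecodeKeyV4/DecodeEntryV4: only (shared, non_shared) are stored, and the delta-encoded block handle follows immediately, which is why DecodeEntryV4 reports value_length as 0 and the value extent is recovered later by DecodeCurrentValue. The byte values are illustrative assumptions.

// Sketch of a format_version 4 index entry header (illustrative bytes).
#include <cassert>
#include <cstdint>

int main() {
  // Restart entry: shared = 0, non_shared = 12, then 12 key bytes and the
  // delta-encoded handle right after (its length is not stored in the header).
  const unsigned char entry[] = {0x00, 0x0c /*, 12 key bytes, handle... */};
  uint32_t shared = entry[0];
  uint32_t non_shared = entry[1];
  assert((shared | non_shared) < 128);  // fast path: advance by 2 bytes
  assert(shared == 0 && non_shared == 12);
  return 0;
}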
154 | | |
155 | 233k | void DataBlockIter::NextImpl() { |
156 | | #ifndef NDEBUG |
157 | | if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) { |
158 | | return; |
159 | | } |
160 | | #endif |
161 | 233k | bool is_shared = false; |
162 | 233k | ParseNextDataKey(&is_shared); |
163 | 233k | ++cur_entry_idx_; |
164 | 233k | } |
165 | | |
166 | 281k | void MetaBlockIter::NextImpl() { |
167 | 281k | bool is_shared = false; |
168 | 281k | ParseNextKey<CheckAndDecodeEntry>(&is_shared); |
169 | 281k | ++cur_entry_idx_; |
170 | 281k | } |
171 | | |
172 | 10.4k | void IndexBlockIter::NextImpl() { |
173 | 10.4k | ParseNextIndexKey(); |
174 | 10.4k | ++cur_entry_idx_; |
175 | 10.4k | } |
176 | | |
177 | 1.53k | void IndexBlockIter::PrevImpl() { |
178 | 1.53k | assert(Valid()); |
179 | | // Scan backwards to a restart point before current_ |
180 | 1.53k | const uint32_t original = current_; |
181 | 1.53k | while (GetRestartPoint(restart_index_) >= original) { |
182 | 1.53k | if (restart_index_ == 0) { |
183 | | // No more entries |
184 | 1.53k | current_ = restarts_; |
185 | 1.53k | restart_index_ = num_restarts_; |
186 | 1.53k | return; |
187 | 1.53k | } |
188 | 0 | restart_index_--; |
189 | 0 | } |
190 | 0 | SeekToRestartPoint(restart_index_); |
191 | | // Loop until end of current entry hits the start of original entry |
192 | 0 | while (ParseNextIndexKey() && NextEntryOffset() < original) { |
193 | 0 | } |
194 | 0 | --cur_entry_idx_; |
195 | 0 | } |
196 | | |
197 | 0 | void MetaBlockIter::PrevImpl() { |
198 | 0 | assert(Valid()); |
199 | | // Scan backwards to a restart point before current_ |
200 | 0 | const uint32_t original = current_; |
201 | 0 | while (GetRestartPoint(restart_index_) >= original) { |
202 | 0 | if (restart_index_ == 0) { |
203 | | // No more entries |
204 | 0 | current_ = restarts_; |
205 | 0 | restart_index_ = num_restarts_; |
206 | 0 | return; |
207 | 0 | } |
208 | 0 | restart_index_--; |
209 | 0 | } |
210 | 0 | SeekToRestartPoint(restart_index_); |
211 | 0 | bool is_shared = false; |
212 | | // Loop until end of current entry hits the start of original entry |
213 | 0 | while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) && |
214 | 0 | NextEntryOffset() < original) { |
215 | 0 | } |
216 | 0 | --cur_entry_idx_; |
217 | 0 | } |
218 | | |
219 | | // Similar to IndexBlockIter::PrevImpl but also caches the prev entries |
220 | 1.71k | void DataBlockIter::PrevImpl() { |
221 | 1.71k | assert(Valid()); |
222 | | |
223 | 1.71k | assert(prev_entries_idx_ == -1 || |
224 | 1.71k | static_cast<size_t>(prev_entries_idx_) < prev_entries_.size()); |
225 | 1.71k | --cur_entry_idx_; |
226 | | // Check if we can use cached prev_entries_ |
227 | 1.71k | if (prev_entries_idx_ > 0 && |
228 | 1.71k | prev_entries_[prev_entries_idx_].offset == current_) { |
229 | | // Read cached CachedPrevEntry |
230 | 0 | prev_entries_idx_--; |
231 | 0 | const CachedPrevEntry& current_prev_entry = |
232 | 0 | prev_entries_[prev_entries_idx_]; |
233 | |
234 | 0 | const char* key_ptr = nullptr; |
235 | 0 | bool raw_key_cached; |
236 | 0 | if (current_prev_entry.key_ptr != nullptr) { |
237 | | // The key is not delta encoded and stored in the data block |
238 | 0 | key_ptr = current_prev_entry.key_ptr; |
239 | 0 | raw_key_cached = false; |
240 | 0 | } else { |
241 | | // The key is delta encoded and stored in prev_entries_keys_buff_ |
242 | 0 | key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; |
243 | 0 | raw_key_cached = true; |
244 | 0 | } |
245 | 0 | const Slice current_key(key_ptr, current_prev_entry.key_size); |
246 | |
247 | 0 | current_ = current_prev_entry.offset; |
248 | | // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience, |
249 | | // not necessity. It is convenient since this class treats keys as pinned |
250 | | // when `raw_key_` points to an outside buffer. So we cannot allow |
251 | | // `raw_key_` point into Prev cache as it is a transient outside buffer |
252 | | // (i.e., keys in it are not actually pinned). |
253 | 0 | raw_key_.SetKey(current_key, raw_key_cached /* copy */); |
254 | 0 | value_ = current_prev_entry.value; |
255 | |
256 | 0 | return; |
257 | 0 | } |
258 | | |
259 | | // Clear prev entries cache |
260 | 1.71k | prev_entries_idx_ = -1; |
261 | 1.71k | prev_entries_.clear(); |
262 | 1.71k | prev_entries_keys_buff_.clear(); |
263 | | |
264 | | // Scan backwards to a restart point before current_ |
265 | 1.71k | const uint32_t original = current_; |
266 | 1.71k | while (GetRestartPoint(restart_index_) >= original) { |
267 | 1.53k | if (restart_index_ == 0) { |
268 | | // No more entries |
269 | 1.53k | current_ = restarts_; |
270 | 1.53k | restart_index_ = num_restarts_; |
271 | 1.53k | return; |
272 | 1.53k | } |
273 | 0 | restart_index_--; |
274 | 0 | } |
275 | | |
276 | 174 | SeekToRestartPoint(restart_index_); |
277 | | |
278 | 174 | do { |
279 | 174 | bool is_shared = false; |
280 | 174 | if (!ParseNextDataKey(&is_shared)) { |
281 | 0 | break; |
282 | 0 | } |
283 | 174 | Slice current_key = raw_key_.GetKey(); |
284 | | |
285 | 174 | if (raw_key_.IsKeyPinned()) { |
286 | | // The key is not delta encoded |
287 | 174 | prev_entries_.emplace_back(current_, current_key.data(), 0, |
288 | 174 | current_key.size(), value()); |
289 | 174 | } else { |
290 | | // The key is delta encoded, cache decoded key in buffer |
291 | 0 | size_t new_key_offset = prev_entries_keys_buff_.size(); |
292 | 0 | prev_entries_keys_buff_.append(current_key.data(), current_key.size()); |
293 | |
294 | 0 | prev_entries_.emplace_back(current_, nullptr, new_key_offset, |
295 | 0 | current_key.size(), value()); |
296 | 0 | } |
297 | | // Loop until end of current entry hits the start of original entry |
298 | 174 | } while (NextEntryOffset() < original); |
299 | 0 | prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1; |
300 | 174 | } |
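A minimal sketch of the Prev-entry caching mentioned in the comment above DataBlockIter::PrevImpl: while walking forward from the restart point, each parsed entry is remembered so that consecutive Prev calls can be replayed from the cache instead of re-parsing the block. The CachedEntry struct below is an illustrative stand-in, not the real CachedPrevEntry.

// Illustrative Prev cache: entries gathered while scanning forward.
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct CachedEntry {
  uint32_t offset;   // where the entry starts in the block
  std::string key;   // decoded key (copied here for simplicity)
  std::string value;
};

int main() {
  // Entries parsed while walking forward from the restart point.
  std::vector<CachedEntry> cache = {
      {0, "a", "v1"}, {12, "b", "v2"}, {24, "c", "v3"}};
  // The first Prev lands on the last cached entry...
  size_t idx = cache.size() - 1;
  assert(cache[idx].key == "c");
  // ...and the next Prev is answered from the cache, with no re-parsing.
  --idx;
  assert(cache[idx].key == "b" && cache[idx].offset == 12);
  return 0;
}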
301 | | |
302 | 678 | void DataBlockIter::SeekImpl(const Slice& target) { |
303 | 678 | Slice seek_key = target; |
304 | 678 | PERF_TIMER_GUARD(block_seek_nanos); |
305 | 678 | if (data_ == nullptr) { // Not init yet |
306 | 0 | return; |
307 | 0 | } |
308 | 678 | uint32_t index = 0; |
309 | 678 | bool skip_linear_scan = false; |
310 | 678 | bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); |
311 | | |
312 | 678 | if (!ok) { |
313 | 0 | return; |
314 | 0 | } |
315 | 678 | FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); |
316 | 678 | } |
317 | | |
318 | 29.8k | void MetaBlockIter::SeekImpl(const Slice& target) { |
319 | 29.8k | Slice seek_key = target; |
320 | 29.8k | PERF_TIMER_GUARD(block_seek_nanos); |
321 | 29.8k | if (data_ == nullptr) { // Not init yet |
322 | 0 | return; |
323 | 0 | } |
324 | 29.8k | uint32_t index = 0; |
325 | 29.8k | bool skip_linear_scan = false; |
326 | 29.8k | bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); |
327 | | |
328 | 29.8k | if (!ok) { |
329 | 0 | return; |
330 | 0 | } |
331 | 29.8k | FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); |
332 | 29.8k | } |
333 | | |
334 | | // Optimized Seek for point lookup for an internal key `target` |
335 | | // target = "seek_user_key @ type | seqno". |
336 | | // |
337 | | // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, |
338 | | // kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or |
339 | | // kTypeMerge, this function behaves identically to Seek(). |
340 | | // |
341 | | // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, |
342 | | // kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or |
343 | | // kTypeMerge: |
344 | | // |
345 | | // If the return value is FALSE, iter location is undefined, and it means: |
346 | | // 1) there is no key in this block falling into the range: |
347 | | // ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], |
348 | | // inclusive; AND |
349 | | // 2) the last key of this block has a greater user_key than seek_user_key |
350 | | // |
351 | | // If the return value is TRUE, iter location has two possibilities: |
352 | | // 1) If iter is valid, it is set to a location as if set by SeekImpl(target). |
353 | | // In this case, it points to the first key with a larger user_key or a |
354 | | // matching user_key with a seqno no greater than the seeking seqno. |
355 | | // 2) If the iter is invalid, it means that either all the user_keys are less |
356 | | // than the seek_user_key, or the block ends with a matching user_key but |
357 | | // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno |
358 | | // but larger type). |
359 | 0 | bool DataBlockIter::SeekForGetImpl(const Slice& target) { |
360 | 0 | Slice target_user_key = ExtractUserKey(target); |
361 | 0 | uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); |
362 | 0 | uint8_t entry = |
363 | 0 | data_block_hash_index_->Lookup(data_, map_offset, target_user_key); |
364 | |
365 | 0 | if (entry == kCollision) { |
366 | | // HashSeek not effective, falling back |
367 | 0 | SeekImpl(target); |
368 | 0 | return true; |
369 | 0 | } |
370 | | |
371 | 0 | if (entry == kNoEntry) { |
372 | | // Even if we cannot find the user_key in this block, the result may |
373 | | // exist in the next block. Consider this example: |
374 | | // |
375 | | // Block N: [aab@100, ... , app@120] |
376 | | // boundary key: axy@50 (we make minimal assumption about a boundary key) |
377 | | // Block N+1: [axy@10, ... ] |
378 | | // |
379 | | // If seek_key = axy@60, the search will start from Block N. |
380 | | // Even if the user_key is not found in the hash map, the caller still |
381 | | // has to continue searching the next block. |
382 | | // |
383 | | // In this case, we pretend the key is in the last restart interval. |
384 | | // The while-loop below will search the last restart interval for the |
385 | | // key. It will stop at the first key that is larger than the seek_key, |
386 | | // or to the end of the block if no one is larger. |
387 | 0 | entry = static_cast<uint8_t>(num_restarts_ - 1); |
388 | 0 | } |
389 | |
390 | 0 | uint32_t restart_index = entry; |
391 | | |
392 | | // check if the key is in the restart_interval |
393 | 0 | assert(restart_index < num_restarts_); |
394 | 0 | SeekToRestartPoint(restart_index); |
395 | 0 | current_ = GetRestartPoint(restart_index); |
396 | 0 | cur_entry_idx_ = |
397 | 0 | static_cast<int32_t>(restart_index * block_restart_interval_) - 1; |
398 | |
399 | 0 | uint32_t limit = restarts_; |
400 | 0 | if (restart_index + 1 < num_restarts_) { |
401 | 0 | limit = GetRestartPoint(restart_index + 1); |
402 | 0 | } |
403 | 0 | while (current_ < limit) { |
404 | 0 | ++cur_entry_idx_; |
405 | 0 | bool shared; |
406 | | // Here we only linear seek the target key inside the restart interval. |
407 | | // If a key does not exist inside a restart interval, we avoid |
408 | | // further searching the block content across restart interval boundary. |
409 | | // |
410 | | // TODO(fwu): check the left and right boundary of the restart interval |
411 | | // to avoid linear seek a target key that is out of range. |
412 | 0 | if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) { |
413 | | // we stop at the first potential matching user key. |
414 | 0 | break; |
415 | 0 | } |
416 | | // If the loop exits due to CompareCurrentKey(target) >= 0, then current key |
417 | | // exists, and its checksum verification will be done in UpdateKey() called |
418 | | // in SeekForGet(). |
419 | | // TODO(cbi): If this loop exits with current_ == restarts_, per key-value |
420 | | // checksum will not be verified in UpdateKey() since Valid() |
421 | | // will return false. |
422 | 0 | } |
423 | |
424 | 0 | if (current_ == restarts_) { |
425 | | // Search reaches to the end of the block. There are three possibilities: |
426 | | // 1) there is only one user_key match in the block (otherwise collision). |
427 | | // the matching user_key resides in the last restart interval, and it |
428 | | // is the last key of the restart interval and of the block as well. |
429 | | // ParseNextKey() skipped it as its [ type | seqno ] is smaller. |
430 | | // |
431 | | // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, |
432 | | // AND all existing user_keys in the restart interval are smaller than |
433 | | // seek_user_key. |
434 | | // |
435 | | // 3) The seek_key is a false positive and happens to be hashed to the |
436 | | // last restart interval, AND all existing user_keys in the restart |
437 | | // interval are smaller than seek_user_key. |
438 | | // |
439 | | // The result may exist in the next block in each case, so we return true. |
440 | 0 | return true; |
441 | 0 | } |
442 | | |
443 | 0 | if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), |
444 | 0 | target_user_key) != 0) { |
445 | | // the key is not in this block and cannot be at the next block either. |
446 | 0 | return false; |
447 | 0 | } |
448 | | |
449 | | // Here we are conservative and only support a limited set of cases |
450 | 0 | ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); |
451 | 0 | if (value_type != ValueType::kTypeValue && |
452 | 0 | value_type != ValueType::kTypeDeletion && |
453 | 0 | value_type != ValueType::kTypeMerge && |
454 | 0 | value_type != ValueType::kTypeSingleDeletion && |
455 | 0 | value_type != ValueType::kTypeBlobIndex && |
456 | 0 | value_type != ValueType::kTypeWideColumnEntity && |
457 | 0 | value_type != ValueType::kTypeValuePreferredSeqno) { |
458 | 0 | SeekImpl(target); |
459 | 0 | } |
460 | | |
461 | | // Result found, and the iter is correctly set. |
462 | 0 | return true; |
463 | 0 | } |
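The contract documented above SeekForGetImpl can be summarized with a small self-contained model. The SeekOutcome enum and HandleLookup function below are hypothetical; they only illustrate how a point-lookup caller is expected to react to each of the three documented outcomes, not how BlockBasedTable actually wires this up.

// Model of the SeekForGet contract (hypothetical names).
#include <cstdio>

enum class SeekOutcome {
  kNotHereNorLater,  // returned false: no candidate in this block, and the
                     // block's last user_key is already greater, so later
                     // blocks need not be searched for this user_key.
  kPositioned,       // returned true and the iterator is Valid(): positioned
                     // as if by Seek(target); inspect the current key/value.
  kTryNextBlock      // returned true but the iterator is invalid: every
                     // candidate in this block was too small, so continue
                     // with the next data block.
};

void HandleLookup(SeekOutcome outcome) {
  switch (outcome) {
    case SeekOutcome::kNotHereNorLater:
      std::printf("stop searching further blocks of this file\n");
      break;
    case SeekOutcome::kPositioned:
      std::printf("check whether the current user_key matches the target\n");
      break;
    case SeekOutcome::kTryNextBlock:
      std::printf("advance to the next data block and seek again\n");
      break;
  }
}

int main() {
  HandleLookup(SeekOutcome::kTryNextBlock);
  return 0;
}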
464 | | |
465 | 1.82k | void IndexBlockIter::SeekImpl(const Slice& target) { |
466 | | #ifndef NDEBUG |
467 | | if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) { |
468 | | return; |
469 | | } |
470 | | #endif |
471 | 1.82k | TEST_SYNC_POINT("IndexBlockIter::Seek:0"); |
472 | 1.82k | PERF_TIMER_GUARD(block_seek_nanos); |
473 | 1.82k | if (data_ == nullptr) { // Not init yet |
474 | 0 | return; |
475 | 0 | } |
476 | 1.82k | Slice seek_key = target; |
477 | 1.82k | if (raw_key_.IsUserKey()) { |
478 | 1.82k | seek_key = ExtractUserKey(target); |
479 | 1.82k | } |
480 | 1.82k | status_ = Status::OK(); |
481 | 1.82k | uint32_t index = 0; |
482 | 1.82k | bool skip_linear_scan = false; |
483 | 1.82k | bool ok = false; |
484 | 1.82k | if (prefix_index_) { |
485 | 0 | bool prefix_may_exist = true; |
486 | 0 | ok = PrefixSeek(target, &index, &prefix_may_exist); |
487 | 0 | if (!prefix_may_exist) { |
488 | | // This is to let the caller distinguish between a non-existing prefix |
489 | | // and a key that is larger than the last key, both of which set Valid() to |
490 | | // false. |
491 | 0 | current_ = restarts_; |
492 | 0 | status_ = Status::NotFound(); |
493 | 0 | } |
494 | | // restart interval must be one when hash search is enabled so the binary |
495 | | // search simply lands at the right place. |
496 | 0 | skip_linear_scan = true; |
497 | 1.82k | } else if (value_delta_encoded_) { |
498 | 1.82k | ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan); |
499 | 1.82k | } else { |
500 | 0 | ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); |
501 | 0 | } |
502 | | |
503 | 1.82k | if (!ok) { |
504 | 0 | return; |
505 | 0 | } |
506 | 1.82k | FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); |
507 | 1.82k | } |
508 | | |
509 | 1.14k | void DataBlockIter::SeekForPrevImpl(const Slice& target) { |
510 | 1.14k | PERF_TIMER_GUARD(block_seek_nanos); |
511 | 1.14k | Slice seek_key = target; |
512 | 1.14k | if (data_ == nullptr) { // Not init yet |
513 | 0 | return; |
514 | 0 | } |
515 | 1.14k | uint32_t index = 0; |
516 | 1.14k | bool skip_linear_scan = false; |
517 | 1.14k | bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); |
518 | | |
519 | 1.14k | if (!ok) { |
520 | 0 | return; |
521 | 0 | } |
522 | 1.14k | FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); |
523 | | |
524 | 1.14k | if (!Valid()) { |
525 | 581 | if (status_.ok()) { |
526 | 581 | SeekToLastImpl(); |
527 | 581 | } |
528 | 581 | } else { |
529 | 1.13k | while (Valid() && CompareCurrentKey(seek_key) > 0) { |
530 | 566 | PrevImpl(); |
531 | 566 | } |
532 | 566 | } |
533 | 1.14k | } |
534 | | |
535 | 0 | void MetaBlockIter::SeekForPrevImpl(const Slice& target) { |
536 | 0 | PERF_TIMER_GUARD(block_seek_nanos); |
537 | 0 | Slice seek_key = target; |
538 | 0 | if (data_ == nullptr) { // Not init yet |
539 | 0 | return; |
540 | 0 | } |
541 | 0 | uint32_t index = 0; |
542 | 0 | bool skip_linear_scan = false; |
543 | 0 | bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); |
544 | |
545 | 0 | if (!ok) { |
546 | 0 | return; |
547 | 0 | } |
548 | 0 | FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); |
549 | |
550 | 0 | if (!Valid()) { |
551 | 0 | if (status_.ok()) { |
552 | 0 | SeekToLastImpl(); |
553 | 0 | } |
554 | 0 | } else { |
555 | 0 | while (Valid() && CompareCurrentKey(seek_key) > 0) { |
556 | 0 | PrevImpl(); |
557 | 0 | } |
558 | 0 | } |
559 | 0 | } |
560 | | |
561 | 15.0k | void DataBlockIter::SeekToFirstImpl() { |
562 | 15.0k | if (data_ == nullptr) { // Not init yet |
563 | 0 | return; |
564 | 0 | } |
565 | 15.0k | SeekToRestartPoint(0); |
566 | 15.0k | bool is_shared = false; |
567 | 15.0k | ParseNextDataKey(&is_shared); |
568 | 15.0k | cur_entry_idx_ = 0; |
569 | 15.0k | } |
570 | | |
571 | 7.48k | void MetaBlockIter::SeekToFirstImpl() { |
572 | 7.48k | if (data_ == nullptr) { // Not init yet |
573 | 0 | return; |
574 | 0 | } |
575 | 7.48k | SeekToRestartPoint(0); |
576 | 7.48k | bool is_shared = false; |
577 | 7.48k | ParseNextKey<CheckAndDecodeEntry>(&is_shared); |
578 | 7.48k | cur_entry_idx_ = 0; |
579 | 7.48k | } |
580 | | |
581 | 6.08k | void IndexBlockIter::SeekToFirstImpl() { |
582 | | #ifndef NDEBUG |
583 | | if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) { |
584 | | return; |
585 | | } |
586 | | #endif |
587 | 6.08k | if (data_ == nullptr) { // Not init yet |
588 | 0 | return; |
589 | 0 | } |
590 | 6.08k | status_ = Status::OK(); |
591 | 6.08k | SeekToRestartPoint(0); |
592 | 6.08k | ParseNextIndexKey(); |
593 | 6.08k | cur_entry_idx_ = 0; |
594 | 6.08k | } |
595 | | |
596 | 970 | void DataBlockIter::SeekToLastImpl() { |
597 | 970 | if (data_ == nullptr) { // Not init yet |
598 | 0 | return; |
599 | 0 | } |
600 | 970 | SeekToRestartPoint(num_restarts_ - 1); |
601 | 970 | bool is_shared = false; |
602 | 970 | cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; |
603 | 970 | while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { |
604 | | // Keep skipping |
605 | 0 | ++cur_entry_idx_; |
606 | 0 | } |
607 | 970 | } |
608 | | |
609 | 0 | void MetaBlockIter::SeekToLastImpl() { |
610 | 0 | if (data_ == nullptr) { // Not init yet |
611 | 0 | return; |
612 | 0 | } |
613 | 0 | SeekToRestartPoint(num_restarts_ - 1); |
614 | 0 | bool is_shared = false; |
615 | 0 | assert(num_restarts_ >= 1); |
616 | 0 | cur_entry_idx_ = |
617 | 0 | static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_); |
618 | 0 | while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) && |
619 | 0 | NextEntryOffset() < restarts_) { |
620 | | // Will probably never reach here since restart_interval is always 1 |
621 | 0 | ++cur_entry_idx_; |
622 | 0 | } |
623 | 0 | } |
624 | | |
625 | 389 | void IndexBlockIter::SeekToLastImpl() { |
626 | 389 | if (data_ == nullptr) { // Not init yet |
627 | 0 | return; |
628 | 0 | } |
629 | 389 | status_ = Status::OK(); |
630 | 389 | SeekToRestartPoint(num_restarts_ - 1); |
631 | 389 | cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; |
632 | 389 | while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { |
633 | 0 | ++cur_entry_idx_; |
634 | 0 | } |
635 | 389 | } |
636 | | |
637 | | template <class TValue> |
638 | | template <typename DecodeEntryFunc> |
639 | 556k | bool BlockIter<TValue>::ParseNextKey(bool* is_shared) { |
640 | 556k | current_ = NextEntryOffset(); |
641 | 556k | const char* p = data_ + current_; |
642 | 556k | const char* limit = data_ + restarts_; // Restarts come right after data |
643 | | |
644 | 556k | if (p >= limit) { |
645 | | // No more entries to return. Mark as invalid. |
646 | 33.1k | current_ = restarts_; |
647 | 33.1k | restart_index_ = num_restarts_; |
648 | 33.1k | return false; |
649 | 33.1k | } |
650 | | // Decode next entry |
651 | 523k | uint32_t shared, non_shared, value_length; |
652 | 523k | p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); |
653 | 523k | if (p == nullptr || raw_key_.Size() < shared) { |
654 | 0 | CorruptionError(); |
655 | 0 | return false; |
656 | 523k | } else { |
657 | 523k | if (shared == 0) { |
658 | 274k | *is_shared = false; |
659 | | // If this key doesn't share any bytes with prev key, and no min timestamp |
660 | | // needs to be padded to the key, then we don't need to decode it and |
661 | | // can use its address in the block directly (no copy). |
662 | 274k | UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared)); |
663 | 274k | } else { |
664 | | // This key shares `shared` bytes with the prev key, so we need to decode it |
665 | 248k | *is_shared = true; |
666 | | // If user-defined timestamp is stripped from user key before keys are |
667 | | // delta encoded, the decoded key consisting of the shared and non shared |
668 | | // bytes does not have a user-defined timestamp yet. We need to pad the min |
669 | | // timestamp to it. |
670 | 248k | if (pad_min_timestamp_) { |
671 | 0 | raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_); |
672 | 248k | } else { |
673 | 248k | raw_key_.TrimAppend(shared, p, non_shared); |
674 | 248k | } |
675 | 248k | } |
676 | 523k | value_ = Slice(p + non_shared, value_length); |
677 | 523k | if (shared == 0) { |
678 | 468k | while (restart_index_ + 1 < num_restarts_ && |
679 | 468k | GetRestartPoint(restart_index_ + 1) < current_) { |
680 | 193k | ++restart_index_; |
681 | 193k | } |
682 | 274k | } |
683 | | // else we are in the middle of a restart interval and the restart_index_ |
684 | | // thus has not changed |
685 | 523k | return true; |
686 | 523k | } |
687 | 523k | }
Instantiation: bool rocksdb::BlockIter<rocksdb::Slice>::ParseNextKey<rocksdb::DecodeEntry>(bool*) (count: 249k)
Instantiation: bool rocksdb::BlockIter<rocksdb::IndexValue>::ParseNextKey<rocksdb::DecodeEntryV4>(bool*) (count: 16.8k)
Unexecuted instantiation: bool rocksdb::BlockIter<rocksdb::IndexValue>::ParseNextKey<rocksdb::DecodeEntry>(bool*)
Instantiation: bool rocksdb::BlockIter<rocksdb::Slice>::ParseNextKey<rocksdb::CheckAndDecodeEntry>(bool*) (count: 289k)
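A standalone sketch of the delta-key reconstruction that ParseNextKey performs via raw_key_.TrimAppend(shared, p, non_shared): keep the first `shared` bytes of the previous key and append the `non_shared` bytes read from the block. The TrimAppend function and key_buf below are simplified stand-ins using std::string, not the real IterKey.

// Sketch of shared-prefix key reconstruction.
#include <cassert>
#include <string>

static void TrimAppend(std::string* key_buf, size_t shared, const char* delta,
                       size_t non_shared) {
  key_buf->resize(shared);             // drop the previous key's tail
  key_buf->append(delta, non_shared);  // splice in the new suffix
}

int main() {
  std::string key_buf = "user0001";  // previous key in the restart interval
  // Next entry encoded as shared = 4 ("user"), non_shared bytes = "0002".
  TrimAppend(&key_buf, 4, "0002", 4);
  assert(key_buf == "user0002");
  // A restart entry has shared == 0, so the key is taken verbatim from the
  // block, which is why ParseNextKey can reference it without copying.
  TrimAppend(&key_buf, 0, "user0100", 8);
  assert(key_buf == "user0100");
  return 0;
}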
688 | | |
689 | 249k | bool DataBlockIter::ParseNextDataKey(bool* is_shared) { |
690 | 249k | if (ParseNextKey<DecodeEntry>(is_shared)) { |
691 | | #ifndef NDEBUG |
692 | | if (global_seqno_ != kDisableGlobalSequenceNumber) { |
693 | | // If we are reading a file with a global sequence number we should |
694 | | // expect that all encoded sequence numbers are zeros and any value |
695 | | // type is kTypeValue, kTypeMerge, kTypeDeletion, |
696 | | // kTypeDeletionWithTimestamp, kTypeRangeDeletion, or |
697 | | // kTypeWideColumnEntity. |
698 | | uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey()); |
699 | | SequenceNumber seqno; |
700 | | ValueType value_type; |
701 | | UnPackSequenceAndType(packed, &seqno, &value_type); |
702 | | assert(value_type == ValueType::kTypeValue || |
703 | | value_type == ValueType::kTypeMerge || |
704 | | value_type == ValueType::kTypeDeletion || |
705 | | value_type == ValueType::kTypeDeletionWithTimestamp || |
706 | | value_type == ValueType::kTypeRangeDeletion || |
707 | | value_type == ValueType::kTypeWideColumnEntity); |
708 | | assert(seqno == 0); |
709 | | } |
710 | | #endif // NDEBUG |
711 | 234k | return true; |
712 | 234k | } else { |
713 | 15.4k | return false; |
714 | 15.4k | } |
715 | 249k | } |
716 | | |
717 | 16.8k | bool IndexBlockIter::ParseNextIndexKey() { |
718 | 16.8k | bool is_shared = false; |
719 | 16.8k | bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared) |
720 | 16.8k | : ParseNextKey<DecodeEntry>(&is_shared); |
721 | 16.8k | if (ok) { |
722 | 10.9k | if (value_delta_encoded_ || global_seqno_state_ != nullptr || |
723 | 10.9k | pad_min_timestamp_) { |
724 | 10.9k | DecodeCurrentValue(is_shared); |
725 | 10.9k | } |
726 | 10.9k | } |
727 | 16.8k | return ok; |
728 | 16.8k | } |
729 | | |
730 | | // The format: |
731 | | // restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) |
732 | | // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) |
733 | | // ... |
734 | | // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) |
735 | | // where, k is key, v is value, and its encoding is in parentheses. |
736 | | // The format of each key is (shared_size, non_shared_size, shared, non_shared) |
737 | | // The format of each value, i.e., block handle, is (offset, size) whenever |
738 | | // is_shared is false, which includes the first entry in each restart point. |
739 | | // Otherwise, the format is delta-size = the size of the current block - the |
740 | | // size of the last block. |
741 | 10.9k | void IndexBlockIter::DecodeCurrentValue(bool is_shared) { |
742 | 10.9k | Slice v(value_.data(), data_ + restarts_ - value_.data()); |
743 | | // Delta encoding is used if `shared` != 0. |
744 | 10.9k | Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( |
745 | 10.9k | &v, have_first_key_, |
746 | 10.9k | (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr); |
747 | 10.9k | assert(decode_s.ok()); |
748 | 10.9k | value_ = Slice(value_.data(), v.data() - value_.data()); |
749 | | |
750 | 10.9k | if (global_seqno_state_ != nullptr) { |
751 | | // Overwrite sequence number the same way as in DataBlockIter. |
752 | |
753 | 0 | IterKey& first_internal_key = global_seqno_state_->first_internal_key; |
754 | 0 | first_internal_key.SetInternalKey(decoded_value_.first_internal_key, |
755 | 0 | /* copy */ true); |
756 | |
757 | 0 | assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); |
758 | |
759 | 0 | ValueType value_type = ExtractValueType(first_internal_key.GetKey()); |
760 | 0 | assert(value_type == ValueType::kTypeValue || |
761 | 0 | value_type == ValueType::kTypeMerge || |
762 | 0 | value_type == ValueType::kTypeDeletion || |
763 | 0 | value_type == ValueType::kTypeRangeDeletion || |
764 | 0 | value_type == ValueType::kTypeWideColumnEntity); |
765 | |
766 | 0 | first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, |
767 | 0 | value_type); |
768 | 0 | decoded_value_.first_internal_key = first_internal_key.GetKey(); |
769 | 0 | } |
770 | 10.9k | if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) { |
771 | 0 | first_internal_key_with_ts_.clear(); |
772 | 0 | PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_, |
773 | 0 | decoded_value_.first_internal_key, ts_sz_); |
774 | 0 | decoded_value_.first_internal_key = first_internal_key_with_ts_; |
775 | 0 | } |
776 | 10.9k | } |
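A numeric sketch of the index value layout described in the comment above DecodeCurrentValue, under the simplifying assumptions that data blocks are laid out back to back and that any per-block trailer is ignored: restart entries carry the full (offset, size) handle, and shared entries carry only delta-size = current size - last size.

// Illustrative arithmetic for the delta-encoded handle (assumptions noted above).
#include <cassert>
#include <cstdint>

struct Handle {
  uint64_t offset;
  uint64_t size;
};

int main() {
  // Restart entry (is_shared == false): the handle is encoded in full.
  Handle last{/*offset=*/0, /*size=*/4096};

  // Next entry in the same restart interval: only the size delta is stored.
  uint64_t delta_size = 128;  // current block is 128 bytes larger than the last

  Handle cur;
  cur.size = last.size + delta_size;     // delta-size relation from the comment
  cur.offset = last.offset + last.size;  // assumption: blocks are contiguous
  assert(cur.offset == 4096 && cur.size == 4224);
  return 0;
}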
777 | | |
778 | | template <class TValue> |
779 | | void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target, |
780 | | uint32_t index, |
781 | 33.5k | bool skip_linear_scan) { |
782 | | // SeekToRestartPoint() only does the lookup in the restart block. We need |
783 | | // to follow it up with NextImpl() to position the iterator at the restart |
784 | | // key. |
785 | 33.5k | SeekToRestartPoint(index); |
786 | 33.5k | cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1; |
787 | 33.5k | NextImpl(); |
788 | | |
789 | 33.5k | if (!skip_linear_scan) { |
790 | | // Linear search (within restart block) for first key >= target |
791 | 5.10k | uint32_t max_offset; |
792 | 5.10k | if (index + 1 < num_restarts_) { |
793 | | // We are in a non-last restart interval. Since `BinarySeek()` guarantees |
794 | | // the next restart key is strictly greater than `target`, we can |
795 | | // terminate upon reaching it without any additional key comparison. |
796 | 0 | max_offset = GetRestartPoint(index + 1); |
797 | 5.10k | } else { |
798 | | // We are in the last restart interval. The while-loop will terminate by |
799 | | // `Valid()` returning false upon advancing past the block's last key. |
800 | 5.10k | max_offset = std::numeric_limits<uint32_t>::max(); |
801 | 5.10k | } |
802 | 5.10k | while (true) { |
803 | 5.10k | NextImpl(); |
804 | 5.10k | if (!Valid()) { |
805 | | // TODO(cbi): per key-value checksum will not be verified in UpdateKey() |
806 | | // since Valid() will return false. |
807 | 4.93k | break; |
808 | 4.93k | } |
809 | 175 | if (current_ == max_offset) { |
810 | 0 | assert(CompareCurrentKey(target) > 0); |
811 | 0 | break; |
812 | 175 | } else if (CompareCurrentKey(target) >= 0) { |
813 | 175 | break; |
814 | 175 | } |
815 | 175 | } |
816 | 5.10k | } |
817 | 33.5k | }
Instantiation: rocksdb::BlockIter<rocksdb::Slice>::FindKeyAfterBinarySeek(rocksdb::Slice const&, unsigned int, bool) (count: 31.7k)
Instantiation: rocksdb::BlockIter<rocksdb::IndexValue>::FindKeyAfterBinarySeek(rocksdb::Slice const&, unsigned int, bool) (count: 1.82k)
818 | | |
819 | | // Binary searches in restart array to find the starting restart point for the |
820 | | // linear scan, and stores it in `*index`. Assumes restart array does not |
821 | | // contain duplicate keys. It is guaranteed that the restart key at `*index + 1` |
822 | | // is strictly greater than `target` or does not exist (this can be used to |
823 | | // elide a comparison when linear scan reaches all the way to the next restart |
824 | | // key). Furthermore, `*skip_linear_scan` is set to indicate whether the |
825 | | // `*index`th restart key is the final result so that key does not need to be |
826 | | // compared again later. |
827 | | template <class TValue> |
828 | | template <typename DecodeKeyFunc> |
829 | | bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index, |
830 | 33.5k | bool* skip_linear_scan) { |
831 | 33.5k | if (restarts_ == 0) { |
832 | | // SST files dedicated to range tombstones are written with index blocks |
833 | | // that have no keys while also having `num_restarts_ == 1`. This would |
834 | | // cause a problem for `BinarySeek()` as it'd try to access the first key |
835 | | // which does not exist. We identify such blocks by the offset at which |
836 | | // their restarts are stored, and return false to prevent any attempted |
837 | | // key accesses. |
838 | 0 | return false; |
839 | 0 | } |
840 | | |
841 | 33.5k | *skip_linear_scan = false; |
842 | | // Loop invariants: |
843 | | // - Restart key at index `left` is less than or equal to the target key. The |
844 | | // sentinel index `-1` is considered to have a key that is less than all |
845 | | // keys. |
846 | | // - Any restart keys after index `right` are strictly greater than the target |
847 | | // key. |
848 | 33.5k | int64_t left = -1, right = num_restarts_ - 1; |
849 | 85.1k | while (left != right) { |
850 | | // The `mid` is computed by rounding up so it lands in (`left`, `right`]. |
851 | 51.6k | int64_t mid = left + (right - left + 1) / 2; |
852 | 51.6k | uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid)); |
853 | 51.6k | uint32_t shared, non_shared; |
854 | 51.6k | const char* key_ptr = DecodeKeyFunc()( |
855 | 51.6k | data_ + region_offset, data_ + restarts_, &shared, &non_shared); |
856 | 51.6k | if (key_ptr == nullptr || (shared != 0)) { |
857 | 0 | CorruptionError(); |
858 | 0 | return false; |
859 | 0 | } |
860 | 51.6k | Slice mid_key(key_ptr, non_shared); |
861 | 51.6k | UpdateRawKeyAndMaybePadMinTimestamp(mid_key); |
862 | 51.6k | int cmp = CompareCurrentKey(target); |
863 | 51.6k | if (cmp < 0) { |
864 | | // Key at "mid" is smaller than "target". Therefore all |
865 | | // blocks before "mid" are uninteresting. |
866 | 16.9k | left = mid; |
867 | 34.6k | } else if (cmp > 0) { |
868 | | // Key at "mid" is >= "target". Therefore all blocks at or |
869 | | // after "mid" are uninteresting. |
870 | 15.1k | right = mid - 1; |
871 | 19.4k | } else { |
872 | 19.4k | *skip_linear_scan = true; |
873 | 19.4k | left = right = mid; |
874 | 19.4k | } |
875 | 51.6k | } |
876 | | |
877 | 33.5k | if (left == -1) { |
878 | | // All keys in the block were strictly greater than `target`. So the very |
879 | | // first key in the block is the final seek result. |
880 | 8.91k | *skip_linear_scan = true; |
881 | 8.91k | *index = 0; |
882 | 24.6k | } else { |
883 | 24.6k | *index = static_cast<uint32_t>(left); |
884 | 24.6k | } |
885 | 33.5k | return true; |
886 | 33.5k | }
Instantiation: bool rocksdb::BlockIter<rocksdb::Slice>::BinarySeek<rocksdb::DecodeKey>(rocksdb::Slice const&, unsigned int*, bool*) (count: 31.7k)
Instantiation: bool rocksdb::BlockIter<rocksdb::IndexValue>::BinarySeek<rocksdb::DecodeKeyV4>(rocksdb::Slice const&, unsigned int*, bool*) (count: 1.82k)
Unexecuted instantiation: bool rocksdb::BlockIter<rocksdb::IndexValue>::BinarySeek<rocksdb::DecodeKey>(rocksdb::Slice const&, unsigned int*, bool*) |
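The binary search over restart keys can be mirrored by a self-contained sketch that preserves BinarySeek's loop invariants: the restart key at `left` is <= target (with -1 as a sentinel), and every restart key after `right` is strictly greater. The BinarySeekSketch function and the restart_keys vector below are illustrative stand-ins for decoding keys at restart points.

// Sketch of BinarySeek's restart-point binary search.
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

static bool BinarySeekSketch(const std::vector<std::string>& restart_keys,
                             const std::string& target, uint32_t* index,
                             bool* skip_linear_scan) {
  if (restart_keys.empty()) {
    return false;  // e.g. an index block that stores no keys
  }
  *skip_linear_scan = false;
  int64_t left = -1, right = static_cast<int64_t>(restart_keys.size()) - 1;
  while (left != right) {
    int64_t mid = left + (right - left + 1) / 2;  // rounds up into (left, right]
    int cmp = restart_keys[static_cast<size_t>(mid)].compare(target);
    if (cmp < 0) {
      left = mid;       // everything before mid is too small
    } else if (cmp > 0) {
      right = mid - 1;  // everything at or after mid is too large
    } else {
      *skip_linear_scan = true;  // exact match: no linear scan needed
      left = right = mid;
    }
  }
  if (left == -1) {
    *skip_linear_scan = true;  // all restart keys are greater than target
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}

int main() {
  std::vector<std::string> restarts = {"b", "f", "m", "t"};
  uint32_t idx = 0;
  bool skip = false;
  assert(BinarySeekSketch(restarts, "g", &idx, &skip) && idx == 1 && !skip);
  assert(BinarySeekSketch(restarts, "m", &idx, &skip) && idx == 2 && skip);
  assert(BinarySeekSketch(restarts, "a", &idx, &skip) && idx == 0 && skip);
  return 0;
}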
887 | | |
888 | | // Compare target key and the block key of the block of `block_index`. |
889 | | // Return -1 if error. |
890 | 0 | int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { |
891 | 0 | uint32_t region_offset = GetRestartPoint(block_index); |
892 | 0 | uint32_t shared, non_shared; |
893 | 0 | const char* key_ptr = |
894 | 0 | value_delta_encoded_ |
895 | 0 | ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, |
896 | 0 | &non_shared) |
897 | 0 | : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, |
898 | 0 | &non_shared); |
899 | 0 | if (key_ptr == nullptr || (shared != 0)) { |
900 | 0 | CorruptionError(); |
901 | 0 | return 1; // Return target is smaller |
902 | 0 | } |
903 | 0 | Slice block_key(key_ptr, non_shared); |
904 | 0 | UpdateRawKeyAndMaybePadMinTimestamp(block_key); |
905 | 0 | return CompareCurrentKey(target); |
906 | 0 | } |
907 | | |
908 | | // Binary search in block_ids to find the first block |
909 | | // with a key >= target |
910 | | bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, |
911 | | uint32_t* block_ids, uint32_t left, |
912 | | uint32_t right, uint32_t* index, |
913 | 0 | bool* prefix_may_exist) { |
914 | 0 | assert(left <= right); |
915 | 0 | assert(index); |
916 | 0 | assert(prefix_may_exist); |
917 | 0 | *prefix_may_exist = true; |
918 | 0 | uint32_t left_bound = left; |
919 | |
920 | 0 | while (left <= right) { |
921 | 0 | uint32_t mid = (right + left) / 2; |
922 | |
923 | 0 | int cmp = CompareBlockKey(block_ids[mid], target); |
924 | 0 | if (!status_.ok()) { |
925 | 0 | return false; |
926 | 0 | } |
927 | 0 | if (cmp < 0) { |
928 | | // Key at "mid" is smaller than "target". Therefore all
929 | | // blocks before or at "mid" are uninteresting. |
930 | 0 | left = mid + 1; |
931 | 0 | } else { |
932 | | // Key at "mid" is >= "target". Therefore all blocks
933 | | // after "mid" are uninteresting. |
934 | | // If there is only one block left, we found it. |
935 | 0 | if (left == right) { |
936 | 0 | break; |
937 | 0 | } |
938 | 0 | right = mid; |
939 | 0 | } |
940 | 0 | } |
941 | | |
942 | 0 | if (left == right) { |
943 | | // In one of the two following cases:
944 | | // (1) `left` is the first entry of block_ids, or
945 | | // (2) there is a gap of blocks between the block of `left` and `left-1`,
946 | | // we can further distinguish whether the key is in the block or does
947 | | // not exist, by comparing the target key with the key of the block
948 | | // immediately preceding the block found.
949 | 0 | if (block_ids[left] > 0 && |
950 | 0 | (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && |
951 | 0 | CompareBlockKey(block_ids[left] - 1, target) > 0) { |
952 | 0 | current_ = restarts_; |
953 | 0 | *prefix_may_exist = false; |
954 | 0 | return false; |
955 | 0 | } |
956 | | |
957 | 0 | *index = block_ids[left]; |
958 | 0 | return true; |
959 | 0 | } else { |
960 | 0 | assert(left > right); |
961 | | |
962 | | // If the next block key is larger than seek key, it is possible that |
963 | | // no key shares the prefix with `target`, or all keys with the same |
964 | | // prefix as `target` are smaller than `target`. In the latter case, we
965 | | // are mandated to set the position to where a total order seek would
966 | | // land. Then, either:
967 | | // (1) `target` falls into the range of the next block. In this case,
968 | | // we can place the iterator at the next block, or
969 | | // (2) `target` is larger than all block keys. In this case we can
970 | | // keep the iterator invalidated without setting `prefix_may_exist`
971 | | // to false.
972 | | // We might sometimes end up setting the total order position even
973 | | // though no key shares the prefix with `target`, but that still
974 | | // follows the contract.
975 | 0 | uint32_t right_index = block_ids[right]; |
976 | 0 | assert(right_index + 1 <= num_restarts_); |
977 | 0 | if (right_index + 1 < num_restarts_) { |
978 | 0 | if (CompareBlockKey(right_index + 1, target) >= 0) { |
979 | 0 | *index = right_index + 1; |
980 | 0 | return true; |
981 | 0 | } else { |
982 | | // We have to set the flag here because we are not positioning |
983 | | // the iterator to the total order position. |
984 | 0 | *prefix_may_exist = false; |
985 | 0 | } |
986 | 0 | } |
987 | | |
988 | | // Mark iterator invalid |
989 | 0 | current_ = restarts_; |
990 | 0 | return false; |
991 | 0 | } |
992 | 0 | } |
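The core of BinaryBlockIndexSeek() above is a lower-bound search over the candidate restart indices handed back by the prefix index; the gap/previous-block checks then refine the result. A minimal sketch of just the lower-bound step, assuming `block_keys[id]` stands in for CompareBlockKey(id, target) (FirstBlockAtOrAfter is an illustrative name):

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

// Illustrative only: find the first candidate block whose separator key is
// >= target. Returns false when target is larger than every candidate key,
// which corresponds to leaving the iterator invalidated.
bool FirstBlockAtOrAfter(const std::vector<std::string>& block_keys,
                         const std::vector<uint32_t>& block_ids,
                         const std::string& target, uint32_t* index) {
  auto it = std::lower_bound(
      block_ids.begin(), block_ids.end(), target,
      [&](uint32_t id, const std::string& t) { return block_keys[id] < t; });
  if (it == block_ids.end()) {
    return false;  // target is larger than every candidate block key
  }
  *index = *it;
  return true;
}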
993 | | |
994 | | bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, |
995 | 0 | bool* prefix_may_exist) { |
996 | 0 | assert(index); |
997 | 0 | assert(prefix_may_exist); |
998 | 0 | assert(prefix_index_); |
999 | 0 | *prefix_may_exist = true; |
1000 | 0 | Slice seek_key = target; |
1001 | 0 | if (raw_key_.IsUserKey()) { |
1002 | 0 | seek_key = ExtractUserKey(target); |
1003 | 0 | } |
1004 | 0 | uint32_t* block_ids = nullptr; |
1005 | 0 | uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); |
1006 | |
1007 | 0 | if (num_blocks == 0) { |
1008 | 0 | current_ = restarts_; |
1009 | 0 | *prefix_may_exist = false; |
1010 | 0 | return false; |
1011 | 0 | } else { |
1012 | 0 | assert(block_ids); |
1013 | 0 | return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, |
1014 | 0 | prefix_may_exist); |
1015 | 0 | } |
1016 | 0 | } |
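PrefixSeek() above delegates the bucket lookup to BlockPrefixIndex::GetBlocks() and only then binary-searches the candidates: an empty bucket maps to the `num_blocks == 0` branch (seek fails, `prefix_may_exist = false`), a non-empty one feeds BinaryBlockIndexSeek(). A toy stand-in for that lookup, assuming a simplified signature and a std::map in place of the real hash-based index (ToyPrefixIndex is hypothetical):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Illustrative only: maps a prefix to the sorted restart indices of the
// blocks whose keys carry that prefix, the shape of data the real
// BlockPrefixIndex provides to PrefixSeek().
struct ToyPrefixIndex {
  std::map<std::string, std::vector<uint32_t>> buckets;

  uint32_t GetBlocks(const std::string& prefix,
                     const uint32_t** block_ids) const {
    auto it = buckets.find(prefix);
    if (it == buckets.end()) {
      *block_ids = nullptr;
      return 0;  // prefix cannot exist in this block
    }
    *block_ids = it->second.data();
    return static_cast<uint32_t>(it->second.size());
  }
};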
1017 | | |
1018 | 34.0k | uint32_t Block::NumRestarts() const { |
1019 | 34.0k | assert(size_ >= 2 * sizeof(uint32_t)); |
1020 | 34.0k | uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); |
1021 | 34.0k | uint32_t num_restarts = block_footer; |
1022 | 34.0k | if (size_ > kMaxBlockSizeSupportedByHashIndex) { |
1023 | | // In BlockBuilder, we have ensured a block with HashIndex is less than |
1024 | | // kMaxBlockSizeSupportedByHashIndex (64KiB). |
1025 | | // |
1026 | | // Therefore, if we encounter a block with a size > 64KiB, the block |
1027 | | // cannot have HashIndex. So the footer will be directly interpreted as
1028 | | // num_restarts. |
1029 | | // |
1030 | | // Such a check is for backward compatibility. It ensures that a legacy
1031 | | // block with a very large num_restarts, i.e. >= 0x80000000, is interpreted
1032 | | // correctly as having no HashIndex even if the MSB of num_restarts is set.
1033 | 1.06k | return num_restarts; |
1034 | 1.06k | } |
1035 | 32.9k | BlockBasedTableOptions::DataBlockIndexType index_type; |
1036 | 32.9k | UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); |
1037 | 32.9k | return num_restarts; |
1038 | 34.0k | } |
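NumRestarts() above and IndexType() below both rely on how the 4-byte block footer multiplexes the data-block index type with the restart count (UnPackIndexTypeAndNumRestarts from data_block_footer.cc). A minimal sketch of one plausible packing, assuming the type flag lives in the MSB and the count in the low 31 bits; the exact bit layout here is an assumption for illustration:

#include <cassert>
#include <cstdint>

// Illustrative only: pack/unpack an index-type flag and the restart count
// into one fixed32 footer word, in the spirit of
// PackIndexTypeAndNumRestarts / UnPackIndexTypeAndNumRestarts.
constexpr uint32_t kIndexTypeBit = 1u << 31;  // assumed flag position

inline uint32_t PackFooter(bool uses_hash_index, uint32_t num_restarts) {
  assert(num_restarts < kIndexTypeBit);  // must fit in 31 bits
  return (uses_hash_index ? kIndexTypeBit : 0u) | num_restarts;
}

inline void UnpackFooter(uint32_t footer, bool* uses_hash_index,
                         uint32_t* num_restarts) {
  *uses_hash_index = (footer & kIndexTypeBit) != 0;
  *num_restarts = footer & ~kIndexTypeBit;
}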
1039 | | |
1040 | 34.0k | BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { |
1041 | 34.0k | assert(size_ >= 2 * sizeof(uint32_t)); |
1042 | 34.0k | if (size_ > kMaxBlockSizeSupportedByHashIndex) { |
1043 | | // The check is for the same reason as that in NumRestarts() |
1044 | 1.06k | return BlockBasedTableOptions::kDataBlockBinarySearch; |
1045 | 1.06k | } |
1046 | 32.9k | uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); |
1047 | 32.9k | uint32_t num_restarts = block_footer; |
1048 | 32.9k | BlockBasedTableOptions::DataBlockIndexType index_type; |
1049 | 32.9k | UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); |
1050 | 32.9k | return index_type; |
1051 | 34.0k | } |
1052 | | |
1053 | 34.0k | Block::~Block() { |
1054 | | // This sync point can be re-enabled if RocksDB can control the |
1055 | | // initialization order of any/all static options created by the user. |
1056 | | // TEST_SYNC_POINT("Block::~Block"); |
1057 | 34.0k | delete[] kv_checksum_; |
1058 | 34.0k | } |
1059 | | |
1060 | | Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, |
1061 | | Statistics* statistics) |
1062 | | : contents_(std::move(contents)), |
1063 | | data_(contents_.data.data()), |
1064 | | size_(contents_.data.size()), |
1065 | | restart_offset_(0), |
1066 | 34.0k | num_restarts_(0) { |
1067 | 34.0k | TEST_SYNC_POINT("Block::Block:0"); |
1068 | 34.0k | if (size_ < sizeof(uint32_t)) { |
1069 | 0 | size_ = 0; // Error marker |
1070 | 34.0k | } else { |
1071 | | // Should only decode restart points for uncompressed blocks |
1072 | 34.0k | num_restarts_ = NumRestarts(); |
1073 | 34.0k | switch (IndexType()) { |
1074 | 33.9k | case BlockBasedTableOptions::kDataBlockBinarySearch: |
1075 | 33.9k | restart_offset_ = static_cast<uint32_t>(size_) - |
1076 | 33.9k | (1 + num_restarts_) * sizeof(uint32_t); |
1077 | 33.9k | if (restart_offset_ > size_ - sizeof(uint32_t)) { |
1078 | | // The size is too small for NumRestarts() and therefore |
1079 | | // restart_offset_ wrapped around. |
1080 | 0 | size_ = 0; |
1081 | 0 | } |
1082 | 33.9k | break; |
1083 | 0 | case BlockBasedTableOptions::kDataBlockBinaryAndHash: |
1084 | 0 | if (size_ < sizeof(uint32_t) /* block footer */ + |
1085 | 0 | sizeof(uint16_t) /* NUM_BUCK */) { |
1086 | 0 | size_ = 0; |
1087 | 0 | break; |
1088 | 0 | } |
1089 | | |
1090 | 0 | uint16_t map_offset; |
1091 | 0 | data_block_hash_index_.Initialize( |
1092 | 0 | data_, static_cast<uint16_t>(size_ - sizeof(uint32_t)), /*chop off |
1093 | | NUM_RESTARTS*/ |
1094 | 0 | &map_offset); |
1095 | |
1096 | 0 | restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); |
1097 | |
1098 | 0 | if (restart_offset_ > map_offset) { |
1099 | | // map_offset is too small for NumRestarts() and |
1100 | | // therefore restart_offset_ wrapped around. |
1101 | 0 | size_ = 0; |
1102 | 0 | break; |
1103 | 0 | } |
1104 | 0 | break; |
1105 | 0 | default: |
1106 | 0 | size_ = 0; // Error marker |
1107 | 34.0k | } |
1108 | 34.0k | } |
1109 | 33.9k | if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { |
1110 | 0 | read_amp_bitmap_.reset(new BlockReadAmpBitmap( |
1111 | 0 | restart_offset_, read_amp_bytes_per_bit, statistics)); |
1112 | 0 | } |
1113 | 33.9k | } |
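For the kDataBlockBinarySearch case in the constructor above, the restart array plus the footer occupy the last (1 + num_restarts) * 4 bytes of the block, and a wrapped-around offset signals a corrupt footer. A minimal standalone sketch of that arithmetic (ComputeRestartOffset is an illustrative name; it assumes size >= sizeof(uint32_t), which the constructor has already checked):

#include <cstddef>
#include <cstdint>

// Illustrative only: the restart entries (num_restarts fixed32 values) plus
// the footer (one fixed32) sit at the tail of the block, so the array starts
// at size - (1 + num_restarts) * sizeof(uint32_t). Returns false when a
// bogus num_restarts makes the subtraction wrap past the footer, which is
// how the constructor detects a corrupt block and marks size_ = 0.
inline bool ComputeRestartOffset(size_t size, uint32_t num_restarts,
                                 uint32_t* restart_offset) {
  *restart_offset =
      static_cast<uint32_t>(size) -
      (1 + num_restarts) * static_cast<uint32_t>(sizeof(uint32_t));
  return *restart_offset <= static_cast<uint32_t>(size) - sizeof(uint32_t);
}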
1114 | | |
1115 | | void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, |
1116 | 11.6k | const Comparator* raw_ucmp) { |
1117 | 11.6k | protection_bytes_per_key_ = 0; |
1118 | 11.6k | if (protection_bytes_per_key > 0 && num_restarts_ > 0) { |
1119 | | // NewDataIterator() is called with protection_bytes_per_key_ = 0. |
1120 | | // This is intended since checksum is not constructed yet. |
1121 | | // |
1122 | | // We do not know global_seqno yet, so checksum computation and |
1123 | | // verification all assume global_seqno = 0. |
1124 | | // TODO(yuzhangyu): handle the implication of padding timestamp for kv |
1125 | | // protection. |
1126 | 0 | std::unique_ptr<DataBlockIter> iter{NewDataIterator( |
1127 | 0 | raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */, |
1128 | 0 | nullptr /* stats */, true /* block_contents_pinned */, |
1129 | 0 | true /* user_defined_timestamps_persisted */)}; |
1130 | 0 | if (iter->status().ok()) { |
1131 | 0 | block_restart_interval_ = iter->GetRestartInterval(); |
1132 | 0 | } |
1133 | 0 | uint32_t num_keys = 0; |
1134 | 0 | if (iter->status().ok()) { |
1135 | 0 | num_keys = iter->NumberOfKeys(block_restart_interval_); |
1136 | 0 | } |
1137 | 0 | if (iter->status().ok()) { |
1138 | 0 | checksum_size_ = num_keys * protection_bytes_per_key; |
1139 | 0 | kv_checksum_ = new char[(size_t)checksum_size_]; |
1140 | 0 | size_t i = 0; |
1141 | 0 | iter->SeekToFirst(); |
1142 | 0 | while (iter->Valid()) { |
1143 | 0 | GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, |
1144 | 0 | iter->key(), iter->value()); |
1145 | 0 | iter->Next(); |
1146 | 0 | i += protection_bytes_per_key; |
1147 | 0 | } |
1148 | 0 | assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); |
1149 | 0 | } |
1150 | 0 | if (!iter->status().ok()) { |
1151 | 0 | size_ = 0; // Error marker |
1152 | 0 | return; |
1153 | 0 | } |
1154 | 0 | protection_bytes_per_key_ = protection_bytes_per_key; |
1155 | 0 | } |
1156 | 11.6k | } |
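InitializeDataBlockProtectionInfo() above lays the per-key checksums out in a single flat buffer, with entry i at offset i * protection_bytes_per_key. A minimal sketch of that layout, with std::hash standing in for the real GenerateKVChecksum() helper (BuildKvChecksums is an illustrative name only):

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <functional>
#include <string>
#include <utility>
#include <vector>

// Illustrative only: one flat buffer holding a fixed-width checksum per key,
// entry i at offset i * bytes_per_key, matching the kv_checksum_ layout.
std::vector<char> BuildKvChecksums(
    const std::vector<std::pair<std::string, std::string>>& kvs,
    uint8_t bytes_per_key) {
  std::vector<char> buf(kvs.size() * bytes_per_key);
  for (size_t i = 0; i < kvs.size(); ++i) {
    uint64_t h = std::hash<std::string>{}(kvs[i].first + kvs[i].second);
    // Truncate (or zero-pad) the hash to the configured width.
    std::memcpy(buf.data() + i * bytes_per_key, &h,
                std::min<size_t>(bytes_per_key, sizeof(h)));
  }
  return buf;
}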
1157 | | |
1158 | | void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, |
1159 | | const Comparator* raw_ucmp, |
1160 | | bool value_is_full, |
1161 | 7.47k | bool index_has_first_key) { |
1162 | 7.47k | protection_bytes_per_key_ = 0; |
1163 | 7.47k | if (num_restarts_ > 0 && protection_bytes_per_key > 0) { |
1164 | | // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They |
1165 | | // do not impact how the index block is parsed. During checksum |
1166 | | // construction/verification, we use the entire key buffer from |
1167 | | // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value |
1168 | | // checksum, and the content of this buffer does not change for different
1169 | | // values of `global_seqno` or `key_includes_seq`. |
1170 | | // TODO(yuzhangyu): handle the implication of padding timestamp for kv |
1171 | | // protection. |
1172 | 0 | std::unique_ptr<IndexBlockIter> iter{NewIndexIterator( |
1173 | 0 | raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr, |
1174 | 0 | nullptr /* Statistics */, true /* total_order_seek */, |
1175 | 0 | index_has_first_key /* have_first_key */, false /* key_includes_seq */, |
1176 | 0 | value_is_full, true /* block_contents_pinned */, |
1177 | 0 | true /* user_defined_timestamps_persisted*/, |
1178 | 0 | nullptr /* prefix_index */)}; |
1179 | 0 | if (iter->status().ok()) { |
1180 | 0 | block_restart_interval_ = iter->GetRestartInterval(); |
1181 | 0 | } |
1182 | 0 | uint32_t num_keys = 0; |
1183 | 0 | if (iter->status().ok()) { |
1184 | 0 | num_keys = iter->NumberOfKeys(block_restart_interval_); |
1185 | 0 | } |
1186 | 0 | if (iter->status().ok()) { |
1187 | 0 | checksum_size_ = num_keys * protection_bytes_per_key; |
1188 | 0 | kv_checksum_ = new char[(size_t)checksum_size_]; |
1189 | 0 | iter->SeekToFirst(); |
1190 | 0 | size_t i = 0; |
1191 | 0 | while (iter->Valid()) { |
1192 | 0 | GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, |
1193 | 0 | iter->key(), iter->raw_value()); |
1194 | 0 | iter->Next(); |
1195 | 0 | i += protection_bytes_per_key; |
1196 | 0 | } |
1197 | 0 | assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); |
1198 | 0 | } |
1199 | 0 | if (!iter->status().ok()) { |
1200 | 0 | size_ = 0; // Error marker |
1201 | 0 | return; |
1202 | 0 | } |
1203 | 0 | protection_bytes_per_key_ = protection_bytes_per_key; |
1204 | 0 | } |
1205 | 7.47k | } |
1206 | | |
1207 | | void Block::InitializeMetaIndexBlockProtectionInfo( |
1208 | 7.47k | uint8_t protection_bytes_per_key) { |
1209 | 7.47k | protection_bytes_per_key_ = 0; |
1210 | 7.47k | if (num_restarts_ > 0 && protection_bytes_per_key > 0) { |
1211 | 0 | std::unique_ptr<MetaBlockIter> iter{ |
1212 | 0 | NewMetaIterator(true /* block_contents_pinned */)}; |
1213 | 0 | if (iter->status().ok()) { |
1214 | 0 | block_restart_interval_ = iter->GetRestartInterval(); |
1215 | 0 | } |
1216 | 0 | uint32_t num_keys = 0; |
1217 | 0 | if (iter->status().ok()) { |
1218 | 0 | num_keys = iter->NumberOfKeys(block_restart_interval_); |
1219 | 0 | } |
1220 | 0 | if (iter->status().ok()) { |
1221 | 0 | checksum_size_ = num_keys * protection_bytes_per_key; |
1222 | 0 | kv_checksum_ = new char[(size_t)checksum_size_]; |
1223 | 0 | iter->SeekToFirst(); |
1224 | 0 | size_t i = 0; |
1225 | 0 | while (iter->Valid()) { |
1226 | 0 | GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, |
1227 | 0 | iter->key(), iter->value()); |
1228 | 0 | iter->Next(); |
1229 | 0 | i += protection_bytes_per_key; |
1230 | 0 | } |
1231 | 0 | assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); |
1232 | 0 | } |
1233 | 0 | if (!iter->status().ok()) { |
1234 | 0 | size_ = 0; // Error marker |
1235 | 0 | return; |
1236 | 0 | } |
1237 | 0 | protection_bytes_per_key_ = protection_bytes_per_key; |
1238 | 0 | } |
1239 | 7.47k | } |
1240 | | |
1241 | 14.9k | MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { |
1242 | 14.9k | MetaBlockIter* iter = new MetaBlockIter(); |
1243 | 14.9k | if (size_ < 2 * sizeof(uint32_t)) { |
1244 | 0 | iter->Invalidate(Status::Corruption("bad block contents")); |
1245 | 0 | return iter; |
1246 | 14.9k | } else if (num_restarts_ == 0) { |
1247 | | // Empty block. |
1248 | 0 | iter->Invalidate(Status::OK()); |
1249 | 14.9k | } else { |
1250 | 14.9k | iter->Initialize(data_, restart_offset_, num_restarts_, |
1251 | 14.9k | block_contents_pinned, protection_bytes_per_key_, |
1252 | 14.9k | kv_checksum_, block_restart_interval_); |
1253 | 14.9k | } |
1254 | 14.9k | return iter; |
1255 | 14.9k | } |
1256 | | |
1257 | | DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, |
1258 | | SequenceNumber global_seqno, |
1259 | | DataBlockIter* iter, Statistics* stats, |
1260 | | bool block_contents_pinned, |
1261 | 13.8k | bool user_defined_timestamps_persisted) { |
1262 | 13.8k | DataBlockIter* ret_iter; |
1263 | 13.8k | if (iter != nullptr) { |
1264 | 13.8k | ret_iter = iter; |
1265 | 13.8k | } else { |
1266 | 0 | ret_iter = new DataBlockIter; |
1267 | 0 | } |
1268 | 13.8k | if (size_ < 2 * sizeof(uint32_t)) { |
1269 | 0 | ret_iter->Invalidate(Status::Corruption("bad block contents")); |
1270 | 0 | return ret_iter; |
1271 | 0 | } |
1272 | 13.8k | if (num_restarts_ == 0) { |
1273 | | // Empty block. |
1274 | 0 | ret_iter->Invalidate(Status::OK()); |
1275 | 0 | return ret_iter; |
1276 | 13.8k | } else { |
1277 | 13.8k | ret_iter->Initialize( |
1278 | 13.8k | raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, |
1279 | 13.8k | read_amp_bitmap_.get(), block_contents_pinned, |
1280 | 13.8k | user_defined_timestamps_persisted, |
1281 | 13.8k | data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr, |
1282 | 13.8k | protection_bytes_per_key_, kv_checksum_, block_restart_interval_); |
1283 | 13.8k | if (read_amp_bitmap_) { |
1284 | 0 | if (read_amp_bitmap_->GetStatistics() != stats) { |
1285 | | // The DB changed the Statistics pointer; we need to notify read_amp_bitmap_.
1286 | 0 | read_amp_bitmap_->SetStatistics(stats); |
1287 | 0 | } |
1288 | 0 | } |
1289 | 13.8k | } |
1290 | | |
1291 | 13.8k | return ret_iter; |
1292 | 13.8k | } |
1293 | | |
1294 | | IndexBlockIter* Block::NewIndexIterator( |
1295 | | const Comparator* raw_ucmp, SequenceNumber global_seqno, |
1296 | | IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, |
1297 | | bool have_first_key, bool key_includes_seq, bool value_is_full, |
1298 | | bool block_contents_pinned, bool user_defined_timestamps_persisted, |
1299 | 13.5k | BlockPrefixIndex* prefix_index) { |
1300 | 13.5k | IndexBlockIter* ret_iter; |
1301 | 13.5k | if (iter != nullptr) { |
1302 | 98 | ret_iter = iter; |
1303 | 13.4k | } else { |
1304 | 13.4k | ret_iter = new IndexBlockIter; |
1305 | 13.4k | } |
1306 | 13.5k | if (size_ < 2 * sizeof(uint32_t)) { |
1307 | 0 | ret_iter->Invalidate(Status::Corruption("bad block contents")); |
1308 | 0 | return ret_iter; |
1309 | 0 | } |
1310 | 13.5k | if (num_restarts_ == 0) { |
1311 | | // Empty block. |
1312 | 0 | ret_iter->Invalidate(Status::OK()); |
1313 | 0 | return ret_iter; |
1314 | 13.5k | } else { |
1315 | 13.5k | BlockPrefixIndex* prefix_index_ptr = |
1316 | 13.5k | total_order_seek ? nullptr : prefix_index; |
1317 | 13.5k | ret_iter->Initialize( |
1318 | 13.5k | raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, |
1319 | 13.5k | prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, |
1320 | 13.5k | block_contents_pinned, user_defined_timestamps_persisted, |
1321 | 13.5k | protection_bytes_per_key_, kv_checksum_, block_restart_interval_); |
1322 | 13.5k | } |
1323 | | |
1324 | 13.5k | return ret_iter; |
1325 | 13.5k | } |
1326 | | |
1327 | 11.6k | size_t Block::ApproximateMemoryUsage() const { |
1328 | 11.6k | size_t usage = usable_size(); |
1329 | 11.6k | #ifdef ROCKSDB_MALLOC_USABLE_SIZE |
1330 | 11.6k | usage += malloc_usable_size((void*)this); |
1331 | | #else |
1332 | | usage += sizeof(*this); |
1333 | | #endif // ROCKSDB_MALLOC_USABLE_SIZE |
1334 | 11.6k | if (read_amp_bitmap_) { |
1335 | 0 | usage += read_amp_bitmap_->ApproximateMemoryUsage(); |
1336 | 0 | } |
1337 | 11.6k | usage += checksum_size_; |
1338 | 11.6k | return usage; |
1339 | 11.6k | } |
1340 | | |
1341 | | } // namespace ROCKSDB_NAMESPACE |