/src/rocksdb/table/block_based/index_builder.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #include "table/block_based/index_builder.h" |
11 | | |
12 | | #include <cassert> |
13 | | #include <cinttypes> |
14 | | #include <list> |
15 | | #include <string> |
16 | | |
17 | | #include "db/dbformat.h" |
18 | | #include "rocksdb/comparator.h" |
19 | | #include "rocksdb/flush_block_policy.h" |
20 | | #include "table/block_based/partitioned_filter_block.h" |
21 | | #include "table/format.h" |
22 | | |
23 | | namespace ROCKSDB_NAMESPACE { |
24 | | |
25 | | // Create a index builder based on its type. |
26 | | IndexBuilder* IndexBuilder::CreateIndexBuilder( |
27 | | BlockBasedTableOptions::IndexType index_type, |
28 | | const InternalKeyComparator* comparator, |
29 | | const InternalKeySliceTransform* int_key_slice_transform, |
30 | | const bool use_value_delta_encoding, |
31 | | const BlockBasedTableOptions& table_opt, size_t ts_sz, |
32 | 5.44k | const bool persist_user_defined_timestamps) { |
33 | 5.44k | IndexBuilder* result = nullptr; |
34 | 5.44k | switch (index_type) { |
35 | 5.44k | case BlockBasedTableOptions::kBinarySearch: { |
36 | 5.44k | result = new ShortenedIndexBuilder( |
37 | 5.44k | comparator, table_opt.index_block_restart_interval, |
38 | 5.44k | table_opt.format_version, use_value_delta_encoding, |
39 | 5.44k | table_opt.index_shortening, /* include_first_key */ false, ts_sz, |
40 | 5.44k | persist_user_defined_timestamps); |
41 | 5.44k | break; |
42 | 0 | } |
43 | 0 | case BlockBasedTableOptions::kHashSearch: { |
44 | | // Currently kHashSearch is incompatible with index_block_restart_interval |
45 | | // > 1 |
46 | 0 | assert(table_opt.index_block_restart_interval == 1); |
47 | 0 | result = new HashIndexBuilder( |
48 | 0 | comparator, int_key_slice_transform, |
49 | 0 | table_opt.index_block_restart_interval, table_opt.format_version, |
50 | 0 | use_value_delta_encoding, table_opt.index_shortening, ts_sz, |
51 | 0 | persist_user_defined_timestamps); |
52 | 0 | break; |
53 | 0 | } |
54 | 0 | case BlockBasedTableOptions::kTwoLevelIndexSearch: { |
55 | 0 | result = PartitionedIndexBuilder::CreateIndexBuilder( |
56 | 0 | comparator, use_value_delta_encoding, table_opt, ts_sz, |
57 | 0 | persist_user_defined_timestamps); |
58 | 0 | break; |
59 | 0 | } |
60 | 0 | case BlockBasedTableOptions::kBinarySearchWithFirstKey: { |
61 | 0 | result = new ShortenedIndexBuilder( |
62 | 0 | comparator, table_opt.index_block_restart_interval, |
63 | 0 | table_opt.format_version, use_value_delta_encoding, |
64 | 0 | table_opt.index_shortening, /* include_first_key */ true, ts_sz, |
65 | 0 | persist_user_defined_timestamps); |
66 | 0 | break; |
67 | 0 | } |
68 | 0 | default: { |
69 | 0 | assert(!"Do not recognize the index type "); |
70 | 0 | break; |
71 | 0 | } |
72 | 5.44k | } |
73 | 5.44k | return result; |
74 | 5.44k | } |
75 | | |
76 | | Slice ShortenedIndexBuilder::FindShortestInternalKeySeparator( |
77 | | const Comparator& comparator, const Slice& start, const Slice& limit, |
78 | 3.59k | std::string* scratch) { |
79 | | // Attempt to shorten the user portion of the key |
80 | 3.59k | Slice user_start = ExtractUserKey(start); |
81 | 3.59k | Slice user_limit = ExtractUserKey(limit); |
82 | 3.59k | scratch->assign(user_start.data(), user_start.size()); |
83 | 3.59k | comparator.FindShortestSeparator(scratch, user_limit); |
84 | 3.59k | assert(comparator.Compare(user_start, *scratch) <= 0); |
85 | 3.59k | assert(comparator.Compare(user_start, user_limit) >= 0 || |
86 | 3.59k | comparator.Compare(*scratch, user_limit) < 0); |
87 | 3.59k | if (scratch->size() <= user_start.size() && |
88 | 3.59k | comparator.Compare(user_start, *scratch) < 0) { |
89 | | // User key has become shorter physically, but larger logically. |
90 | | // Tack on the earliest possible number to the shortened user key. |
91 | 3.37k | PutFixed64(scratch, |
92 | 3.37k | PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); |
93 | 3.37k | assert(InternalKeyComparator(&comparator).Compare(start, *scratch) < 0); |
94 | 3.37k | assert(InternalKeyComparator(&comparator).Compare(*scratch, limit) < 0); |
95 | 3.37k | return *scratch; |
96 | 3.37k | } else { |
97 | 224 | return start; |
98 | 224 | } |
99 | 3.59k | } |
100 | | |
101 | | Slice ShortenedIndexBuilder::FindShortInternalKeySuccessor( |
102 | 0 | const Comparator& comparator, const Slice& key, std::string* scratch) { |
103 | 0 | Slice user_key = ExtractUserKey(key); |
104 | 0 | scratch->assign(user_key.data(), user_key.size()); |
105 | 0 | comparator.FindShortSuccessor(scratch); |
106 | 0 | assert(comparator.Compare(user_key, *scratch) <= 0); |
107 | 0 | if (scratch->size() <= user_key.size() && |
108 | 0 | comparator.Compare(user_key, *scratch) < 0) { |
109 | | // User key has become shorter physically, but larger logically. |
110 | | // Tack on the earliest possible number to the shortened user key. |
111 | 0 | PutFixed64(scratch, |
112 | 0 | PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); |
113 | 0 | assert(InternalKeyComparator(&comparator).Compare(key, *scratch) < 0); |
114 | 0 | return *scratch; |
115 | 0 | } else { |
116 | 0 | return key; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | | PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( |
121 | | const InternalKeyComparator* comparator, |
122 | | const bool use_value_delta_encoding, |
123 | | const BlockBasedTableOptions& table_opt, size_t ts_sz, |
124 | 0 | const bool persist_user_defined_timestamps) { |
125 | 0 | return new PartitionedIndexBuilder(comparator, table_opt, |
126 | 0 | use_value_delta_encoding, ts_sz, |
127 | 0 | persist_user_defined_timestamps); |
128 | 0 | } |
129 | | |
130 | | PartitionedIndexBuilder::PartitionedIndexBuilder( |
131 | | const InternalKeyComparator* comparator, |
132 | | const BlockBasedTableOptions& table_opt, |
133 | | const bool use_value_delta_encoding, size_t ts_sz, |
134 | | const bool persist_user_defined_timestamps) |
135 | | : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps), |
136 | | index_block_builder_( |
137 | | table_opt.index_block_restart_interval, true /*use_delta_encoding*/, |
138 | | use_value_delta_encoding, |
139 | | BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, |
140 | | 0.75 /* data_block_hash_table_util_ratio */, ts_sz, |
141 | | persist_user_defined_timestamps, false /* is_user_key */), |
142 | | index_block_builder_without_seq_( |
143 | | table_opt.index_block_restart_interval, true /*use_delta_encoding*/, |
144 | | use_value_delta_encoding, |
145 | | BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, |
146 | | 0.75 /* data_block_hash_table_util_ratio */, ts_sz, |
147 | | persist_user_defined_timestamps, true /* is_user_key */), |
148 | | table_opt_(table_opt), |
149 | | // We start by false. After each partition we revise the value based on |
150 | | // what the sub_index_builder has decided. If the feature is disabled |
151 | | // entirely, this will be set to true after switching the first |
152 | | // sub_index_builder. Otherwise, it could be set to true even one of the |
153 | | // sub_index_builders could not safely exclude seq from the keys, then it |
154 | | // wil be enforced on all sub_index_builders on ::Finish. |
155 | | seperator_is_key_plus_seq_(false), |
156 | 0 | use_value_delta_encoding_(use_value_delta_encoding) {} |
157 | | |
158 | 0 | void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { |
159 | 0 | assert(sub_index_builder_ == nullptr); |
160 | 0 | sub_index_builder_ = std::make_unique<ShortenedIndexBuilder>( |
161 | 0 | comparator_, table_opt_.index_block_restart_interval, |
162 | 0 | table_opt_.format_version, use_value_delta_encoding_, |
163 | 0 | table_opt_.index_shortening, /* include_first_key */ false, ts_sz_, |
164 | 0 | persist_user_defined_timestamps_); |
165 | | |
166 | | // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if |
167 | | // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by |
168 | | // default on Creation) so that flush policy can point to |
169 | | // sub_index_builder_->index_block_builder_ |
170 | 0 | if (seperator_is_key_plus_seq_) { |
171 | 0 | sub_index_builder_->seperator_is_key_plus_seq_ = true; |
172 | 0 | } |
173 | |
|
174 | 0 | flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( |
175 | 0 | table_opt_.metadata_block_size, table_opt_.block_size_deviation, |
176 | | // Note: this is sub-optimal since sub_index_builder_ could later reset |
177 | | // seperator_is_key_plus_seq_ but the probability of that is low. |
178 | 0 | sub_index_builder_->seperator_is_key_plus_seq_ |
179 | 0 | ? sub_index_builder_->index_block_builder_ |
180 | 0 | : sub_index_builder_->index_block_builder_without_seq_)); |
181 | 0 | partition_cut_requested_ = false; |
182 | 0 | } |
183 | | |
184 | 0 | void PartitionedIndexBuilder::RequestPartitionCut() { |
185 | 0 | partition_cut_requested_ = true; |
186 | 0 | } |
187 | | |
188 | | Slice PartitionedIndexBuilder::AddIndexEntry( |
189 | | const Slice& last_key_in_current_block, |
190 | | const Slice* first_key_in_next_block, const BlockHandle& block_handle, |
191 | 0 | std::string* separator_scratch) { |
192 | | // Note: to avoid two consecuitive flush in the same method call, we do not |
193 | | // check flush policy when adding the last key |
194 | 0 | if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys |
195 | 0 | if (sub_index_builder_ == nullptr) { |
196 | 0 | MakeNewSubIndexBuilder(); |
197 | | // Reserve next partition entry, where we will modify the key and |
198 | | // eventually set the value |
199 | 0 | entries_.push_back({{}, {}}); |
200 | 0 | } |
201 | 0 | auto sep = sub_index_builder_->AddIndexEntry( |
202 | 0 | last_key_in_current_block, first_key_in_next_block, block_handle, |
203 | 0 | separator_scratch); |
204 | 0 | if (!seperator_is_key_plus_seq_ && |
205 | 0 | sub_index_builder_->seperator_is_key_plus_seq_) { |
206 | | // We need to apply !seperator_is_key_plus_seq to all sub-index builders |
207 | 0 | seperator_is_key_plus_seq_ = true; |
208 | | // Would associate flush_policy with the appropriate builder, but it won't |
209 | | // be used again with no more keys |
210 | 0 | flush_policy_.reset(); |
211 | 0 | } |
212 | 0 | entries_.back().key.assign(sep.data(), sep.size()); |
213 | 0 | assert(entries_.back().value == nullptr); |
214 | 0 | std::swap(entries_.back().value, sub_index_builder_); |
215 | 0 | cut_filter_block = true; |
216 | 0 | return sep; |
217 | 0 | } else { |
218 | | // apply flush policy only to non-empty sub_index_builder_ |
219 | 0 | if (sub_index_builder_ != nullptr) { |
220 | 0 | std::string handle_encoding; |
221 | 0 | block_handle.EncodeTo(&handle_encoding); |
222 | 0 | bool do_flush = |
223 | 0 | partition_cut_requested_ || |
224 | 0 | flush_policy_->Update(last_key_in_current_block, handle_encoding); |
225 | 0 | if (do_flush) { |
226 | 0 | assert(entries_.back().value == nullptr); |
227 | 0 | std::swap(entries_.back().value, sub_index_builder_); |
228 | 0 | cut_filter_block = true; |
229 | 0 | } |
230 | 0 | } |
231 | 0 | if (sub_index_builder_ == nullptr) { |
232 | 0 | MakeNewSubIndexBuilder(); |
233 | | // Reserve next partition entry, where we will modify the key and |
234 | | // eventually set the value |
235 | 0 | entries_.push_back({{}, {}}); |
236 | 0 | } |
237 | 0 | auto sep = sub_index_builder_->AddIndexEntry( |
238 | 0 | last_key_in_current_block, first_key_in_next_block, block_handle, |
239 | 0 | separator_scratch); |
240 | 0 | entries_.back().key.assign(sep.data(), sep.size()); |
241 | 0 | if (!seperator_is_key_plus_seq_ && |
242 | 0 | sub_index_builder_->seperator_is_key_plus_seq_) { |
243 | | // We need to apply !seperator_is_key_plus_seq to all sub-index builders |
244 | 0 | seperator_is_key_plus_seq_ = true; |
245 | | // And use a flush_policy with the appropriate builder |
246 | 0 | flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( |
247 | 0 | table_opt_.metadata_block_size, table_opt_.block_size_deviation, |
248 | 0 | sub_index_builder_->index_block_builder_)); |
249 | 0 | } |
250 | 0 | return sep; |
251 | 0 | } |
252 | 0 | } |
253 | | |
254 | | Status PartitionedIndexBuilder::Finish( |
255 | 0 | IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { |
256 | 0 | if (partition_cnt_ == 0) { |
257 | 0 | partition_cnt_ = entries_.size(); |
258 | 0 | } |
259 | | // It must be set to null after last key is added |
260 | 0 | assert(sub_index_builder_ == nullptr); |
261 | 0 | if (finishing_indexes == true) { |
262 | 0 | Entry& last_entry = entries_.front(); |
263 | 0 | std::string handle_encoding; |
264 | 0 | last_partition_block_handle.EncodeTo(&handle_encoding); |
265 | 0 | std::string handle_delta_encoding; |
266 | 0 | PutVarsignedint64( |
267 | 0 | &handle_delta_encoding, |
268 | 0 | last_partition_block_handle.size() - last_encoded_handle_.size()); |
269 | 0 | last_encoded_handle_ = last_partition_block_handle; |
270 | 0 | const Slice handle_delta_encoding_slice(handle_delta_encoding); |
271 | 0 | index_block_builder_.Add(last_entry.key, handle_encoding, |
272 | 0 | &handle_delta_encoding_slice); |
273 | 0 | if (!seperator_is_key_plus_seq_) { |
274 | 0 | index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), |
275 | 0 | handle_encoding, |
276 | 0 | &handle_delta_encoding_slice); |
277 | 0 | } |
278 | 0 | entries_.pop_front(); |
279 | 0 | } |
280 | | // If there is no sub_index left, then return the 2nd level index. |
281 | 0 | if (UNLIKELY(entries_.empty())) { |
282 | 0 | if (seperator_is_key_plus_seq_) { |
283 | 0 | index_blocks->index_block_contents = index_block_builder_.Finish(); |
284 | 0 | } else { |
285 | 0 | index_blocks->index_block_contents = |
286 | 0 | index_block_builder_without_seq_.Finish(); |
287 | 0 | } |
288 | 0 | top_level_index_size_ = index_blocks->index_block_contents.size(); |
289 | 0 | index_size_ += top_level_index_size_; |
290 | 0 | return Status::OK(); |
291 | 0 | } else { |
292 | | // Finish the next partition index in line and Incomplete() to indicate we |
293 | | // expect more calls to Finish |
294 | 0 | Entry& entry = entries_.front(); |
295 | | // Apply the policy to all sub-indexes |
296 | 0 | entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; |
297 | 0 | auto s = entry.value->Finish(index_blocks); |
298 | 0 | index_size_ += index_blocks->index_block_contents.size(); |
299 | 0 | finishing_indexes = true; |
300 | 0 | return s.ok() ? Status::Incomplete() : s; |
301 | 0 | } |
302 | 0 | } |
303 | | |
304 | 0 | size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } |
305 | | } // namespace ROCKSDB_NAMESPACE |