/src/rocksdb/table/block_based/block_builder.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | // |
10 | | // BlockBuilder generates blocks where keys are prefix-compressed: |
11 | | // |
12 | | // When we store a key, we drop the prefix shared with the previous |
13 | | // string. This helps reduce the space requirement significantly. |
14 | | // Furthermore, once every K keys, we do not apply the prefix |
15 | | // compression and store the entire key. We call this a "restart |
16 | | // point". The tail end of the block stores the offsets of all of the |
17 | | // restart points, and can be used to do a binary search when looking |
18 | | // for a particular key. Values are stored as-is (without compression) |
19 | | // immediately following the corresponding key. |
20 | | // |
21 | | // An entry for a particular key-value pair has the form: |
22 | | // shared_bytes: varint32 |
23 | | // unshared_bytes: varint32 |
24 | | // value_length: varint32 |
25 | | // key_delta: char[unshared_bytes] |
26 | | // value: char[value_length] |
27 | | // shared_bytes == 0 for restart points. |
28 | | // |
29 | | // The trailer of the block has the form: |
30 | | // restarts: uint32[num_restarts] |
31 | | // num_restarts: uint32 |
32 | | // restarts[i] contains the offset within the block of the ith restart point. |
33 | | |
34 | | #include "table/block_based/block_builder.h" |
35 | | |
36 | | #include <algorithm> |
37 | | #include <cassert> |
38 | | |
39 | | #include "db/dbformat.h" |
40 | | #include "rocksdb/comparator.h" |
41 | | #include "table/block_based/data_block_footer.h" |
42 | | #include "util/coding.h" |
43 | | |
44 | | namespace ROCKSDB_NAMESPACE { |
45 | | |
46 | | BlockBuilder::BlockBuilder( |
47 | | int block_restart_interval, bool use_delta_encoding, |
48 | | bool use_value_delta_encoding, |
49 | | BlockBasedTableOptions::DataBlockIndexType index_type, |
50 | | double data_block_hash_table_util_ratio, size_t ts_sz, |
51 | | bool persist_user_defined_timestamps, bool is_user_key) |
52 | | : block_restart_interval_(block_restart_interval), |
53 | | use_delta_encoding_(use_delta_encoding), |
54 | | use_value_delta_encoding_(use_value_delta_encoding), |
55 | | strip_ts_sz_(persist_user_defined_timestamps ? 0 : ts_sz), |
56 | | is_user_key_(is_user_key), |
57 | | restarts_(1, 0), // First restart point is at offset 0 |
58 | | counter_(0), |
59 | 32.6k | finished_(false) { |
60 | 32.6k | switch (index_type) { |
61 | 32.6k | case BlockBasedTableOptions::kDataBlockBinarySearch: |
62 | 32.6k | break; |
63 | 0 | case BlockBasedTableOptions::kDataBlockBinaryAndHash: |
64 | 0 | data_block_hash_index_builder_.Initialize( |
65 | 0 | data_block_hash_table_util_ratio); |
66 | 0 | break; |
67 | 0 | default: |
68 | 0 | assert(0); |
69 | 32.6k | } |
70 | 32.6k | assert(block_restart_interval_ >= 1); |
71 | 32.6k | estimate_ = sizeof(uint32_t) + sizeof(uint32_t); |
72 | 32.6k | } |
73 | | |
74 | 8.11k | void BlockBuilder::Reset() { |
75 | 8.11k | buffer_.clear(); |
76 | 8.11k | restarts_.resize(1); // First restart point is at offset 0 |
77 | 8.11k | assert(restarts_[0] == 0); |
78 | 8.11k | estimate_ = sizeof(uint32_t) + sizeof(uint32_t); |
79 | 8.11k | counter_ = 0; |
80 | 8.11k | finished_ = false; |
81 | 8.11k | last_key_.clear(); |
82 | 8.11k | if (data_block_hash_index_builder_.Valid()) { |
83 | 0 | data_block_hash_index_builder_.Reset(); |
84 | 0 | } |
85 | | #ifndef NDEBUG |
86 | | add_with_last_key_called_ = false; |
87 | | #endif |
88 | 8.11k | } |
89 | | |
90 | 8.11k | void BlockBuilder::SwapAndReset(std::string& buffer) { |
91 | 8.11k | std::swap(buffer_, buffer); |
92 | 8.11k | Reset(); |
93 | 8.11k | } |
94 | | |
95 | | size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, |
96 | 21.6k | const Slice& value) const { |
97 | 21.6k | size_t estimate = CurrentSizeEstimate(); |
98 | | // Note: this is an imprecise estimate as it accounts for the whole key size |
99 | | // instead of non-shared key size. |
100 | 21.6k | estimate += key.size(); |
101 | 21.6k | if (strip_ts_sz_ > 0) { |
102 | 0 | estimate -= strip_ts_sz_; |
103 | 0 | } |
104 | | // In value delta encoding we estimate the value delta size as half the full |
105 | | // value size since only the size field of block handle is encoded. |
106 | 21.6k | estimate += |
107 | 21.6k | !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) |
108 | 21.6k | ? value.size() |
109 | 21.6k | : value.size() / 2; |
110 | | |
111 | 21.6k | if (counter_ >= block_restart_interval_) { |
112 | 261 | estimate += sizeof(uint32_t); // a new restart entry. |
113 | 261 | } |
114 | | |
115 | 21.6k | estimate += sizeof(int32_t); // varint for shared prefix length. |
116 | | // Note: this is an imprecise estimate as we will have to encoded size, one |
117 | | // for shared key and one for non-shared key. |
118 | 21.6k | estimate += VarintLength(key.size()); // varint for key length. |
119 | 21.6k | if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { |
120 | 21.6k | estimate += VarintLength(value.size()); // varint for value length. |
121 | 21.6k | } |
122 | | |
123 | 21.6k | return estimate; |
124 | 21.6k | } |
125 | | |
126 | 27.5k | Slice BlockBuilder::Finish() { |
127 | | // Append restart array |
128 | 165k | for (size_t i = 0; i < restarts_.size(); i++) { |
129 | 137k | PutFixed32(&buffer_, restarts_[i]); |
130 | 137k | } |
131 | | |
132 | 27.5k | uint32_t num_restarts = static_cast<uint32_t>(restarts_.size()); |
133 | 27.5k | BlockBasedTableOptions::DataBlockIndexType index_type = |
134 | 27.5k | BlockBasedTableOptions::kDataBlockBinarySearch; |
135 | 27.5k | if (data_block_hash_index_builder_.Valid() && |
136 | 27.5k | CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { |
137 | 0 | data_block_hash_index_builder_.Finish(buffer_); |
138 | 0 | index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; |
139 | 0 | } |
140 | | |
141 | | // footer is a packed format of data_block_index_type and num_restarts |
142 | 27.5k | uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); |
143 | | |
144 | 27.5k | PutFixed32(&buffer_, block_footer); |
145 | 27.5k | finished_ = true; |
146 | 27.5k | return Slice(buffer_); |
147 | 27.5k | } |
148 | | |
149 | | void BlockBuilder::Add(const Slice& key, const Slice& value, |
150 | 315k | const Slice* const delta_value) { |
151 | | // Ensure no unsafe mixing of Add and AddWithLastKey |
152 | 315k | assert(!add_with_last_key_called_); |
153 | | |
154 | 315k | AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size()); |
155 | 315k | if (use_delta_encoding_) { |
156 | | // Update state |
157 | | // We used to just copy the changed data, but it appears to be |
158 | | // faster to just copy the whole thing. |
159 | 315k | last_key_.assign(key.data(), key.size()); |
160 | 315k | } |
161 | 315k | } |
162 | | |
163 | | void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, |
164 | | const Slice& last_key_param, |
165 | 29.1k | const Slice* const delta_value) { |
166 | | // Ensure no unsafe mixing of Add and AddWithLastKey |
167 | 29.1k | assert(last_key_.empty()); |
168 | | #ifndef NDEBUG |
169 | | add_with_last_key_called_ = false; |
170 | | #endif |
171 | | |
172 | | // Here we make sure to use an empty `last_key` on first call after creation |
173 | | // or Reset. This is more convenient for the caller and we can be more |
174 | | // clever inside BlockBuilder. On this hot code path, we want to avoid |
175 | | // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a |
176 | | // fast arithmetic operation instead, with an assertion to be sure our logic |
177 | | // is sound. |
178 | 29.1k | size_t buffer_size = buffer_.size(); |
179 | 29.1k | size_t last_key_size = last_key_param.size(); |
180 | 29.1k | assert(buffer_size == 0 || buffer_size >= last_key_size - strip_ts_sz_); |
181 | | |
182 | 29.1k | Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0)); |
183 | | |
184 | 29.1k | AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); |
185 | 29.1k | } |
186 | | |
187 | | inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, |
188 | | const Slice& value, |
189 | | const Slice& last_key, |
190 | | const Slice* const delta_value, |
191 | 344k | size_t buffer_size) { |
192 | 344k | assert(!finished_); |
193 | 344k | assert(counter_ <= block_restart_interval_); |
194 | 344k | assert(!use_value_delta_encoding_ || delta_value); |
195 | 344k | std::string key_buf; |
196 | 344k | std::string last_key_buf; |
197 | 344k | const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key); |
198 | | // For delta key encoding, the first key in each restart interval doesn't have |
199 | | // a last key to share bytes with. |
200 | 344k | const Slice last_key_persisted = |
201 | 344k | last_key.size() == 0 |
202 | 344k | ? last_key |
203 | 344k | : MaybeStripTimestampFromKey(&last_key_buf, last_key); |
204 | 344k | size_t shared = 0; // number of bytes shared with prev key |
205 | 344k | if (counter_ >= block_restart_interval_) { |
206 | | // Restart compression |
207 | 113k | restarts_.push_back(static_cast<uint32_t>(buffer_size)); |
208 | 113k | estimate_ += sizeof(uint32_t); |
209 | 113k | counter_ = 0; |
210 | 231k | } else if (use_delta_encoding_) { |
211 | | // See how much sharing to do with previous string |
212 | 231k | shared = key_to_persist.difference_offset(last_key_persisted); |
213 | 231k | } |
214 | | |
215 | 344k | const size_t non_shared = key_to_persist.size() - shared; |
216 | | |
217 | 344k | if (use_value_delta_encoding_) { |
218 | | // Add "<shared><non_shared>" to buffer_ |
219 | 16.2k | PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared), |
220 | 16.2k | static_cast<uint32_t>(non_shared)); |
221 | 328k | } else { |
222 | | // Add "<shared><non_shared><value_size>" to buffer_ |
223 | 328k | PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared), |
224 | 328k | static_cast<uint32_t>(non_shared), |
225 | 328k | static_cast<uint32_t>(value.size())); |
226 | 328k | } |
227 | | |
228 | | // Add string delta to buffer_ followed by value |
229 | 344k | buffer_.append(key_to_persist.data() + shared, non_shared); |
230 | | // Use value delta encoding only when the key has shared bytes. This would |
231 | | // simplify the decoding, where it can figure which decoding to use simply by |
232 | | // looking at the shared bytes size. |
233 | 344k | if (shared != 0 && use_value_delta_encoding_) { |
234 | 0 | buffer_.append(delta_value->data(), delta_value->size()); |
235 | 344k | } else { |
236 | 344k | buffer_.append(value.data(), value.size()); |
237 | 344k | } |
238 | | |
239 | | // TODO(yuzhangyu): make user defined timestamp work with block hash index. |
240 | 344k | if (data_block_hash_index_builder_.Valid()) { |
241 | | // Only data blocks should be using `kDataBlockBinaryAndHash` index type. |
242 | | // And data blocks should always be built with internal keys instead of |
243 | | // user keys. |
244 | 0 | assert(!is_user_key_); |
245 | 0 | data_block_hash_index_builder_.Add(ExtractUserKey(key), |
246 | 0 | restarts_.size() - 1); |
247 | 0 | } |
248 | | |
249 | 344k | counter_++; |
250 | 344k | estimate_ += buffer_.size() - buffer_size; |
251 | 344k | } |
252 | | |
253 | | const Slice BlockBuilder::MaybeStripTimestampFromKey(std::string* key_buf, |
254 | 658k | const Slice& key) { |
255 | 658k | Slice stripped_key = key; |
256 | 658k | if (strip_ts_sz_ > 0) { |
257 | 0 | if (is_user_key_) { |
258 | 0 | stripped_key.remove_suffix(strip_ts_sz_); |
259 | 0 | } else { |
260 | 0 | StripTimestampFromInternalKey(key_buf, key, strip_ts_sz_); |
261 | 0 | stripped_key = *key_buf; |
262 | 0 | } |
263 | 0 | } |
264 | 658k | return stripped_key; |
265 | 658k | } |
266 | | } // namespace ROCKSDB_NAMESPACE |