Coverage Report

Created: 2024-09-08 07:17

/src/rocksdb/table/block_based/block_builder.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
//
10
// BlockBuilder generates blocks where keys are prefix-compressed:
11
//
12
// When we store a key, we drop the prefix shared with the previous
13
// string.  This helps reduce the space requirement significantly.
14
// Furthermore, once every K keys, we do not apply the prefix
15
// compression and store the entire key.  We call this a "restart
16
// point".  The tail end of the block stores the offsets of all of the
17
// restart points, and can be used to do a binary search when looking
18
// for a particular key.  Values are stored as-is (without compression)
19
// immediately following the corresponding key.
20
//
21
// An entry for a particular key-value pair has the form:
22
//     shared_bytes: varint32
23
//     unshared_bytes: varint32
24
//     value_length: varint32
25
//     key_delta: char[unshared_bytes]
26
//     value: char[value_length]
27
// shared_bytes == 0 for restart points.
28
//
29
// The trailer of the block has the form:
30
//     restarts: uint32[num_restarts]
31
//     num_restarts: uint32
32
// restarts[i] contains the offset within the block of the ith restart point.
33
34
#include "table/block_based/block_builder.h"
35
36
#include <algorithm>
37
#include <cassert>
38
39
#include "db/dbformat.h"
40
#include "rocksdb/comparator.h"
41
#include "table/block_based/data_block_footer.h"
42
#include "util/coding.h"
43
44
namespace ROCKSDB_NAMESPACE {
45
46
BlockBuilder::BlockBuilder(
47
    int block_restart_interval, bool use_delta_encoding,
48
    bool use_value_delta_encoding,
49
    BlockBasedTableOptions::DataBlockIndexType index_type,
50
    double data_block_hash_table_util_ratio, size_t ts_sz,
51
    bool persist_user_defined_timestamps, bool is_user_key)
52
    : block_restart_interval_(block_restart_interval),
53
      use_delta_encoding_(use_delta_encoding),
54
      use_value_delta_encoding_(use_value_delta_encoding),
55
      strip_ts_sz_(persist_user_defined_timestamps ? 0 : ts_sz),
56
      is_user_key_(is_user_key),
57
      restarts_(1, 0),  // First restart point is at offset 0
58
      counter_(0),
59
32.6k
      finished_(false) {
60
32.6k
  switch (index_type) {
61
32.6k
    case BlockBasedTableOptions::kDataBlockBinarySearch:
62
32.6k
      break;
63
0
    case BlockBasedTableOptions::kDataBlockBinaryAndHash:
64
0
      data_block_hash_index_builder_.Initialize(
65
0
          data_block_hash_table_util_ratio);
66
0
      break;
67
0
    default:
68
0
      assert(0);
69
32.6k
  }
70
32.6k
  assert(block_restart_interval_ >= 1);
71
32.6k
  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
72
32.6k
}
73
74
8.11k
void BlockBuilder::Reset() {
75
8.11k
  buffer_.clear();
76
8.11k
  restarts_.resize(1);  // First restart point is at offset 0
77
8.11k
  assert(restarts_[0] == 0);
78
8.11k
  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
79
8.11k
  counter_ = 0;
80
8.11k
  finished_ = false;
81
8.11k
  last_key_.clear();
82
8.11k
  if (data_block_hash_index_builder_.Valid()) {
83
0
    data_block_hash_index_builder_.Reset();
84
0
  }
85
#ifndef NDEBUG
86
  add_with_last_key_called_ = false;
87
#endif
88
8.11k
}
89
90
8.11k
void BlockBuilder::SwapAndReset(std::string& buffer) {
91
8.11k
  std::swap(buffer_, buffer);
92
8.11k
  Reset();
93
8.11k
}
94
95
size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
96
21.6k
                                         const Slice& value) const {
97
21.6k
  size_t estimate = CurrentSizeEstimate();
98
  // Note: this is an imprecise estimate as it accounts for the whole key size
99
  // instead of non-shared key size.
100
21.6k
  estimate += key.size();
101
21.6k
  if (strip_ts_sz_ > 0) {
102
0
    estimate -= strip_ts_sz_;
103
0
  }
104
  // In value delta encoding we estimate the value delta size as half the full
105
  // value size since only the size field of block handle is encoded.
106
21.6k
  estimate +=
107
21.6k
      !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
108
21.6k
          ? value.size()
109
21.6k
          : value.size() / 2;
110
111
21.6k
  if (counter_ >= block_restart_interval_) {
112
261
    estimate += sizeof(uint32_t);  // a new restart entry.
113
261
  }
114
115
21.6k
  estimate += sizeof(int32_t);  // varint for shared prefix length.
116
  // Note: this is an imprecise estimate as we will have to encoded size, one
117
  // for shared key and one for non-shared key.
118
21.6k
  estimate += VarintLength(key.size());  // varint for key length.
119
21.6k
  if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
120
21.6k
    estimate += VarintLength(value.size());  // varint for value length.
121
21.6k
  }
122
123
21.6k
  return estimate;
124
21.6k
}
125
126
27.5k
Slice BlockBuilder::Finish() {
127
  // Append restart array
128
165k
  for (size_t i = 0; i < restarts_.size(); i++) {
129
137k
    PutFixed32(&buffer_, restarts_[i]);
130
137k
  }
131
132
27.5k
  uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
133
27.5k
  BlockBasedTableOptions::DataBlockIndexType index_type =
134
27.5k
      BlockBasedTableOptions::kDataBlockBinarySearch;
135
27.5k
  if (data_block_hash_index_builder_.Valid() &&
136
27.5k
      CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
137
0
    data_block_hash_index_builder_.Finish(buffer_);
138
0
    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
139
0
  }
140
141
  // footer is a packed format of data_block_index_type and num_restarts
142
27.5k
  uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
143
144
27.5k
  PutFixed32(&buffer_, block_footer);
145
27.5k
  finished_ = true;
146
27.5k
  return Slice(buffer_);
147
27.5k
}
148
149
void BlockBuilder::Add(const Slice& key, const Slice& value,
150
315k
                       const Slice* const delta_value) {
151
  // Ensure no unsafe mixing of Add and AddWithLastKey
152
315k
  assert(!add_with_last_key_called_);
153
154
315k
  AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size());
155
315k
  if (use_delta_encoding_) {
156
    // Update state
157
    // We used to just copy the changed data, but it appears to be
158
    // faster to just copy the whole thing.
159
315k
    last_key_.assign(key.data(), key.size());
160
315k
  }
161
315k
}
162
163
void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
164
                                  const Slice& last_key_param,
165
29.1k
                                  const Slice* const delta_value) {
166
  // Ensure no unsafe mixing of Add and AddWithLastKey
167
29.1k
  assert(last_key_.empty());
168
#ifndef NDEBUG
169
  add_with_last_key_called_ = false;
170
#endif
171
172
  // Here we make sure to use an empty `last_key` on first call after creation
173
  // or Reset. This is more convenient for the caller and we can be more
174
  // clever inside BlockBuilder. On this hot code path, we want to avoid
175
  // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a
176
  // fast arithmetic operation instead, with an assertion to be sure our logic
177
  // is sound.
178
29.1k
  size_t buffer_size = buffer_.size();
179
29.1k
  size_t last_key_size = last_key_param.size();
180
29.1k
  assert(buffer_size == 0 || buffer_size >= last_key_size - strip_ts_sz_);
181
182
29.1k
  Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0));
183
184
29.1k
  AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size);
185
29.1k
}
186
187
inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
188
                                             const Slice& value,
189
                                             const Slice& last_key,
190
                                             const Slice* const delta_value,
191
344k
                                             size_t buffer_size) {
192
344k
  assert(!finished_);
193
344k
  assert(counter_ <= block_restart_interval_);
194
344k
  assert(!use_value_delta_encoding_ || delta_value);
195
344k
  std::string key_buf;
196
344k
  std::string last_key_buf;
197
344k
  const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key);
198
  // For delta key encoding, the first key in each restart interval doesn't have
199
  // a last key to share bytes with.
200
344k
  const Slice last_key_persisted =
201
344k
      last_key.size() == 0
202
344k
          ? last_key
203
344k
          : MaybeStripTimestampFromKey(&last_key_buf, last_key);
204
344k
  size_t shared = 0;  // number of bytes shared with prev key
205
344k
  if (counter_ >= block_restart_interval_) {
206
    // Restart compression
207
113k
    restarts_.push_back(static_cast<uint32_t>(buffer_size));
208
113k
    estimate_ += sizeof(uint32_t);
209
113k
    counter_ = 0;
210
231k
  } else if (use_delta_encoding_) {
211
    // See how much sharing to do with previous string
212
231k
    shared = key_to_persist.difference_offset(last_key_persisted);
213
231k
  }
214
215
344k
  const size_t non_shared = key_to_persist.size() - shared;
216
217
344k
  if (use_value_delta_encoding_) {
218
    // Add "<shared><non_shared>" to buffer_
219
16.2k
    PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
220
16.2k
                        static_cast<uint32_t>(non_shared));
221
328k
  } else {
222
    // Add "<shared><non_shared><value_size>" to buffer_
223
328k
    PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
224
328k
                                static_cast<uint32_t>(non_shared),
225
328k
                                static_cast<uint32_t>(value.size()));
226
328k
  }
227
228
  // Add string delta to buffer_ followed by value
229
344k
  buffer_.append(key_to_persist.data() + shared, non_shared);
230
  // Use value delta encoding only when the key has shared bytes. This would
231
  // simplify the decoding, where it can figure which decoding to use simply by
232
  // looking at the shared bytes size.
233
344k
  if (shared != 0 && use_value_delta_encoding_) {
234
0
    buffer_.append(delta_value->data(), delta_value->size());
235
344k
  } else {
236
344k
    buffer_.append(value.data(), value.size());
237
344k
  }
238
239
  // TODO(yuzhangyu): make user defined timestamp work with block hash index.
240
344k
  if (data_block_hash_index_builder_.Valid()) {
241
    // Only data blocks should be using `kDataBlockBinaryAndHash` index type.
242
    // And data blocks should always be built with internal keys instead of
243
    // user keys.
244
0
    assert(!is_user_key_);
245
0
    data_block_hash_index_builder_.Add(ExtractUserKey(key),
246
0
                                       restarts_.size() - 1);
247
0
  }
248
249
344k
  counter_++;
250
344k
  estimate_ += buffer_.size() - buffer_size;
251
344k
}
252
253
const Slice BlockBuilder::MaybeStripTimestampFromKey(std::string* key_buf,
254
658k
                                                     const Slice& key) {
255
658k
  Slice stripped_key = key;
256
658k
  if (strip_ts_sz_ > 0) {
257
0
    if (is_user_key_) {
258
0
      stripped_key.remove_suffix(strip_ts_sz_);
259
0
    } else {
260
0
      StripTimestampFromInternalKey(key_buf, key, strip_ts_sz_);
261
0
      stripped_key = *key_buf;
262
0
    }
263
0
  }
264
658k
  return stripped_key;
265
658k
}
266
}  // namespace ROCKSDB_NAMESPACE