Coverage Report

Created: 2026-05-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/table/block_based/block_prefix_index.cc
Line
Count
Source
1
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
6
#include "table/block_based/block_prefix_index.h"
7
8
#include <vector>
9
10
#include "memory/arena.h"
11
#include "rocksdb/comparator.h"
12
#include "rocksdb/slice.h"
13
#include "rocksdb/slice_transform.h"
14
#include "util/coding.h"
15
#include "util/hash.h"
16
17
namespace ROCKSDB_NAMESPACE {
18
19
0
inline uint32_t Hash(const Slice& s) {
20
0
  return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
21
0
}
22
23
0
inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
24
0
  return Hash(prefix) % num_buckets;
25
0
}
26
27
// The prefix block index is simply a bucket array, with each entry pointing to
28
// the blocks that span the prefixes hashed to this bucket.
29
//
30
// To reduce memory footprint, if there is only one block per bucket, the entry
31
// stores the block id directly. If there are more than one blocks per bucket,
32
// because of hash collision or a single prefix spanning multiple blocks,
33
// the entry points to an array of block ids. The block array is an array of
34
// uint32_t's. The first uint32_t indicates the total number of blocks, followed
35
// by the block ids.
36
//
37
// To differentiate the two cases, the high order bit of the entry indicates
38
// whether it is a 'pointer' into a separate block array.
39
// 0x7FFFFFFF is reserved for empty bucket.
40
41
const uint32_t kNoneBlock = 0x7FFFFFFF;
42
const uint32_t kBlockArrayMask = 0x80000000;
43
44
0
inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
45
46
0
inline bool IsBlockId(uint32_t block_id) {
47
0
  return (block_id & kBlockArrayMask) == 0;
48
0
}
49
50
0
inline uint32_t DecodeIndex(uint32_t block_id) {
51
0
  uint32_t index = block_id ^ kBlockArrayMask;
52
0
  assert(index < kBlockArrayMask);
53
0
  return index;
54
0
}
55
56
0
inline uint32_t EncodeIndex(uint32_t index) {
57
0
  assert(index < kBlockArrayMask);
58
0
  return index | kBlockArrayMask;
59
0
}
60
61
// temporary storage for prefix information during index building
62
struct PrefixRecord {
63
  Slice prefix;
64
  uint32_t start_block;
65
  uint32_t end_block;
66
  uint32_t num_blocks;
67
  PrefixRecord* next;
68
};
69
70
class BlockPrefixIndex::Builder {
71
 public:
72
0
  void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
73
0
    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
74
0
        arena_.AllocateAligned(sizeof(PrefixRecord)));
75
0
    record->prefix = key_prefix;
76
0
    record->start_block = start_block;
77
0
    record->end_block = start_block + num_blocks - 1;
78
0
    record->num_blocks = num_blocks;
79
0
    prefixes_.push_back(record);
80
0
  }
81
82
0
  BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) {
83
    // For now, use roughly 1:1 prefix to bucket ratio.
84
0
    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
85
86
    // Collect prefix records that hash to the same bucket, into a single
87
    // linklist.
88
0
    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
89
0
    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
90
0
    for (PrefixRecord* current : prefixes_) {
91
0
      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
92
      // merge the prefix block span if the first block of this prefix is
93
      // connected to the last block of the previous prefix.
94
0
      PrefixRecord* prev = prefixes_per_bucket[bucket];
95
0
      if (prev) {
96
0
        assert(current->start_block >= prev->end_block);
97
0
        auto distance = current->start_block - prev->end_block;
98
0
        if (distance <= 1) {
99
0
          prev->end_block = current->end_block;
100
0
          prev->num_blocks = prev->end_block - prev->start_block + 1;
101
0
          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
102
0
          continue;
103
0
        }
104
0
      }
105
0
      current->next = prev;
106
0
      prefixes_per_bucket[bucket] = current;
107
0
      num_blocks_per_bucket[bucket] += current->num_blocks;
108
0
    }
109
110
    // Calculate the block array buffer size
111
0
    uint32_t total_block_array_entries = 0;
112
0
    for (uint32_t i = 0; i < num_buckets; i++) {
113
0
      uint32_t num_blocks = num_blocks_per_bucket[i];
114
0
      if (num_blocks > 1) {
115
0
        total_block_array_entries += (num_blocks + 1);
116
0
      }
117
0
    }
118
119
    // Populate the final prefix block index
120
0
    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
121
0
    uint32_t* buckets = new uint32_t[num_buckets];
122
0
    uint32_t offset = 0;
123
0
    for (uint32_t i = 0; i < num_buckets; i++) {
124
0
      uint32_t num_blocks = num_blocks_per_bucket[i];
125
0
      if (num_blocks == 0) {
126
0
        assert(prefixes_per_bucket[i] == nullptr);
127
0
        buckets[i] = kNoneBlock;
128
0
      } else if (num_blocks == 1) {
129
0
        assert(prefixes_per_bucket[i] != nullptr);
130
0
        assert(prefixes_per_bucket[i]->next == nullptr);
131
0
        buckets[i] = prefixes_per_bucket[i]->start_block;
132
0
      } else {
133
0
        assert(total_block_array_entries > 0);
134
0
        assert(prefixes_per_bucket[i] != nullptr);
135
0
        buckets[i] = EncodeIndex(offset);
136
0
        block_array_buffer[offset] = num_blocks;
137
0
        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
138
0
        auto current = prefixes_per_bucket[i];
139
        // populate block ids from largest to smallest
140
0
        while (current != nullptr) {
141
0
          for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
142
0
            *last_block = current->end_block - iter;
143
0
            last_block--;
144
0
          }
145
0
          current = current->next;
146
0
        }
147
0
        assert(last_block == &block_array_buffer[offset]);
148
0
        offset += (num_blocks + 1);
149
0
      }
150
0
    }
151
152
0
    assert(offset == total_block_array_entries);
153
154
0
    return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets,
155
0
                                total_block_array_entries, block_array_buffer);
156
0
  }
157
158
 private:
159
  std::vector<PrefixRecord*> prefixes_;
160
  Arena arena_;
161
};
162
163
Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor,
164
                                const Slice& prefixes, const Slice& prefix_meta,
165
0
                                BlockPrefixIndex** prefix_index) {
166
0
  uint64_t pos = 0;
167
0
  auto meta_pos = prefix_meta;
168
0
  Status s;
169
0
  Builder builder;
170
171
0
  while (!meta_pos.empty()) {
172
0
    uint32_t prefix_size = 0;
173
0
    uint32_t entry_index = 0;
174
0
    uint32_t num_blocks = 0;
175
0
    if (!GetVarint32(&meta_pos, &prefix_size) ||
176
0
        !GetVarint32(&meta_pos, &entry_index) ||
177
0
        !GetVarint32(&meta_pos, &num_blocks)) {
178
0
      s = Status::Corruption(
179
0
          "Corrupted prefix meta block: unable to read from it.");
180
0
      break;
181
0
    }
182
0
    if (pos + prefix_size > prefixes.size()) {
183
0
      s = Status::Corruption(
184
0
          "Corrupted prefix meta block: size inconsistency.");
185
0
      break;
186
0
    }
187
0
    Slice prefix(prefixes.data() + pos, prefix_size);
188
0
    builder.Add(prefix, entry_index, num_blocks);
189
190
0
    pos += prefix_size;
191
0
  }
192
193
0
  if (s.ok() && pos != prefixes.size()) {
194
0
    s = Status::Corruption("Corrupted prefix meta block");
195
0
  }
196
197
0
  if (s.ok()) {
198
0
    *prefix_index = builder.Finish(prefix_extractor);
199
0
  }
200
201
0
  return s;
202
0
}
203
204
0
uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
205
0
  Slice prefix = internal_prefix_extractor_.Transform(key);
206
207
0
  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
208
0
  uint32_t block_id = buckets_[bucket];
209
210
0
  if (IsNone(block_id)) {
211
0
    return 0;
212
0
  } else if (IsBlockId(block_id)) {
213
0
    *blocks = &buckets_[bucket];
214
0
    return 1;
215
0
  } else {
216
0
    uint32_t index = DecodeIndex(block_id);
217
0
    assert(index < num_block_array_buffer_entries_);
218
0
    *blocks = &block_array_buffer_[index + 1];
219
0
    uint32_t num_blocks = block_array_buffer_[index];
220
0
    assert(num_blocks > 1);
221
    assert(index + num_blocks < num_block_array_buffer_entries_);
222
0
    return num_blocks;
223
0
  }
224
0
}
225
226
}  // namespace ROCKSDB_NAMESPACE