Coverage Report

Created: 2025-09-05 08:05

/src/duckdb/src/storage/compression/roaring/metadata.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "duckdb/storage/compression/roaring/roaring.hpp"
2
3
#include "duckdb/common/limits.hpp"
4
#include "duckdb/common/likely.hpp"
5
#include "duckdb/common/numeric_utils.hpp"
6
#include "duckdb/function/compression/compression.hpp"
7
#include "duckdb/function/compression_function.hpp"
8
#include "duckdb/main/config.hpp"
9
#include "duckdb/storage/buffer_manager.hpp"
10
#include "duckdb/storage/table/column_data_checkpointer.hpp"
11
#include "duckdb/storage/table/column_segment.hpp"
12
#include "duckdb/storage/table/scan_state.hpp"
13
#include "duckdb/storage/segment/uncompressed.hpp"
14
#include "duckdb/common/fast_mem.hpp"
15
#include "duckdb/common/bitpacking.hpp"
16
17
namespace duckdb {
18
19
namespace roaring {
20
21
ContainerMetadata ContainerMetadata::CreateMetadata(uint16_t count, uint16_t array_null, uint16_t array_non_null,
22
0
                                                    uint16_t runs) {
23
0
  const bool can_use_null_array = array_null < MAX_ARRAY_IDX;
24
0
  const bool can_use_non_null_array = array_non_null < MAX_ARRAY_IDX;
25
26
0
  const bool can_use_run = runs < MAX_RUN_IDX;
27
28
0
  const bool can_use_array = can_use_null_array || can_use_non_null_array;
29
0
  if (!can_use_array && !can_use_run) {
30
    // Can not efficiently encode at all, write it as bitset
31
0
    return ContainerMetadata::BitsetContainer(count);
32
0
  }
33
0
  uint16_t null_array_cost = array_null < COMPRESSED_ARRAY_THRESHOLD
34
0
                                 ? array_null * sizeof(uint16_t)
35
0
                                 : COMPRESSED_SEGMENT_COUNT + (array_null * sizeof(uint8_t));
36
0
  uint16_t non_null_array_cost = array_non_null < COMPRESSED_ARRAY_THRESHOLD
37
0
                                     ? array_non_null * sizeof(uint16_t)
38
0
                                     : COMPRESSED_SEGMENT_COUNT + (array_non_null * sizeof(uint8_t));
39
40
0
  uint16_t lowest_array_cost = MinValue<uint16_t>(null_array_cost, non_null_array_cost);
41
0
  uint16_t lowest_run_cost = runs < COMPRESSED_RUN_THRESHOLD ? runs * sizeof(uint32_t)
42
0
                                                             : COMPRESSED_SEGMENT_COUNT + (runs * sizeof(uint16_t));
43
0
  uint16_t bitset_cost =
44
0
      (AlignValue<uint16_t, ValidityMask::BITS_PER_VALUE>(count) / ValidityMask::BITS_PER_VALUE) * sizeof(validity_t);
45
0
  if (MinValue<uint16_t>(lowest_array_cost, lowest_run_cost) > bitset_cost) {
46
    // The amount of values is too small, better off using bitset
47
    // we can detect this at decompression because we know how many values are left
48
0
    return ContainerMetadata::BitsetContainer(count);
49
0
  }
50
51
0
  if (lowest_array_cost <= lowest_run_cost) {
52
0
    if (array_null <= array_non_null) {
53
0
      return ContainerMetadata::ArrayContainer(array_null, NULLS);
54
0
    } else {
55
0
      return ContainerMetadata::ArrayContainer(array_non_null, NON_NULLS);
56
0
    }
57
0
  } else {
58
0
    return ContainerMetadata::RunContainer(runs);
59
0
  }
60
0
}
61
62
0
idx_t ContainerMetadata::GetDataSizeInBytes(idx_t container_size) const {
63
0
  if (IsUncompressed()) {
64
0
    return (container_size / ValidityMask::BITS_PER_VALUE) * sizeof(validity_t);
65
0
  }
66
0
  if (IsRun()) {
67
0
    auto number_of_runs = NumberOfRuns();
68
0
    if (number_of_runs >= COMPRESSED_RUN_THRESHOLD) {
69
0
      return COMPRESSED_SEGMENT_COUNT + (sizeof(uint8_t) * number_of_runs * 2);
70
0
    } else {
71
0
      return sizeof(RunContainerRLEPair) * number_of_runs;
72
0
    }
73
0
  } else {
74
0
    auto cardinality = Cardinality();
75
0
    if (cardinality >= COMPRESSED_ARRAY_THRESHOLD) {
76
0
      return COMPRESSED_SEGMENT_COUNT + (sizeof(uint8_t) * cardinality);
77
0
    } else {
78
0
      return sizeof(uint16_t) * cardinality;
79
0
    }
80
0
  }
81
0
}
82
83
0
ContainerMetadataCollection::ContainerMetadataCollection() {
84
0
}
85
86
0
void ContainerMetadataCollection::AddMetadata(ContainerMetadata metadata) {
87
0
  if (metadata.IsRun()) {
88
0
    AddRunContainer(metadata.NumberOfRuns(), metadata.IsInverted());
89
0
  } else if (metadata.IsUncompressed()) {
90
0
    AddBitsetContainer();
91
0
  } else {
92
0
    AddArrayContainer(metadata.Cardinality(), metadata.IsInverted());
93
0
  }
94
0
}
95
96
0
idx_t ContainerMetadataCollection::GetMetadataSizeForSegment() const {
97
0
  idx_t runs_count = GetRunContainerCount();
98
0
  idx_t arrays_count = GetArrayAndBitsetContainerCount();
99
0
  return GetMetadataSize(runs_count + arrays_count, runs_count, arrays_count);
100
0
}
101
102
idx_t ContainerMetadataCollection::GetMetadataSize(idx_t container_count, idx_t run_containers,
103
0
                                                   idx_t array_containers) const {
104
0
  idx_t types_size = BitpackingPrimitives::GetRequiredSize(container_count, CONTAINER_TYPE_BITWIDTH);
105
0
  idx_t runs_size = BitpackingPrimitives::GetRequiredSize(run_containers, RUN_CONTAINER_SIZE_BITWIDTH);
106
0
  idx_t arrays_size = sizeof(uint8_t) * array_containers;
107
0
  return types_size + runs_size + arrays_size;
108
0
}
109
110
0
idx_t ContainerMetadataCollection::GetRunContainerCount() const {
111
0
  return runs_in_segment;
112
0
}
113
0
idx_t ContainerMetadataCollection::GetArrayAndBitsetContainerCount() const {
114
0
  return arrays_in_segment;
115
0
}
116
117
0
void ContainerMetadataCollection::FlushSegment() {
118
0
  runs_in_segment = 0;
119
0
  count_in_segment = 0;
120
0
  arrays_in_segment = 0;
121
0
}
122
123
0
void ContainerMetadataCollection::Reset() {
124
0
  FlushSegment();
125
0
  container_type.clear();
126
0
  number_of_runs.clear();
127
0
  cardinality.clear();
128
0
}
129
130
// Write the metadata for the current segment
131
0
idx_t ContainerMetadataCollection::Serialize(data_ptr_t dest) const {
132
  // Element sizes (in bits) for written metadata
133
  // +======================================+
134
  // |mmmmmm|rrrrrr|aaaaaaa|                |
135
  // +======================================+
136
  //
137
  // m: 2: (1: is_run, 1: is_inverted)
138
  // r: 7: number_of_runs
139
  // a: 8: cardinality
140
141
0
  idx_t types_size = BitpackingPrimitives::GetRequiredSize(count_in_segment, CONTAINER_TYPE_BITWIDTH);
142
0
  idx_t runs_size = BitpackingPrimitives::GetRequiredSize(runs_in_segment, RUN_CONTAINER_SIZE_BITWIDTH);
143
0
  idx_t arrays_size = sizeof(uint8_t) * arrays_in_segment;
144
145
0
  idx_t types_offset = container_type.size() - count_in_segment;
146
0
  data_ptr_t types_data = (data_ptr_t)(container_type.data()); // NOLINT: c-style cast (for const)
147
0
  BitpackingPrimitives::PackBuffer<uint8_t>(dest, types_data + types_offset, count_in_segment,
148
0
                                            CONTAINER_TYPE_BITWIDTH);
149
0
  dest += types_size;
150
151
0
  if (!number_of_runs.empty()) {
152
0
    idx_t runs_offset = number_of_runs.size() - runs_in_segment;
153
0
    data_ptr_t run_data = (data_ptr_t)(number_of_runs.data()); // NOLINT: c-style cast (for const)
154
0
    BitpackingPrimitives::PackBuffer<uint8_t>(dest, run_data + runs_offset, runs_in_segment,
155
0
                                              RUN_CONTAINER_SIZE_BITWIDTH);
156
0
    dest += runs_size;
157
0
  }
158
159
0
  if (!cardinality.empty()) {
160
0
    idx_t arrays_offset = cardinality.size() - arrays_in_segment;
161
0
    data_ptr_t arrays_data = (data_ptr_t)(cardinality.data()); // NOLINT: c-style cast (for const)
162
0
    memcpy(dest, arrays_data + arrays_offset, sizeof(uint8_t) * arrays_in_segment);
163
0
  }
164
0
  return types_size + runs_size + arrays_size;
165
0
}
166
167
0
void ContainerMetadataCollection::Deserialize(data_ptr_t src, idx_t container_count) {
168
0
  container_type.resize(AlignValue<idx_t, BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE>(container_count));
169
0
  count_in_segment = container_count;
170
171
  // Load the types of the containers
172
0
  idx_t types_size = BitpackingPrimitives::GetRequiredSize(container_type.size(), 2);
173
0
  BitpackingPrimitives::UnPackBuffer<uint8_t>(container_type.data(), src, container_count, 2, true);
174
0
  src += types_size;
175
176
  // Figure out how many are run containers
177
0
  idx_t runs_count = 0;
178
0
  for (idx_t i = 0; i < container_count; i++) {
179
0
    auto type = container_type[i];
180
0
    runs_count += ((type >> 1) & 1) == 1;
181
0
  }
182
0
  runs_in_segment = runs_count;
183
0
  number_of_runs.resize(AlignValue<idx_t, BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE>(runs_count));
184
0
  cardinality.resize(container_count - runs_count);
185
186
  // Load the run containers
187
0
  if (runs_count) {
188
0
    idx_t runs_size = BitpackingPrimitives::GetRequiredSize(runs_count, RUN_CONTAINER_SIZE_BITWIDTH);
189
0
    BitpackingPrimitives::UnPackBuffer<uint8_t>(number_of_runs.data(), src, runs_count, RUN_CONTAINER_SIZE_BITWIDTH,
190
0
                                                true);
191
0
    src += runs_size;
192
0
  }
193
194
  // Load the array/bitset containers
195
0
  if (!cardinality.empty()) {
196
0
    idx_t arrays_size = sizeof(uint8_t) * cardinality.size();
197
0
    arrays_in_segment = arrays_size;
198
0
    memcpy(cardinality.data(), src, arrays_size);
199
0
  }
200
0
}
201
202
0
void ContainerMetadataCollection::AddBitsetContainer() {
203
0
  AddContainerType(false, false);
204
0
  cardinality.push_back(BITSET_CONTAINER_SENTINEL_VALUE);
205
0
  arrays_in_segment++;
206
0
  count_in_segment++;
207
0
}
208
209
0
void ContainerMetadataCollection::AddArrayContainer(idx_t amount, bool is_inverted) {
210
0
  AddContainerType(false, is_inverted);
211
0
  D_ASSERT(amount < MAX_ARRAY_IDX);
212
0
  cardinality.push_back(NumericCast<uint8_t>(amount));
213
0
  arrays_in_segment++;
214
0
  count_in_segment++;
215
0
}
216
217
0
void ContainerMetadataCollection::AddRunContainer(idx_t amount, bool is_inverted) {
218
0
  AddContainerType(true, is_inverted);
219
0
  D_ASSERT(amount < MAX_RUN_IDX);
220
0
  number_of_runs.push_back(NumericCast<uint8_t>(amount));
221
0
  runs_in_segment++;
222
0
  count_in_segment++;
223
0
}
224
225
0
void ContainerMetadataCollection::AddContainerType(bool is_run, bool is_inverted) {
226
0
  uint8_t type = 0;
227
0
  if (is_run) {
228
0
    type |= IS_RUN_FLAG;
229
0
  }
230
0
  if (is_inverted) {
231
0
    type |= IS_INVERTED_FLAG;
232
0
  }
233
0
  container_type.push_back(type);
234
0
}
235
236
ContainerMetadataCollectionScanner::ContainerMetadataCollectionScanner(ContainerMetadataCollection &collection)
237
0
    : collection(collection) {
238
0
}
239
240
0
ContainerMetadata ContainerMetadataCollectionScanner::GetNext() {
241
0
  D_ASSERT(idx < collection.count_in_segment);
242
0
  auto type = collection.container_type[idx++];
243
0
  const bool is_inverted = (type & 1) == 1;
244
0
  const bool is_run = ((type >> 1) & 1) == 1;
245
0
  uint8_t amount;
246
0
  if (is_run) {
247
0
    amount = collection.number_of_runs[run_idx++];
248
0
  } else {
249
0
    amount = collection.cardinality[array_idx++];
250
0
  }
251
0
  if (is_run) {
252
0
    return ContainerMetadata::RunContainer(amount);
253
0
  }
254
0
  if (amount == BITSET_CONTAINER_SENTINEL_VALUE) {
255
0
    return ContainerMetadata::BitsetContainer(amount);
256
0
  }
257
0
  return ContainerMetadata::ArrayContainer(amount, is_inverted);
258
0
}
259
260
} // namespace roaring
261
262
} // namespace duckdb