/src/duckdb/src/storage/compression/roaring/metadata.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include "duckdb/storage/compression/roaring/roaring.hpp" |
2 | | |
3 | | #include "duckdb/common/limits.hpp" |
4 | | #include "duckdb/common/likely.hpp" |
5 | | #include "duckdb/common/numeric_utils.hpp" |
6 | | #include "duckdb/function/compression/compression.hpp" |
7 | | #include "duckdb/function/compression_function.hpp" |
8 | | #include "duckdb/main/config.hpp" |
9 | | #include "duckdb/storage/buffer_manager.hpp" |
10 | | #include "duckdb/storage/table/column_data_checkpointer.hpp" |
11 | | #include "duckdb/storage/table/column_segment.hpp" |
12 | | #include "duckdb/storage/table/scan_state.hpp" |
13 | | #include "duckdb/storage/segment/uncompressed.hpp" |
14 | | #include "duckdb/common/fast_mem.hpp" |
15 | | #include "duckdb/common/bitpacking.hpp" |
16 | | |
17 | | namespace duckdb { |
18 | | |
19 | | namespace roaring { |
20 | | |
21 | | ContainerMetadata ContainerMetadata::CreateMetadata(uint16_t count, uint16_t array_null, uint16_t array_non_null, |
22 | 0 | uint16_t runs) { |
23 | 0 | const bool can_use_null_array = array_null < MAX_ARRAY_IDX; |
24 | 0 | const bool can_use_non_null_array = array_non_null < MAX_ARRAY_IDX; |
25 | |
|
26 | 0 | const bool can_use_run = runs < MAX_RUN_IDX; |
27 | |
|
28 | 0 | const bool can_use_array = can_use_null_array || can_use_non_null_array; |
29 | 0 | if (!can_use_array && !can_use_run) { |
30 | | // Can not efficiently encode at all, write it as bitset |
31 | 0 | return ContainerMetadata::BitsetContainer(count); |
32 | 0 | } |
33 | 0 | uint16_t null_array_cost = array_null < COMPRESSED_ARRAY_THRESHOLD |
34 | 0 | ? array_null * sizeof(uint16_t) |
35 | 0 | : COMPRESSED_SEGMENT_COUNT + (array_null * sizeof(uint8_t)); |
36 | 0 | uint16_t non_null_array_cost = array_non_null < COMPRESSED_ARRAY_THRESHOLD |
37 | 0 | ? array_non_null * sizeof(uint16_t) |
38 | 0 | : COMPRESSED_SEGMENT_COUNT + (array_non_null * sizeof(uint8_t)); |
39 | |
|
40 | 0 | uint16_t lowest_array_cost = MinValue<uint16_t>(null_array_cost, non_null_array_cost); |
41 | 0 | uint16_t lowest_run_cost = runs < COMPRESSED_RUN_THRESHOLD ? runs * sizeof(uint32_t) |
42 | 0 | : COMPRESSED_SEGMENT_COUNT + (runs * sizeof(uint16_t)); |
43 | 0 | uint16_t bitset_cost = |
44 | 0 | (AlignValue<uint16_t, ValidityMask::BITS_PER_VALUE>(count) / ValidityMask::BITS_PER_VALUE) * sizeof(validity_t); |
45 | 0 | if (MinValue<uint16_t>(lowest_array_cost, lowest_run_cost) > bitset_cost) { |
46 | | // The amount of values is too small, better off using bitset |
47 | | // we can detect this at decompression because we know how many values are left |
48 | 0 | return ContainerMetadata::BitsetContainer(count); |
49 | 0 | } |
50 | | |
51 | 0 | if (lowest_array_cost <= lowest_run_cost) { |
52 | 0 | if (array_null <= array_non_null) { |
53 | 0 | return ContainerMetadata::ArrayContainer(array_null, NULLS); |
54 | 0 | } else { |
55 | 0 | return ContainerMetadata::ArrayContainer(array_non_null, NON_NULLS); |
56 | 0 | } |
57 | 0 | } else { |
58 | 0 | return ContainerMetadata::RunContainer(runs); |
59 | 0 | } |
60 | 0 | } |
61 | | |
62 | 0 | idx_t ContainerMetadata::GetDataSizeInBytes(idx_t container_size) const { |
63 | 0 | if (IsUncompressed()) { |
64 | 0 | return (container_size / ValidityMask::BITS_PER_VALUE) * sizeof(validity_t); |
65 | 0 | } |
66 | 0 | if (IsRun()) { |
67 | 0 | auto number_of_runs = NumberOfRuns(); |
68 | 0 | if (number_of_runs >= COMPRESSED_RUN_THRESHOLD) { |
69 | 0 | return COMPRESSED_SEGMENT_COUNT + (sizeof(uint8_t) * number_of_runs * 2); |
70 | 0 | } else { |
71 | 0 | return sizeof(RunContainerRLEPair) * number_of_runs; |
72 | 0 | } |
73 | 0 | } else { |
74 | 0 | auto cardinality = Cardinality(); |
75 | 0 | if (cardinality >= COMPRESSED_ARRAY_THRESHOLD) { |
76 | 0 | return COMPRESSED_SEGMENT_COUNT + (sizeof(uint8_t) * cardinality); |
77 | 0 | } else { |
78 | 0 | return sizeof(uint16_t) * cardinality; |
79 | 0 | } |
80 | 0 | } |
81 | 0 | } |
82 | | |
83 | 0 | ContainerMetadataCollection::ContainerMetadataCollection() { |
84 | 0 | } |
85 | | |
86 | 0 | void ContainerMetadataCollection::AddMetadata(ContainerMetadata metadata) { |
87 | 0 | if (metadata.IsRun()) { |
88 | 0 | AddRunContainer(metadata.NumberOfRuns(), metadata.IsInverted()); |
89 | 0 | } else if (metadata.IsUncompressed()) { |
90 | 0 | AddBitsetContainer(); |
91 | 0 | } else { |
92 | 0 | AddArrayContainer(metadata.Cardinality(), metadata.IsInverted()); |
93 | 0 | } |
94 | 0 | } |
95 | | |
96 | 0 | idx_t ContainerMetadataCollection::GetMetadataSizeForSegment() const { |
97 | 0 | idx_t runs_count = GetRunContainerCount(); |
98 | 0 | idx_t arrays_count = GetArrayAndBitsetContainerCount(); |
99 | 0 | return GetMetadataSize(runs_count + arrays_count, runs_count, arrays_count); |
100 | 0 | } |
101 | | |
102 | | idx_t ContainerMetadataCollection::GetMetadataSize(idx_t container_count, idx_t run_containers, |
103 | 0 | idx_t array_containers) const { |
104 | 0 | idx_t types_size = BitpackingPrimitives::GetRequiredSize(container_count, CONTAINER_TYPE_BITWIDTH); |
105 | 0 | idx_t runs_size = BitpackingPrimitives::GetRequiredSize(run_containers, RUN_CONTAINER_SIZE_BITWIDTH); |
106 | 0 | idx_t arrays_size = sizeof(uint8_t) * array_containers; |
107 | 0 | return types_size + runs_size + arrays_size; |
108 | 0 | } |
109 | | |
110 | 0 | idx_t ContainerMetadataCollection::GetRunContainerCount() const { |
111 | 0 | return runs_in_segment; |
112 | 0 | } |
113 | 0 | idx_t ContainerMetadataCollection::GetArrayAndBitsetContainerCount() const { |
114 | 0 | return arrays_in_segment; |
115 | 0 | } |
116 | | |
117 | 0 | void ContainerMetadataCollection::FlushSegment() { |
118 | 0 | runs_in_segment = 0; |
119 | 0 | count_in_segment = 0; |
120 | 0 | arrays_in_segment = 0; |
121 | 0 | } |
122 | | |
123 | 0 | void ContainerMetadataCollection::Reset() { |
124 | 0 | FlushSegment(); |
125 | 0 | container_type.clear(); |
126 | 0 | number_of_runs.clear(); |
127 | 0 | cardinality.clear(); |
128 | 0 | } |
129 | | |
130 | | // Write the metadata for the current segment |
131 | 0 | idx_t ContainerMetadataCollection::Serialize(data_ptr_t dest) const { |
132 | | // Element sizes (in bits) for written metadata |
133 | | // +======================================+ |
134 | | // |mmmmmm|rrrrrr|aaaaaaa| | |
135 | | // +======================================+ |
136 | | // |
137 | | // m: 2: (1: is_run, 1: is_inverted) |
138 | | // r: 7: number_of_runs |
139 | | // a: 8: cardinality |
140 | |
|
141 | 0 | idx_t types_size = BitpackingPrimitives::GetRequiredSize(count_in_segment, CONTAINER_TYPE_BITWIDTH); |
142 | 0 | idx_t runs_size = BitpackingPrimitives::GetRequiredSize(runs_in_segment, RUN_CONTAINER_SIZE_BITWIDTH); |
143 | 0 | idx_t arrays_size = sizeof(uint8_t) * arrays_in_segment; |
144 | |
|
145 | 0 | idx_t types_offset = container_type.size() - count_in_segment; |
146 | 0 | data_ptr_t types_data = (data_ptr_t)(container_type.data()); // NOLINT: c-style cast (for const) |
147 | 0 | BitpackingPrimitives::PackBuffer<uint8_t>(dest, types_data + types_offset, count_in_segment, |
148 | 0 | CONTAINER_TYPE_BITWIDTH); |
149 | 0 | dest += types_size; |
150 | |
|
151 | 0 | if (!number_of_runs.empty()) { |
152 | 0 | idx_t runs_offset = number_of_runs.size() - runs_in_segment; |
153 | 0 | data_ptr_t run_data = (data_ptr_t)(number_of_runs.data()); // NOLINT: c-style cast (for const) |
154 | 0 | BitpackingPrimitives::PackBuffer<uint8_t>(dest, run_data + runs_offset, runs_in_segment, |
155 | 0 | RUN_CONTAINER_SIZE_BITWIDTH); |
156 | 0 | dest += runs_size; |
157 | 0 | } |
158 | |
|
159 | 0 | if (!cardinality.empty()) { |
160 | 0 | idx_t arrays_offset = cardinality.size() - arrays_in_segment; |
161 | 0 | data_ptr_t arrays_data = (data_ptr_t)(cardinality.data()); // NOLINT: c-style cast (for const) |
162 | 0 | memcpy(dest, arrays_data + arrays_offset, sizeof(uint8_t) * arrays_in_segment); |
163 | 0 | } |
164 | 0 | return types_size + runs_size + arrays_size; |
165 | 0 | } |
166 | | |
167 | 0 | void ContainerMetadataCollection::Deserialize(data_ptr_t src, idx_t container_count) { |
168 | 0 | container_type.resize(AlignValue<idx_t, BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE>(container_count)); |
169 | 0 | count_in_segment = container_count; |
170 | | |
171 | | // Load the types of the containers |
172 | 0 | idx_t types_size = BitpackingPrimitives::GetRequiredSize(container_type.size(), 2); |
173 | 0 | BitpackingPrimitives::UnPackBuffer<uint8_t>(container_type.data(), src, container_count, 2, true); |
174 | 0 | src += types_size; |
175 | | |
176 | | // Figure out how many are run containers |
177 | 0 | idx_t runs_count = 0; |
178 | 0 | for (idx_t i = 0; i < container_count; i++) { |
179 | 0 | auto type = container_type[i]; |
180 | 0 | runs_count += ((type >> 1) & 1) == 1; |
181 | 0 | } |
182 | 0 | runs_in_segment = runs_count; |
183 | 0 | number_of_runs.resize(AlignValue<idx_t, BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE>(runs_count)); |
184 | 0 | cardinality.resize(container_count - runs_count); |
185 | | |
186 | | // Load the run containers |
187 | 0 | if (runs_count) { |
188 | 0 | idx_t runs_size = BitpackingPrimitives::GetRequiredSize(runs_count, RUN_CONTAINER_SIZE_BITWIDTH); |
189 | 0 | BitpackingPrimitives::UnPackBuffer<uint8_t>(number_of_runs.data(), src, runs_count, RUN_CONTAINER_SIZE_BITWIDTH, |
190 | 0 | true); |
191 | 0 | src += runs_size; |
192 | 0 | } |
193 | | |
194 | | // Load the array/bitset containers |
195 | 0 | if (!cardinality.empty()) { |
196 | 0 | idx_t arrays_size = sizeof(uint8_t) * cardinality.size(); |
197 | 0 | arrays_in_segment = arrays_size; |
198 | 0 | memcpy(cardinality.data(), src, arrays_size); |
199 | 0 | } |
200 | 0 | } |
201 | | |
202 | 0 | void ContainerMetadataCollection::AddBitsetContainer() { |
203 | 0 | AddContainerType(false, false); |
204 | 0 | cardinality.push_back(BITSET_CONTAINER_SENTINEL_VALUE); |
205 | 0 | arrays_in_segment++; |
206 | 0 | count_in_segment++; |
207 | 0 | } |
208 | | |
209 | 0 | void ContainerMetadataCollection::AddArrayContainer(idx_t amount, bool is_inverted) { |
210 | 0 | AddContainerType(false, is_inverted); |
211 | 0 | D_ASSERT(amount < MAX_ARRAY_IDX); |
212 | 0 | cardinality.push_back(NumericCast<uint8_t>(amount)); |
213 | 0 | arrays_in_segment++; |
214 | 0 | count_in_segment++; |
215 | 0 | } |
216 | | |
217 | 0 | void ContainerMetadataCollection::AddRunContainer(idx_t amount, bool is_inverted) { |
218 | 0 | AddContainerType(true, is_inverted); |
219 | 0 | D_ASSERT(amount < MAX_RUN_IDX); |
220 | 0 | number_of_runs.push_back(NumericCast<uint8_t>(amount)); |
221 | 0 | runs_in_segment++; |
222 | 0 | count_in_segment++; |
223 | 0 | } |
224 | | |
225 | 0 | void ContainerMetadataCollection::AddContainerType(bool is_run, bool is_inverted) { |
226 | 0 | uint8_t type = 0; |
227 | 0 | if (is_run) { |
228 | 0 | type |= IS_RUN_FLAG; |
229 | 0 | } |
230 | 0 | if (is_inverted) { |
231 | 0 | type |= IS_INVERTED_FLAG; |
232 | 0 | } |
233 | 0 | container_type.push_back(type); |
234 | 0 | } |
235 | | |
236 | | ContainerMetadataCollectionScanner::ContainerMetadataCollectionScanner(ContainerMetadataCollection &collection) |
237 | 0 | : collection(collection) { |
238 | 0 | } |
239 | | |
240 | 0 | ContainerMetadata ContainerMetadataCollectionScanner::GetNext() { |
241 | 0 | D_ASSERT(idx < collection.count_in_segment); |
242 | 0 | auto type = collection.container_type[idx++]; |
243 | 0 | const bool is_inverted = (type & 1) == 1; |
244 | 0 | const bool is_run = ((type >> 1) & 1) == 1; |
245 | 0 | uint8_t amount; |
246 | 0 | if (is_run) { |
247 | 0 | amount = collection.number_of_runs[run_idx++]; |
248 | 0 | } else { |
249 | 0 | amount = collection.cardinality[array_idx++]; |
250 | 0 | } |
251 | 0 | if (is_run) { |
252 | 0 | return ContainerMetadata::RunContainer(amount); |
253 | 0 | } |
254 | 0 | if (amount == BITSET_CONTAINER_SENTINEL_VALUE) { |
255 | 0 | return ContainerMetadata::BitsetContainer(amount); |
256 | 0 | } |
257 | 0 | return ContainerMetadata::ArrayContainer(amount, is_inverted); |
258 | 0 | } |
259 | | |
260 | | } // namespace roaring |
261 | | |
262 | | } // namespace duckdb |