/src/duckdb/extension/parquet/writer/variant/convert_variant.cpp
Line | Count | Source |
1 | | #include <stdint.h> |
2 | | #include <string.h> |
3 | | #include <string> |
4 | | #include <unordered_set> |
5 | | #include <utility> |
6 | | #include <vector> |
7 | | |
8 | | #include "duckdb/common/vector/struct_vector.hpp" |
9 | | #include "writer/variant_column_writer.hpp" |
10 | | #include "duckdb/common/types/variant.hpp" |
11 | | #include "duckdb/function/scalar/variant_utils.hpp" |
12 | | #include "reader/variant/variant_binary_decoder.hpp" |
13 | | #include "duckdb/function/variant/variant_shredding.hpp" |
14 | | #include "duckdb/planner/expression_binder.hpp" |
15 | | #include "duckdb/execution/expression_executor.hpp" |
16 | | #include "duckdb/common/operator/cast_operators.hpp" |
17 | | #include "column_writer.hpp" |
18 | | #include "duckdb/common/assert.hpp" |
19 | | #include "duckdb/common/enum_util.hpp" |
20 | | #include "duckdb/common/exception.hpp" |
21 | | #include "duckdb/common/exception/binder_exception.hpp" |
22 | | #include "duckdb/common/helper.hpp" |
23 | | #include "duckdb/common/hugeint.hpp" |
24 | | #include "duckdb/common/limits.hpp" |
25 | | #include "duckdb/common/numeric_utils.hpp" |
26 | | #include "duckdb/common/optional_idx.hpp" |
27 | | #include "duckdb/common/optional_ptr.hpp" |
28 | | #include "duckdb/common/string.hpp" |
29 | | #include "duckdb/common/string_map_set.hpp" |
30 | | #include "duckdb/common/typedefs.hpp" |
31 | | #include "duckdb/common/types.hpp" |
32 | | #include "duckdb/common/types/data_chunk.hpp" |
33 | | #include "duckdb/common/types/selection_vector.hpp" |
34 | | #include "duckdb/common/types/string_type.hpp" |
35 | | #include "duckdb/common/types/uuid.hpp" |
36 | | #include "duckdb/common/types/validity_mask.hpp" |
37 | | #include "duckdb/common/types/value.hpp" |
38 | | #include "duckdb/common/types/vector.hpp" |
39 | | #include "duckdb/common/uhugeint.hpp" |
40 | | #include "duckdb/common/unique_ptr.hpp" |
41 | | #include "duckdb/common/vector.hpp" |
42 | | #include "duckdb/common/vector/flat_vector.hpp" |
43 | | #include "duckdb/common/vector/string_vector.hpp" |
44 | | #include "duckdb/common/vector/unified_vector_format.hpp" |
45 | | #include "duckdb/function/function.hpp" |
46 | | #include "duckdb/function/scalar_function.hpp" |
47 | | #include "duckdb/planner/expression.hpp" |
48 | | #include "parquet_column_schema.hpp" |
49 | | #include "parquet_types.h" |
50 | | |
51 | | namespace duckdb { |
52 | | class ClientContext; |
53 | | struct ExpressionState; |
54 | | |
55 | 0 | static idx_t CalculateByteLength(idx_t value) { |
56 | 0 | if (value == 0) { |
57 | 0 | return 1; |
58 | 0 | } |
59 | 0 | auto value_data = reinterpret_cast<data_ptr_t>(&value); |
60 | 0 | idx_t irrelevant_bytes = 0; |
61 | | //! Check how many of the most significant bytes are 0 |
62 | 0 | for (idx_t i = sizeof(idx_t); i > 0 && value_data[i - 1] == 0; i--) { |
63 | 0 | irrelevant_bytes++; |
64 | 0 | } |
65 | 0 | return sizeof(idx_t) - irrelevant_bytes; |
66 | 0 | } |
67 | | |
68 | 0 | static uint8_t EncodeMetadataHeader(idx_t byte_length) { |
69 | 0 | D_ASSERT(byte_length <= 4); |
70 | |
|
71 | 0 | uint8_t header_byte = 0; |
72 | | //! Set 'version' to 1 |
73 | 0 | header_byte |= static_cast<uint8_t>(1); |
74 | | //! Set 'sorted_strings' to 1 |
75 | 0 | header_byte |= static_cast<uint8_t>(1) << 4; |
76 | | //! Set 'offset_size_minus_one' to byte_length-1 |
77 | 0 | header_byte |= (static_cast<uint8_t>(byte_length) - 1) << 6; |
78 | |
|
79 | | #ifdef DEBUG |
80 | | auto decoded_header = VariantMetadataHeader::FromHeaderByte(header_byte); |
81 | | D_ASSERT(decoded_header.offset_size == byte_length); |
82 | | #endif |
83 | |
|
84 | 0 | return header_byte; |
85 | 0 | } |
86 | | |
87 | 0 | static void CreateMetadata(UnifiedVariantVectorData &variant, Vector &metadata, idx_t count) { |
88 | | //! NOTE: the parquet variant is limited to a max dictionary size of NumericLimits<uint32_t>::Maximum() |
89 | | //! Whereas we can have NumericLimits<uint32_t>::Maximum() *per* string in DuckDB |
90 | 0 | auto metadata_data = FlatVector::GetDataMutable<string_t>(metadata); |
91 | 0 | for (idx_t row = 0; row < count; row++) { |
92 | 0 | uint64_t dictionary_count = 0; |
93 | 0 | if (variant.RowIsValid(row)) { |
94 | 0 | dictionary_count = variant.GetKeysCount(row); |
95 | 0 | } |
96 | 0 | idx_t dictionary_size = 0; |
97 | 0 | for (idx_t i = 0; i < dictionary_count; i++) { |
98 | 0 | auto &key = variant.GetKey(row, i); |
99 | 0 | dictionary_size += key.GetSize(); |
100 | 0 | } |
101 | 0 | if (dictionary_size >= NumericLimits<uint32_t>::Maximum()) { |
102 | 0 | throw InvalidInputException("The total length of the dictionary exceeds a 4 byte value (uint32_t), failed " |
103 | 0 | "to export VARIANT to Parquet"); |
104 | 0 | } |
105 | | |
106 | 0 | auto byte_length = CalculateByteLength(dictionary_size); |
107 | 0 | auto total_length = 1 + (byte_length * (dictionary_count + 2)) + dictionary_size; |
108 | |
|
109 | 0 | metadata_data[row] = StringVector::EmptyString(metadata, total_length); |
110 | 0 | auto &metadata_blob = metadata_data[row]; |
111 | 0 | auto metadata_blob_data = metadata_blob.GetDataWriteable(); |
112 | |
|
113 | 0 | metadata_blob_data[0] = static_cast<char>(EncodeMetadataHeader(byte_length)); |
114 | 0 | memcpy(metadata_blob_data + 1, const_data_ptr_cast(&dictionary_count), byte_length); |
115 | |
|
116 | 0 | auto offset_ptr = metadata_blob_data + 1 + byte_length; |
117 | 0 | auto string_ptr = |
118 | 0 | metadata_blob_data + 1 + byte_length + (NumericCast<idx_t>(dictionary_count + 1) * byte_length); |
119 | 0 | idx_t total_offset = 0; |
120 | 0 | for (idx_t i = 0; i < dictionary_count; i++) { |
121 | 0 | memcpy(offset_ptr + (i * byte_length), const_data_ptr_cast(&total_offset), byte_length); |
122 | 0 | auto &key = variant.GetKey(row, i); |
123 | |
|
124 | 0 | memcpy(string_ptr + total_offset, key.GetData(), key.GetSize()); |
125 | 0 | total_offset += key.GetSize(); |
126 | 0 | } |
127 | 0 | memcpy(offset_ptr + (NumericCast<idx_t>(dictionary_count) * byte_length), const_data_ptr_cast(&total_offset), |
128 | 0 | byte_length); |
129 | 0 | D_ASSERT(offset_ptr + (NumericCast<idx_t>(dictionary_count + 1) * byte_length) == string_ptr); |
130 | 0 | D_ASSERT(string_ptr + total_offset == metadata_blob_data + total_length); |
131 | 0 | metadata_blob.SetSizeAndFinalize(total_length, total_length); |
132 | |
|
133 | | #ifdef DEBUG |
134 | | auto decoded_metadata = VariantMetadata(metadata_blob); |
135 | | D_ASSERT(decoded_metadata.strings.size() == dictionary_count); |
136 | | for (idx_t i = 0; i < dictionary_count; i++) { |
137 | | D_ASSERT(decoded_metadata.strings[i] == variant.GetKey(row, i).GetString()); |
138 | | } |
139 | | #endif |
140 | 0 | } |
141 | 0 | } |
142 | | |
143 | | namespace { |
144 | | |
145 | 0 | static unordered_set<VariantLogicalType> GetVariantType(const LogicalType &type) { |
146 | 0 | if (type.id() == LogicalTypeId::ANY) { |
147 | 0 | return {}; |
148 | 0 | } |
149 | 0 | switch (type.id()) { |
150 | 0 | case LogicalTypeId::STRUCT: |
151 | 0 | return {VariantLogicalType::OBJECT}; |
152 | 0 | case LogicalTypeId::LIST: |
153 | 0 | return {VariantLogicalType::ARRAY}; |
154 | 0 | case LogicalTypeId::BOOLEAN: |
155 | 0 | return {VariantLogicalType::BOOL_TRUE, VariantLogicalType::BOOL_FALSE}; |
156 | 0 | case LogicalTypeId::TINYINT: |
157 | 0 | return {VariantLogicalType::INT8}; |
158 | 0 | case LogicalTypeId::SMALLINT: |
159 | 0 | return {VariantLogicalType::INT16}; |
160 | 0 | case LogicalTypeId::INTEGER: |
161 | 0 | return {VariantLogicalType::INT32}; |
162 | 0 | case LogicalTypeId::BIGINT: |
163 | 0 | return {VariantLogicalType::INT64}; |
164 | 0 | case LogicalTypeId::FLOAT: |
165 | 0 | return {VariantLogicalType::FLOAT}; |
166 | 0 | case LogicalTypeId::DOUBLE: |
167 | 0 | return {VariantLogicalType::DOUBLE}; |
168 | 0 | case LogicalTypeId::DECIMAL: |
169 | 0 | return {VariantLogicalType::DECIMAL}; |
170 | 0 | case LogicalTypeId::DATE: |
171 | 0 | return {VariantLogicalType::DATE}; |
172 | 0 | case LogicalTypeId::TIME: |
173 | 0 | return {VariantLogicalType::TIME_MICROS}; |
174 | 0 | case LogicalTypeId::TIMESTAMP_TZ: |
175 | 0 | return {VariantLogicalType::TIMESTAMP_MICROS_TZ}; |
176 | 0 | case LogicalTypeId::TIMESTAMP_TZ_NS: |
177 | 0 | return {VariantLogicalType::TIMESTAMP_NANOS_TZ}; |
178 | 0 | case LogicalTypeId::TIMESTAMP: |
179 | 0 | return {VariantLogicalType::TIMESTAMP_MICROS}; |
180 | 0 | case LogicalTypeId::TIMESTAMP_NS: |
181 | 0 | return {VariantLogicalType::TIMESTAMP_NANOS}; |
182 | 0 | case LogicalTypeId::BLOB: |
183 | 0 | return {VariantLogicalType::BLOB}; |
184 | 0 | case LogicalTypeId::VARCHAR: |
185 | 0 | return {VariantLogicalType::VARCHAR}; |
186 | 0 | case LogicalTypeId::UUID: |
187 | 0 | return {VariantLogicalType::UUID}; |
188 | 0 | default: |
189 | 0 | throw BinderException("Type '%s' can't be translated to a VARIANT type", type.ToString()); |
190 | 0 | } |
191 | 0 | } |
192 | | |
193 | | struct ParquetVariantShreddingState : public VariantShreddingState { |
194 | | public: |
195 | | ParquetVariantShreddingState(const LogicalType &type, idx_t total_count) |
196 | 0 | : VariantShreddingState(type, total_count), variant_types(GetVariantType(type)) { |
197 | 0 | } |
198 | | |
199 | | public: |
200 | 0 | const unordered_set<VariantLogicalType> &GetVariantTypes() override { |
201 | 0 | return variant_types; |
202 | 0 | } |
203 | | |
204 | | private: |
205 | | unordered_set<VariantLogicalType> variant_types; |
206 | | }; |
207 | | |
208 | | struct ParquetVariantShredding : public VariantShredding { |
209 | 0 | ParquetVariantShredding() { |
210 | | // for parquet untyped ("value") comes before typed ("typed_value") |
211 | 0 | untyped_value_index = 0; |
212 | 0 | typed_value_index = 1; |
213 | 0 | } |
214 | | |
215 | | void WriteVariantValues(UnifiedVariantVectorData &variant, Vector &result, optional_ptr<const SelectionVector> sel, |
216 | | optional_ptr<const SelectionVector> value_index_sel, |
217 | | optional_ptr<const SelectionVector> result_sel, idx_t count) override; |
218 | | |
219 | | protected: |
220 | | void WriteMissingField(Vector &vector, idx_t index) override; |
221 | | }; |
222 | | |
223 | | } // namespace |
224 | | |
225 | | vector<idx_t> GetChildIndices(const UnifiedVariantVectorData &variant, idx_t row, const VariantNestedData &nested_data, |
226 | 0 | optional_ptr<ParquetVariantShreddingState> shredding_state) { |
227 | 0 | vector<idx_t> child_indices; |
228 | 0 | if (!shredding_state || shredding_state->type.id() != LogicalTypeId::STRUCT) { |
229 | 0 | for (idx_t i = 0; i < nested_data.child_count; i++) { |
230 | 0 | child_indices.push_back(i); |
231 | 0 | } |
232 | 0 | return child_indices; |
233 | 0 | } |
234 | | //! FIXME: The variant spec says that field names should be case-sensitive, not insensitive |
235 | 0 | case_insensitive_string_set_t shredded_fields = shredding_state->ObjectFields(); |
236 | |
|
237 | 0 | for (idx_t i = 0; i < nested_data.child_count; i++) { |
238 | 0 | auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx); |
239 | 0 | auto &key = variant.GetKey(row, keys_index); |
240 | |
|
241 | 0 | if (shredded_fields.count(key)) { |
242 | | //! This field is shredded on, omit it from the value |
243 | 0 | continue; |
244 | 0 | } |
245 | 0 | child_indices.push_back(i); |
246 | 0 | } |
247 | 0 | return child_indices; |
248 | 0 | } |
249 | | |
250 | | static idx_t AnalyzeValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index, |
251 | 0 | vector<uint32_t> &offsets, optional_ptr<ParquetVariantShreddingState> shredding_state) { |
252 | 0 | idx_t total_size = 0; |
253 | | //! Every value has at least a value header |
254 | 0 | total_size++; |
255 | |
|
256 | 0 | idx_t offset_size = offsets.size(); |
257 | 0 | VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL; |
258 | 0 | if (variant.RowIsValid(row)) { |
259 | 0 | type_id = variant.GetTypeId(row, values_index); |
260 | 0 | } |
261 | 0 | switch (type_id) { |
262 | 0 | case VariantLogicalType::OBJECT: { |
263 | 0 | auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index); |
264 | | |
265 | | //! Calculate value and key offsets for all children |
266 | 0 | idx_t total_offset = 0; |
267 | 0 | uint32_t highest_keys_index = 0; |
268 | |
|
269 | 0 | auto child_indices = GetChildIndices(variant, row, nested_data, shredding_state); |
270 | 0 | if (nested_data.child_count && child_indices.empty()) { |
271 | | //! All fields of the object are shredded, omit the object entirely |
272 | 0 | return 0; |
273 | 0 | } |
274 | | |
275 | 0 | auto num_elements = child_indices.size(); |
276 | 0 | offsets.resize(offset_size + num_elements + 1); |
277 | |
|
278 | 0 | for (idx_t entry = 0; entry < child_indices.size(); entry++) { |
279 | 0 | auto i = child_indices[entry]; |
280 | 0 | auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx); |
281 | 0 | auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx); |
282 | 0 | offsets[offset_size + entry] = total_offset; |
283 | |
|
284 | 0 | total_offset += AnalyzeValueData(variant, row, values_index, offsets, nullptr); |
285 | 0 | highest_keys_index = MaxValue(highest_keys_index, keys_index); |
286 | 0 | } |
287 | 0 | offsets[offset_size + num_elements] = total_offset; |
288 | | |
289 | | //! Calculate the sizes for the objects value data |
290 | 0 | auto field_id_size = CalculateByteLength(highest_keys_index); |
291 | 0 | auto field_offset_size = CalculateByteLength(total_offset); |
292 | 0 | const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum(); |
293 | | |
294 | | //! Now add the sizes for the objects value data |
295 | 0 | if (is_large) { |
296 | 0 | total_size += sizeof(uint32_t); |
297 | 0 | } else { |
298 | 0 | total_size += sizeof(uint8_t); |
299 | 0 | } |
300 | 0 | total_size += num_elements * field_id_size; |
301 | 0 | total_size += (num_elements + 1) * field_offset_size; |
302 | 0 | total_size += total_offset; |
303 | 0 | break; |
304 | 0 | } |
305 | 0 | case VariantLogicalType::ARRAY: { |
306 | 0 | auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index); |
307 | |
|
308 | 0 | idx_t total_offset = 0; |
309 | 0 | offsets.resize(offset_size + nested_data.child_count + 1); |
310 | 0 | for (idx_t i = 0; i < nested_data.child_count; i++) { |
311 | 0 | auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx); |
312 | 0 | offsets[offset_size + i] = total_offset; |
313 | |
|
314 | 0 | total_offset += AnalyzeValueData(variant, row, values_index, offsets, nullptr); |
315 | 0 | } |
316 | 0 | offsets[offset_size + nested_data.child_count] = total_offset; |
317 | |
|
318 | 0 | auto field_offset_size = CalculateByteLength(total_offset); |
319 | 0 | auto num_elements = nested_data.child_count; |
320 | 0 | const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum(); |
321 | |
|
322 | 0 | if (is_large) { |
323 | 0 | total_size += sizeof(uint32_t); |
324 | 0 | } else { |
325 | 0 | total_size += sizeof(uint8_t); |
326 | 0 | } |
327 | 0 | total_size += (num_elements + 1) * field_offset_size; |
328 | 0 | total_size += total_offset; |
329 | 0 | break; |
330 | 0 | } |
331 | 0 | case VariantLogicalType::BLOB: |
332 | 0 | case VariantLogicalType::VARCHAR: { |
333 | 0 | auto string_value = VariantUtils::DecodeStringData(variant, row, values_index); |
334 | 0 | total_size += string_value.GetSize(); |
335 | 0 | if (type_id == VariantLogicalType::BLOB || string_value.GetSize() > 64) { |
336 | | //! Save as regular string value |
337 | 0 | total_size += sizeof(uint32_t); |
338 | 0 | } |
339 | 0 | break; |
340 | 0 | } |
341 | 0 | case VariantLogicalType::VARIANT_NULL: |
342 | 0 | case VariantLogicalType::BOOL_TRUE: |
343 | 0 | case VariantLogicalType::BOOL_FALSE: |
344 | 0 | break; |
345 | 0 | case VariantLogicalType::INT8: |
346 | 0 | total_size += sizeof(uint8_t); |
347 | 0 | break; |
348 | 0 | case VariantLogicalType::INT16: |
349 | 0 | total_size += sizeof(uint16_t); |
350 | 0 | break; |
351 | 0 | case VariantLogicalType::INT32: |
352 | 0 | total_size += sizeof(uint32_t); |
353 | 0 | break; |
354 | 0 | case VariantLogicalType::INT64: |
355 | 0 | total_size += sizeof(uint64_t); |
356 | 0 | break; |
357 | 0 | case VariantLogicalType::FLOAT: |
358 | 0 | total_size += sizeof(float); |
359 | 0 | break; |
360 | 0 | case VariantLogicalType::DOUBLE: |
361 | 0 | total_size += sizeof(double); |
362 | 0 | break; |
363 | 0 | case VariantLogicalType::DECIMAL: { |
364 | 0 | auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index); |
365 | 0 | total_size += 1; |
366 | 0 | if (decimal_data.width <= 9) { |
367 | 0 | total_size += sizeof(int32_t); |
368 | 0 | } else if (decimal_data.width <= 18) { |
369 | 0 | total_size += sizeof(int64_t); |
370 | 0 | } else if (decimal_data.width <= 38) { |
371 | 0 | total_size += sizeof(uhugeint_t); |
372 | 0 | } else { |
373 | 0 | throw InvalidInputException("Can't convert VARIANT DECIMAL(%d, %d) to Parquet VARIANT", decimal_data.width, |
374 | 0 | decimal_data.scale); |
375 | 0 | } |
376 | 0 | break; |
377 | 0 | } |
378 | 0 | case VariantLogicalType::UUID: |
379 | 0 | total_size += sizeof(uhugeint_t); |
380 | 0 | break; |
381 | 0 | case VariantLogicalType::DATE: |
382 | 0 | total_size += sizeof(uint32_t); |
383 | 0 | break; |
384 | 0 | case VariantLogicalType::TIME_MICROS: |
385 | 0 | case VariantLogicalType::TIMESTAMP_MICROS: |
386 | 0 | case VariantLogicalType::TIMESTAMP_NANOS: |
387 | 0 | case VariantLogicalType::TIMESTAMP_MICROS_TZ: |
388 | 0 | total_size += sizeof(uint64_t); |
389 | 0 | break; |
390 | 0 | case VariantLogicalType::UINT8: |
391 | | // store as int16_t |
392 | 0 | total_size += sizeof(int16_t); |
393 | 0 | break; |
394 | 0 | case VariantLogicalType::UINT16: |
395 | | // store as int32_t |
396 | 0 | total_size += sizeof(int32_t); |
397 | 0 | break; |
398 | 0 | case VariantLogicalType::UINT32: |
399 | 0 | case VariantLogicalType::UINT64: |
400 | 0 | case VariantLogicalType::UINT128: |
401 | 0 | case VariantLogicalType::INT128: |
402 | | // try to store as int64_t - fail if it doesn't fit |
403 | 0 | total_size += sizeof(int64_t); |
404 | 0 | break; |
405 | 0 | case VariantLogicalType::INTERVAL: |
406 | 0 | case VariantLogicalType::BIGNUM: |
407 | 0 | case VariantLogicalType::BITSTRING: |
408 | 0 | case VariantLogicalType::TIMESTAMP_MILIS: |
409 | 0 | case VariantLogicalType::TIMESTAMP_SEC: |
410 | 0 | case VariantLogicalType::TIME_MICROS_TZ: |
411 | 0 | case VariantLogicalType::TIME_NANOS: |
412 | 0 | default: |
413 | 0 | throw InvalidInputException("Can't convert VARIANT of type '%s' to Parquet VARIANT", |
414 | 0 | EnumUtil::ToString(type_id)); |
415 | 0 | } |
416 | | |
417 | 0 | return total_size; |
418 | 0 | } |
419 | | |
420 | | template <VariantPrimitiveType TYPE_ID> |
421 | 0 | void WritePrimitiveTypeHeader(data_ptr_t &value_data) { |
422 | 0 | uint8_t value_header = 0; |
423 | 0 | value_header |= static_cast<uint8_t>(VariantBasicType::PRIMITIVE); |
424 | 0 | value_header |= static_cast<uint8_t>(TYPE_ID) << 2; |
425 | |
|
426 | 0 | *value_data = value_header; |
427 | 0 | value_data++; |
428 | 0 | } Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)15>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)16>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)0>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)1>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)2>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)3>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)4>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)5>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)6>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)14>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)7>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)20>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)11>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)17>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)13>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)19>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)12>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)8>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)9>(unsigned char*&) Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)10>(unsigned char*&) |
429 | | |
430 | | struct VariantSimpleCopy { |
431 | | template <class T> |
432 | 0 | static void CopyValue(const_data_ptr_t source, data_ptr_t target) { |
433 | 0 | memcpy(target, source, sizeof(T)); |
434 | 0 | } Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<signed char>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<short>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<int>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<long>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<float>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<double>(unsigned char const*, unsigned char*) |
435 | | }; |
436 | | |
437 | | template <class SRC> |
438 | | struct VariantSimpleConversion { |
439 | | template <class T> |
440 | 0 | static void CopyValue(const_data_ptr_t source, data_ptr_t target) { |
441 | 0 | auto src = Load<SRC>(source); |
442 | 0 | Store(static_cast<T>(src), target); |
443 | 0 | } Unexecuted instantiation: void duckdb::VariantSimpleConversion<unsigned char>::CopyValue<short>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleConversion<unsigned short>::CopyValue<int>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantSimpleConversion<unsigned int>::CopyValue<long>(unsigned char const*, unsigned char*) |
444 | | }; |
445 | | |
446 | | template <class SRC> |
447 | | struct VariantTryConvert { |
448 | | template <class T> |
449 | 0 | static void CopyValue(const_data_ptr_t source, data_ptr_t target) { |
450 | 0 | auto src = Load<SRC>(source); |
451 | 0 | Store(Cast::Operation<SRC, T>(src), target); |
452 | 0 | } Unexecuted instantiation: void duckdb::VariantTryConvert<unsigned long>::CopyValue<long>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantTryConvert<duckdb::uhugeint_t>::CopyValue<long>(unsigned char const*, unsigned char*) Unexecuted instantiation: void duckdb::VariantTryConvert<duckdb::hugeint_t>::CopyValue<long>(unsigned char const*, unsigned char*) |
453 | | }; |
454 | | |
455 | | template <class T, class OP = VariantSimpleCopy> |
456 | | void CopySimplePrimitiveData(const UnifiedVariantVectorData &variant, data_ptr_t &value_data, idx_t row, |
457 | 0 | uint32_t values_index) { |
458 | 0 | auto byte_offset = variant.GetByteOffset(row, values_index); |
459 | 0 | auto data = const_data_ptr_cast(variant.GetData(row).GetData()); |
460 | 0 | auto ptr = data + byte_offset; |
461 | 0 | OP::template CopyValue<T>(ptr, value_data); |
462 | 0 | value_data += sizeof(T); |
463 | 0 | } Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<signed char, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<short, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<int, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<float, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<double, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<short, duckdb::VariantSimpleConversion<unsigned char> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<int, duckdb::VariantSimpleConversion<unsigned short> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantSimpleConversion<unsigned int> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantTryConvert<unsigned long> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantTryConvert<duckdb::uhugeint_t> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantTryConvert<duckdb::hugeint_t> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int) |
464 | | |
465 | 0 | void CopyUUIDData(const UnifiedVariantVectorData &variant, data_ptr_t &value_data, idx_t row, uint32_t values_index) { |
466 | 0 | auto byte_offset = variant.GetByteOffset(row, values_index); |
467 | 0 | auto data = const_data_ptr_cast(variant.GetData(row).GetData()); |
468 | 0 | auto ptr = data + byte_offset; |
469 | |
|
470 | 0 | auto uuid = Load<uhugeint_t>(ptr); |
471 | 0 | BaseUUID::ToBlob(uuid, value_data); |
472 | 0 | value_data += sizeof(uhugeint_t); |
473 | 0 | } |
474 | | |
475 | | static void WritePrimitiveValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index, |
476 | 0 | data_ptr_t &value_data, const vector<uint32_t> &offsets, idx_t &offset_index) { |
477 | 0 | VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL; |
478 | 0 | if (variant.RowIsValid(row)) { |
479 | 0 | type_id = variant.GetTypeId(row, values_index); |
480 | 0 | } |
481 | |
|
482 | 0 | D_ASSERT(type_id != VariantLogicalType::OBJECT && type_id != VariantLogicalType::ARRAY); |
483 | 0 | switch (type_id) { |
484 | 0 | case VariantLogicalType::BLOB: |
485 | 0 | case VariantLogicalType::VARCHAR: { |
486 | 0 | auto string_value = VariantUtils::DecodeStringData(variant, row, values_index); |
487 | 0 | auto string_size = string_value.GetSize(); |
488 | 0 | if (type_id == VariantLogicalType::BLOB || string_size > 64) { |
489 | 0 | if (type_id == VariantLogicalType::BLOB) { |
490 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::BINARY>(value_data); |
491 | 0 | } else { |
492 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::STRING>(value_data); |
493 | 0 | } |
494 | 0 | Store<uint32_t>(string_size, value_data); |
495 | 0 | value_data += sizeof(uint32_t); |
496 | 0 | } else { |
497 | 0 | uint8_t value_header = 0; |
498 | 0 | value_header |= static_cast<uint8_t>(VariantBasicType::SHORT_STRING); |
499 | 0 | value_header |= static_cast<uint8_t>(string_size) << 2; |
500 | |
|
501 | 0 | *value_data = value_header; |
502 | 0 | value_data++; |
503 | 0 | } |
504 | 0 | memcpy(value_data, reinterpret_cast<const char *>(string_value.GetData()), string_size); |
505 | 0 | value_data += string_size; |
506 | 0 | break; |
507 | 0 | } |
508 | 0 | case VariantLogicalType::VARIANT_NULL: |
509 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::NULL_TYPE>(value_data); |
510 | 0 | break; |
511 | 0 | case VariantLogicalType::BOOL_TRUE: |
512 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::BOOLEAN_TRUE>(value_data); |
513 | 0 | break; |
514 | 0 | case VariantLogicalType::BOOL_FALSE: |
515 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::BOOLEAN_FALSE>(value_data); |
516 | 0 | break; |
517 | 0 | case VariantLogicalType::INT8: |
518 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT8>(value_data); |
519 | 0 | CopySimplePrimitiveData<int8_t>(variant, value_data, row, values_index); |
520 | 0 | break; |
521 | 0 | case VariantLogicalType::INT16: |
522 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT16>(value_data); |
523 | 0 | CopySimplePrimitiveData<int16_t>(variant, value_data, row, values_index); |
524 | 0 | break; |
525 | 0 | case VariantLogicalType::INT32: |
526 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT32>(value_data); |
527 | 0 | CopySimplePrimitiveData<int32_t>(variant, value_data, row, values_index); |
528 | 0 | break; |
529 | 0 | case VariantLogicalType::INT64: |
530 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data); |
531 | 0 | CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index); |
532 | 0 | break; |
533 | 0 | case VariantLogicalType::FLOAT: |
534 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::FLOAT>(value_data); |
535 | 0 | CopySimplePrimitiveData<float>(variant, value_data, row, values_index); |
536 | 0 | break; |
537 | 0 | case VariantLogicalType::DOUBLE: |
538 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::DOUBLE>(value_data); |
539 | 0 | CopySimplePrimitiveData<double>(variant, value_data, row, values_index); |
540 | 0 | break; |
541 | 0 | case VariantLogicalType::UUID: |
542 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::UUID>(value_data); |
543 | 0 | CopyUUIDData(variant, value_data, row, values_index); |
544 | 0 | break; |
545 | 0 | case VariantLogicalType::DATE: |
546 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::DATE>(value_data); |
547 | 0 | CopySimplePrimitiveData<int32_t>(variant, value_data, row, values_index); |
548 | 0 | break; |
549 | 0 | case VariantLogicalType::TIME_MICROS: |
550 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::TIME_NTZ_MICROS>(value_data); |
551 | 0 | CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index); |
552 | 0 | break; |
553 | 0 | case VariantLogicalType::TIMESTAMP_MICROS: |
554 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_NTZ_MICROS>(value_data); |
555 | 0 | CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index); |
556 | 0 | break; |
557 | 0 | case VariantLogicalType::TIMESTAMP_NANOS: |
558 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_NTZ_NANOS>(value_data); |
559 | 0 | CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index); |
560 | 0 | break; |
561 | 0 | case VariantLogicalType::TIMESTAMP_MICROS_TZ: |
562 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_MICROS>(value_data); |
563 | 0 | CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index); |
564 | 0 | break; |
565 | 0 | case VariantLogicalType::DECIMAL: { |
566 | 0 | auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index); |
567 | |
|
568 | 0 | if (decimal_data.width > 38) { |
569 | 0 | throw InvalidInputException("Can't convert VARIANT DECIMAL(%d, %d) to Parquet VARIANT", decimal_data.width, |
570 | 0 | decimal_data.scale); |
571 | 0 | } else if (decimal_data.width <= 4) { |
572 | | // DuckDB uses INT16 to store small decimals, but parquet only supports DECIMAL4 at minimum, so here we |
573 | | // promote to INT32. |
574 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL4>(value_data); |
575 | 0 | Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data); |
576 | 0 | value_data++; |
577 | 0 | const int32_t promoted = Load<int16_t>(decimal_data.value_ptr); |
578 | 0 | Store<int32_t>(promoted, value_data); |
579 | 0 | value_data += sizeof(int32_t); |
580 | 0 | } else if (decimal_data.width <= 9) { |
581 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL4>(value_data); |
582 | 0 | Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data); |
583 | 0 | value_data++; |
584 | 0 | memcpy(value_data, decimal_data.value_ptr, sizeof(int32_t)); |
585 | 0 | value_data += sizeof(int32_t); |
586 | 0 | } else if (decimal_data.width <= 18) { |
587 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL8>(value_data); |
588 | 0 | Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data); |
589 | 0 | value_data++; |
590 | 0 | memcpy(value_data, decimal_data.value_ptr, sizeof(int64_t)); |
591 | 0 | value_data += sizeof(int64_t); |
592 | 0 | } else if (decimal_data.width <= 38) { |
593 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL16>(value_data); |
594 | 0 | Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data); |
595 | 0 | value_data++; |
596 | 0 | memcpy(value_data, decimal_data.value_ptr, sizeof(hugeint_t)); |
597 | 0 | value_data += sizeof(hugeint_t); |
598 | 0 | } else { |
599 | 0 | throw InternalException( |
600 | 0 | "Uncovered VARIANT(DECIMAL) -> Parquet VARIANT conversion for type 'DECIMAL(%d, %d)'", |
601 | 0 | decimal_data.width, decimal_data.scale); |
602 | 0 | } |
603 | 0 | break; |
604 | 0 | } |
605 | 0 | case VariantLogicalType::UINT8: |
606 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT16>(value_data); |
607 | 0 | CopySimplePrimitiveData<int16_t, VariantSimpleConversion<uint8_t>>(variant, value_data, row, values_index); |
608 | 0 | break; |
609 | 0 | case VariantLogicalType::UINT16: |
610 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT32>(value_data); |
611 | 0 | CopySimplePrimitiveData<int32_t, VariantSimpleConversion<uint16_t>>(variant, value_data, row, values_index); |
612 | 0 | break; |
613 | 0 | case VariantLogicalType::UINT32: |
614 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data); |
615 | 0 | CopySimplePrimitiveData<int64_t, VariantSimpleConversion<uint32_t>>(variant, value_data, row, values_index); |
616 | 0 | break; |
617 | 0 | case VariantLogicalType::UINT64: |
618 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data); |
619 | 0 | CopySimplePrimitiveData<int64_t, VariantTryConvert<uint64_t>>(variant, value_data, row, values_index); |
620 | 0 | break; |
621 | 0 | case VariantLogicalType::UINT128: |
622 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data); |
623 | 0 | CopySimplePrimitiveData<int64_t, VariantTryConvert<uhugeint_t>>(variant, value_data, row, values_index); |
624 | 0 | break; |
625 | 0 | case VariantLogicalType::INT128: |
626 | 0 | WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data); |
627 | 0 | CopySimplePrimitiveData<int64_t, VariantTryConvert<hugeint_t>>(variant, value_data, row, values_index); |
628 | 0 | break; |
629 | 0 | case VariantLogicalType::INTERVAL: |
630 | 0 | case VariantLogicalType::BIGNUM: |
631 | 0 | case VariantLogicalType::BITSTRING: |
632 | 0 | case VariantLogicalType::TIMESTAMP_MILIS: |
633 | 0 | case VariantLogicalType::TIMESTAMP_SEC: |
634 | 0 | case VariantLogicalType::TIME_MICROS_TZ: |
635 | 0 | case VariantLogicalType::TIME_NANOS: |
636 | 0 | default: |
637 | 0 | throw InvalidInputException("Can't convert VARIANT of type '%s' to Parquet VARIANT", |
638 | 0 | EnumUtil::ToString(type_id)); |
639 | 0 | } |
640 | 0 | } |
641 | | |
642 | | static void WriteValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index, |
643 | | data_ptr_t &value_data, const vector<uint32_t> &offsets, idx_t &offset_index, |
644 | 0 | optional_ptr<ParquetVariantShreddingState> shredding_state) { |
645 | 0 | VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL; |
646 | 0 | if (variant.RowIsValid(row)) { |
647 | 0 | type_id = variant.GetTypeId(row, values_index); |
648 | 0 | } |
649 | 0 | if (type_id == VariantLogicalType::OBJECT) { |
650 | 0 | auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index); |
651 | | |
652 | | //! -- Object value header -- |
653 | |
|
654 | 0 | auto child_indices = GetChildIndices(variant, row, nested_data, shredding_state); |
655 | 0 | if (nested_data.child_count && child_indices.empty()) { |
656 | 0 | throw InternalException( |
657 | 0 | "The entire should be omitted, should have been handled by the Analyze step already"); |
658 | 0 | } |
659 | 0 | auto num_elements = child_indices.size(); |
660 | | |
661 | | //! Determine the 'field_id_size' |
662 | 0 | uint32_t highest_keys_index = 0; |
663 | 0 | for (auto &i : child_indices) { |
664 | 0 | auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx); |
665 | 0 | highest_keys_index = MaxValue(highest_keys_index, keys_index); |
666 | 0 | } |
667 | 0 | auto field_id_size = CalculateByteLength(highest_keys_index); |
668 | |
|
669 | 0 | uint32_t last_offset = 0; |
670 | 0 | if (num_elements) { |
671 | 0 | last_offset = offsets[offset_index + num_elements]; |
672 | 0 | } |
673 | 0 | offset_index += num_elements + 1; |
674 | 0 | auto field_offset_size = CalculateByteLength(last_offset); |
675 | |
|
676 | 0 | const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum(); |
677 | |
|
678 | 0 | uint8_t value_header = 0; |
679 | 0 | value_header |= static_cast<uint8_t>(VariantBasicType::OBJECT); |
680 | 0 | value_header |= static_cast<uint8_t>(is_large) << 6; |
681 | 0 | value_header |= (static_cast<uint8_t>(field_id_size) - 1) << 4; |
682 | 0 | value_header |= (static_cast<uint8_t>(field_offset_size) - 1) << 2; |
683 | |
|
684 | | #ifdef DEBUG |
685 | | auto object_value_header = VariantValueMetadata::FromHeaderByte(value_header); |
686 | | D_ASSERT(object_value_header.basic_type == VariantBasicType::OBJECT); |
687 | | D_ASSERT(object_value_header.is_large == is_large); |
688 | | D_ASSERT(object_value_header.field_offset_size == field_offset_size); |
689 | | D_ASSERT(object_value_header.field_id_size == field_id_size); |
690 | | #endif |
691 | |
|
692 | 0 | *value_data = value_header; |
693 | 0 | value_data++; |
694 | | |
695 | | //! Write the 'num_elements' |
696 | 0 | if (is_large) { |
697 | 0 | Store<uint32_t>(static_cast<uint32_t>(num_elements), value_data); |
698 | 0 | value_data += sizeof(uint32_t); |
699 | 0 | } else { |
700 | 0 | Store<uint8_t>(static_cast<uint8_t>(num_elements), value_data); |
701 | 0 | value_data += sizeof(uint8_t); |
702 | 0 | } |
703 | | |
704 | | //! Write the 'field_id' entries |
705 | 0 | for (auto &i : child_indices) { |
706 | 0 | auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx); |
707 | 0 | memcpy(value_data, reinterpret_cast<data_ptr_t>(&keys_index), field_id_size); |
708 | 0 | value_data += field_id_size; |
709 | 0 | } |
710 | | |
711 | | //! Write the 'field_offset' entries and the child 'value's |
712 | 0 | auto children_ptr = value_data + ((num_elements + 1) * field_offset_size); |
713 | 0 | idx_t total_offset = 0; |
714 | 0 | for (auto &i : child_indices) { |
715 | 0 | auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx); |
716 | |
|
717 | 0 | memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size); |
718 | 0 | value_data += field_offset_size; |
719 | 0 | auto start_ptr = children_ptr; |
720 | 0 | WriteValueData(variant, row, values_index, children_ptr, offsets, offset_index, nullptr); |
721 | 0 | total_offset += (children_ptr - start_ptr); |
722 | 0 | } |
723 | 0 | memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size); |
724 | 0 | value_data += field_offset_size; |
725 | 0 | D_ASSERT(children_ptr - total_offset == value_data); |
726 | 0 | value_data = children_ptr; |
727 | 0 | } else if (type_id == VariantLogicalType::ARRAY) { |
728 | 0 | auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index); |
729 | | |
730 | | //! -- Array value header -- |
731 | |
|
732 | 0 | uint32_t last_offset = 0; |
733 | 0 | if (nested_data.child_count) { |
734 | 0 | last_offset = offsets[offset_index + nested_data.child_count]; |
735 | 0 | } |
736 | 0 | offset_index += nested_data.child_count + 1; |
737 | 0 | auto field_offset_size = CalculateByteLength(last_offset); |
738 | |
|
739 | 0 | auto num_elements = nested_data.child_count; |
740 | 0 | const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum(); |
741 | |
|
742 | 0 | uint8_t value_header = 0; |
743 | 0 | value_header |= static_cast<uint8_t>(VariantBasicType::ARRAY); |
744 | 0 | value_header |= static_cast<uint8_t>(is_large) << 4; |
745 | 0 | value_header |= (static_cast<uint8_t>(field_offset_size) - 1) << 2; |
746 | |
|
747 | | #ifdef DEBUG |
748 | | auto array_value_header = VariantValueMetadata::FromHeaderByte(value_header); |
749 | | D_ASSERT(array_value_header.basic_type == VariantBasicType::ARRAY); |
750 | | D_ASSERT(array_value_header.is_large == is_large); |
751 | | D_ASSERT(array_value_header.field_offset_size == field_offset_size); |
752 | | #endif |
753 | |
|
754 | 0 | *value_data = value_header; |
755 | 0 | value_data++; |
756 | | |
757 | | //! Write the 'num_elements' |
758 | 0 | if (is_large) { |
759 | 0 | Store<uint32_t>(static_cast<uint32_t>(num_elements), value_data); |
760 | 0 | value_data += sizeof(uint32_t); |
761 | 0 | } else { |
762 | 0 | Store<uint8_t>(static_cast<uint8_t>(num_elements), value_data); |
763 | 0 | value_data += sizeof(uint8_t); |
764 | 0 | } |
765 | | |
766 | | //! Write the 'field_offset' entries and the child 'value's |
767 | 0 | auto children_ptr = value_data + ((num_elements + 1) * field_offset_size); |
768 | 0 | idx_t total_offset = 0; |
769 | 0 | for (idx_t i = 0; i < nested_data.child_count; i++) { |
770 | 0 | auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx); |
771 | |
|
772 | 0 | memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size); |
773 | 0 | value_data += field_offset_size; |
774 | 0 | auto start_ptr = children_ptr; |
775 | 0 | WriteValueData(variant, row, values_index, children_ptr, offsets, offset_index, nullptr); |
776 | 0 | total_offset += (children_ptr - start_ptr); |
777 | 0 | } |
778 | 0 | memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size); |
779 | 0 | value_data += field_offset_size; |
780 | 0 | D_ASSERT(children_ptr - total_offset == value_data); |
781 | 0 | value_data = children_ptr; |
782 | 0 | } else { |
783 | 0 | WritePrimitiveValueData(variant, row, values_index, value_data, offsets, offset_index); |
784 | 0 | } |
785 | 0 | } |
786 | | |
787 | | static void CreateValues(UnifiedVariantVectorData &variant, Vector &value, optional_ptr<const SelectionVector> sel, |
788 | | optional_ptr<const SelectionVector> value_index_sel, |
789 | | optional_ptr<const SelectionVector> result_sel, |
790 | 0 | optional_ptr<ParquetVariantShreddingState> shredding_state, idx_t count) { |
791 | 0 | auto &validity = FlatVector::ValidityMutable(value); |
792 | 0 | auto value_data = FlatVector::GetDataMutable<string_t>(value); |
793 | |
|
794 | 0 | for (idx_t i = 0; i < count; i++) { |
795 | 0 | idx_t value_index = 0; |
796 | 0 | if (value_index_sel) { |
797 | 0 | value_index = value_index_sel->get_index(i); |
798 | 0 | } |
799 | |
|
800 | 0 | idx_t row = i; |
801 | 0 | if (sel) { |
802 | 0 | row = sel->get_index(i); |
803 | 0 | } |
804 | |
|
805 | 0 | idx_t result_index = i; |
806 | 0 | if (result_sel) { |
807 | 0 | result_index = result_sel->get_index(i); |
808 | 0 | } |
809 | |
|
810 | 0 | bool is_shredded = false; |
811 | 0 | if (variant.RowIsValid(row) && shredding_state && shredding_state->ValueIsShredded(variant, row, value_index)) { |
812 | 0 | shredding_state->SetShredded(row, value_index, result_index); |
813 | 0 | is_shredded = true; |
814 | 0 | if (shredding_state->type.id() != LogicalTypeId::STRUCT) { |
815 | | //! Value is shredded, directly write a NULL to the 'value' if the type is not an OBJECT |
816 | | //! When the type is OBJECT, all excess fields would still need to be written to the 'value' |
817 | 0 | validity.SetInvalid(result_index); |
818 | 0 | continue; |
819 | 0 | } |
820 | 0 | } |
821 | | |
822 | | //! The (relative) offsets for each value, in the case of nesting |
823 | 0 | vector<uint32_t> offsets; |
824 | | //! Determine the size of this 'value' blob |
825 | 0 | idx_t blob_length = AnalyzeValueData(variant, row, value_index, offsets, shredding_state); |
826 | 0 | if (!blob_length) { |
827 | | //! This is only allowed to happen for a shredded OBJECT, where there are no excess fields to write for the |
828 | | //! OBJECT |
829 | 0 | (void)is_shredded; |
830 | 0 | D_ASSERT(is_shredded); |
831 | 0 | validity.SetInvalid(result_index); |
832 | 0 | continue; |
833 | 0 | } |
834 | 0 | value_data[result_index] = StringVector::EmptyString(value, blob_length); |
835 | 0 | auto &value_blob = value_data[result_index]; |
836 | 0 | auto value_blob_data = reinterpret_cast<data_ptr_t>(value_blob.GetDataWriteable()); |
837 | |
|
838 | 0 | idx_t offset_index = 0; |
839 | 0 | WriteValueData(variant, row, value_index, value_blob_data, offsets, offset_index, shredding_state); |
840 | 0 | D_ASSERT(data_ptr_cast(value_blob.GetDataWriteable() + blob_length) == value_blob_data); |
841 | 0 | value_blob.SetSizeAndFinalize(blob_length, blob_length); |
842 | 0 | } |
843 | 0 | } |
844 | | |
845 | 0 | void ParquetVariantShredding::WriteMissingField(Vector &vector, idx_t index) { |
846 | | //! The field is missing, set it to null |
847 | 0 | FlatVector::SetNull(vector, index, true); |
848 | 0 | } |
849 | | |
850 | | void ParquetVariantShredding::WriteVariantValues(UnifiedVariantVectorData &variant, Vector &result, |
851 | | optional_ptr<const SelectionVector> sel, |
852 | | optional_ptr<const SelectionVector> value_index_sel, |
853 | 0 | optional_ptr<const SelectionVector> result_sel, idx_t count) { |
854 | 0 | optional_ptr<Vector> value; |
855 | 0 | optional_ptr<Vector> typed_value; |
856 | |
|
857 | 0 | auto &result_type = result.GetType(); |
858 | 0 | D_ASSERT(result_type.id() == LogicalTypeId::STRUCT); |
859 | 0 | auto &child_types = StructType::GetChildTypes(result_type); |
860 | 0 | auto &child_vectors = StructVector::GetEntries(result); |
861 | 0 | D_ASSERT(child_types.size() == child_vectors.size()); |
862 | 0 | for (idx_t i = 0; i < child_types.size(); i++) { |
863 | 0 | auto &name = child_types[i].first; |
864 | 0 | if (name == "value") { |
865 | 0 | value = &child_vectors[i]; |
866 | 0 | } else if (name == "typed_value") { |
867 | 0 | typed_value = &child_vectors[i]; |
868 | 0 | } |
869 | 0 | } |
870 | |
|
871 | 0 | if (typed_value) { |
872 | 0 | ParquetVariantShreddingState shredding_state(typed_value->GetType(), count); |
873 | 0 | CreateValues(variant, *value, sel, value_index_sel, result_sel, &shredding_state, count); |
874 | |
|
875 | 0 | SelectionVector null_values; |
876 | 0 | if (shredding_state.count) { |
877 | 0 | WriteTypedValues(variant, *typed_value, shredding_state.shredded_sel, shredding_state.values_index_sel, |
878 | 0 | shredding_state.result_sel, shredding_state.count); |
879 | | //! 'shredding_state.result_sel' will always be a subset of 'result_sel', set the rows not in the subset to |
880 | | //! NULL |
881 | 0 | idx_t sel_idx = 0; |
882 | 0 | for (idx_t i = 0; i < count; i++) { |
883 | 0 | auto original_index = result_sel ? result_sel->get_index(i) : i; |
884 | 0 | if (sel_idx < shredding_state.count && shredding_state.result_sel[sel_idx] == original_index) { |
885 | 0 | sel_idx++; |
886 | 0 | continue; |
887 | 0 | } |
888 | 0 | FlatVector::SetNull(*typed_value, original_index, true); |
889 | 0 | } |
890 | 0 | } else { |
891 | | //! Set all rows of the typed_value to NULL, nothing is shredded on |
892 | 0 | for (idx_t i = 0; i < count; i++) { |
893 | 0 | FlatVector::SetNull(*typed_value, result_sel ? result_sel->get_index(i) : i, true); |
894 | 0 | } |
895 | 0 | } |
896 | 0 | } else { |
897 | 0 | CreateValues(variant, *value, sel, value_index_sel, result_sel, nullptr, count); |
898 | 0 | } |
899 | 0 | } |
900 | | |
901 | 0 | static void ToParquetVariant(DataChunk &input, ExpressionState &state, Vector &result) { |
902 | | // DuckDB Variant: |
903 | | // - keys = VARCHAR[] |
904 | | // - children = STRUCT(keys_index UINTEGER, values_index UINTEGER)[] |
905 | | // - values = STRUCT(type_id UTINYINT, byte_offset UINTEGER)[] |
906 | | // - data = BLOB |
907 | | |
908 | | // Parquet VARIANT: |
909 | | // - metadata = BLOB |
910 | | // - value = BLOB |
911 | |
|
912 | 0 | const auto &variant_vec = input.data[0]; |
913 | 0 | auto count = input.size(); |
914 | |
|
915 | 0 | RecursiveUnifiedVectorFormat recursive_format; |
916 | 0 | Vector::RecursiveToUnifiedFormat(variant_vec, recursive_format); |
917 | 0 | UnifiedVariantVectorData variant(recursive_format); |
918 | |
|
919 | 0 | auto &result_vectors = StructVector::GetEntries(result); |
920 | 0 | auto &metadata = result_vectors[0]; |
921 | 0 | CreateMetadata(variant, metadata, count); |
922 | |
|
923 | 0 | ParquetVariantShredding shredding; |
924 | 0 | shredding.WriteVariantValues(variant, result, nullptr, nullptr, nullptr, count); |
925 | 0 | } |
926 | | |
927 | 0 | idx_t VariantColumnWriter::FinalizeSchema(vector<duckdb_parquet::SchemaElement> &schemas) { |
928 | 0 | idx_t schema_idx = schemas.size(); |
929 | |
|
930 | 0 | auto &schema = Schema(); |
931 | 0 | schema.SetSchemaIndex(schema_idx); |
932 | |
|
933 | 0 | auto &repetition_type = schema.repetition_type; |
934 | 0 | auto &name = schema.name; |
935 | 0 | auto &field_id = schema.field_id; |
936 | | |
937 | | // variant group |
938 | 0 | duckdb_parquet::SchemaElement top_element; |
939 | 0 | top_element.repetition_type = repetition_type; |
940 | 0 | top_element.num_children = NumericCast<int32_t>(child_writers.size()); |
941 | 0 | top_element.logicalType.__isset.VARIANT = true; |
942 | 0 | top_element.logicalType.VARIANT.__isset.specification_version = true; |
943 | 0 | top_element.logicalType.VARIANT.specification_version = 1; |
944 | 0 | top_element.__isset.logicalType = true; |
945 | 0 | top_element.__isset.num_children = true; |
946 | 0 | top_element.__isset.repetition_type = true; |
947 | 0 | top_element.name = name; |
948 | 0 | if (field_id.IsValid()) { |
949 | 0 | top_element.__isset.field_id = true; |
950 | 0 | top_element.field_id = NumericCast<int32_t>(field_id.GetIndex()); |
951 | 0 | } |
952 | 0 | schemas.push_back(std::move(top_element)); |
953 | |
|
954 | 0 | idx_t unique_columns = 0; |
955 | 0 | for (auto &child_writer : child_writers) { |
956 | 0 | unique_columns += child_writer->FinalizeSchema(schemas); |
957 | 0 | } |
958 | 0 | return unique_columns; |
959 | 0 | } |
960 | | |
961 | 0 | LogicalType VariantColumnWriter::TransformTypedValueRecursive(const LogicalType &type) { |
962 | 0 | switch (type.id()) { |
963 | 0 | case LogicalTypeId::STRUCT: { |
964 | | //! Wrap all fields of the struct in a struct with 'value' and 'typed_value' fields |
965 | 0 | auto &child_types = StructType::GetChildTypes(type); |
966 | 0 | child_list_t<LogicalType> replaced_types; |
967 | 0 | for (auto &entry : child_types) { |
968 | 0 | child_list_t<LogicalType> child_children; |
969 | 0 | child_children.emplace_back("value", LogicalType::BLOB); |
970 | 0 | if (entry.second.id() != LogicalTypeId::VARIANT) { |
971 | 0 | child_children.emplace_back("typed_value", TransformTypedValueRecursive(entry.second)); |
972 | 0 | } |
973 | 0 | replaced_types.emplace_back(entry.first, LogicalType::STRUCT(child_children)); |
974 | 0 | } |
975 | 0 | return LogicalType::STRUCT(replaced_types); |
976 | 0 | } |
977 | 0 | case LogicalTypeId::LIST: { |
978 | 0 | auto &child_type = ListType::GetChildType(type); |
979 | 0 | child_list_t<LogicalType> replaced_types; |
980 | 0 | replaced_types.emplace_back("value", LogicalType::BLOB); |
981 | 0 | if (child_type.id() != LogicalTypeId::VARIANT) { |
982 | 0 | replaced_types.emplace_back("typed_value", TransformTypedValueRecursive(child_type)); |
983 | 0 | } |
984 | 0 | return LogicalType::LIST(LogicalType::STRUCT(replaced_types)); |
985 | 0 | } |
986 | 0 | case LogicalTypeId::UNION: |
987 | 0 | case LogicalTypeId::MAP: |
988 | 0 | case LogicalTypeId::VARIANT: |
989 | 0 | case LogicalTypeId::ARRAY: |
990 | 0 | throw BinderException("'%s' can't appear inside a 'typed_value' shredded type!", type.ToString()); |
991 | 0 | default: |
992 | 0 | return type; |
993 | 0 | } |
994 | 0 | } |
995 | | |
996 | 0 | static LogicalType GetParquetVariantType(optional_ptr<LogicalType> shredding = nullptr) { |
997 | 0 | child_list_t<LogicalType> children; |
998 | 0 | children.emplace_back("metadata", LogicalType::BLOB); |
999 | 0 | children.emplace_back("value", LogicalType::BLOB); |
1000 | 0 | if (shredding && shredding->id() != LogicalTypeId::VARIANT) { |
1001 | 0 | children.emplace_back("typed_value", VariantColumnWriter::TransformTypedValueRecursive(*shredding)); |
1002 | 0 | } |
1003 | 0 | auto res = LogicalType::STRUCT(std::move(children)); |
1004 | 0 | res.SetAlias("PARQUET_VARIANT"); |
1005 | 0 | return res; |
1006 | 0 | } |
1007 | | |
1008 | 0 | static unique_ptr<FunctionData> BindTransform(BindScalarFunctionInput &input) { |
1009 | 0 | auto &context = input.GetClientContext(); |
1010 | 0 | auto &bound_function = input.GetBoundFunction(); |
1011 | 0 | auto &arguments = input.GetArguments(); |
1012 | 0 | if (arguments.empty()) { |
1013 | 0 | return nullptr; |
1014 | 0 | } |
1015 | 0 | auto type = ExpressionBinder::GetExpressionReturnType(*arguments[0]); |
1016 | |
|
1017 | 0 | if (arguments.size() == 2) { |
1018 | 0 | auto &shredding = *arguments[1]; |
1019 | 0 | auto expr_return_type = ExpressionBinder::GetExpressionReturnType(shredding); |
1020 | 0 | expr_return_type = LogicalType::NormalizeType(expr_return_type); |
1021 | 0 | if (expr_return_type.id() != LogicalTypeId::VARCHAR) { |
1022 | 0 | throw BinderException("Optional second argument 'shredding' has to be of type VARCHAR, i.e: " |
1023 | 0 | "'STRUCT(my_field BOOLEAN)', found type: '%s' instead", |
1024 | 0 | expr_return_type); |
1025 | 0 | } |
1026 | 0 | if (!shredding.IsFoldable()) { |
1027 | 0 | throw BinderException("Optional second argument 'shredding' has to be a constant expression"); |
1028 | 0 | } |
1029 | 0 | Value type_str = ExpressionExecutor::EvaluateScalar(context, shredding); |
1030 | 0 | if (type_str.IsNull()) { |
1031 | 0 | throw BinderException("Optional second argument 'shredding' can not be NULL"); |
1032 | 0 | } |
1033 | 0 | auto shredded_type = TransformStringToLogicalType(type_str.GetValue<string>(), context); |
1034 | 0 | bound_function.SetReturnType(GetParquetVariantType(shredded_type)); |
1035 | 0 | } else { |
1036 | 0 | bound_function.SetReturnType(GetParquetVariantType()); |
1037 | 0 | } |
1038 | | |
1039 | 0 | return nullptr; |
1040 | 0 | } |
1041 | | |
1042 | 6.58k | ScalarFunction VariantColumnWriter::GetTransformFunction() { |
1043 | 6.58k | ScalarFunction transform("variant_to_parquet_variant", {LogicalType::VARIANT()}, LogicalType::ANY, ToParquetVariant, |
1044 | 6.58k | BindTransform); |
1045 | 6.58k | transform.SetNullHandling(FunctionNullHandling::SPECIAL_HANDLING); |
1046 | 6.58k | return transform; |
1047 | 6.58k | } |
1048 | | |
1049 | | } // namespace duckdb |