Coverage Report

Created: 2026-06-30 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/extension/parquet/writer/variant/convert_variant.cpp
Line
Count
Source
1
#include <stdint.h>
2
#include <string.h>
3
#include <string>
4
#include <unordered_set>
5
#include <utility>
6
#include <vector>
7
8
#include "duckdb/common/vector/struct_vector.hpp"
9
#include "writer/variant_column_writer.hpp"
10
#include "duckdb/common/types/variant.hpp"
11
#include "duckdb/function/scalar/variant_utils.hpp"
12
#include "reader/variant/variant_binary_decoder.hpp"
13
#include "duckdb/function/variant/variant_shredding.hpp"
14
#include "duckdb/planner/expression_binder.hpp"
15
#include "duckdb/execution/expression_executor.hpp"
16
#include "duckdb/common/operator/cast_operators.hpp"
17
#include "column_writer.hpp"
18
#include "duckdb/common/assert.hpp"
19
#include "duckdb/common/enum_util.hpp"
20
#include "duckdb/common/exception.hpp"
21
#include "duckdb/common/exception/binder_exception.hpp"
22
#include "duckdb/common/helper.hpp"
23
#include "duckdb/common/hugeint.hpp"
24
#include "duckdb/common/limits.hpp"
25
#include "duckdb/common/numeric_utils.hpp"
26
#include "duckdb/common/optional_idx.hpp"
27
#include "duckdb/common/optional_ptr.hpp"
28
#include "duckdb/common/string.hpp"
29
#include "duckdb/common/string_map_set.hpp"
30
#include "duckdb/common/typedefs.hpp"
31
#include "duckdb/common/types.hpp"
32
#include "duckdb/common/types/data_chunk.hpp"
33
#include "duckdb/common/types/selection_vector.hpp"
34
#include "duckdb/common/types/string_type.hpp"
35
#include "duckdb/common/types/uuid.hpp"
36
#include "duckdb/common/types/validity_mask.hpp"
37
#include "duckdb/common/types/value.hpp"
38
#include "duckdb/common/types/vector.hpp"
39
#include "duckdb/common/uhugeint.hpp"
40
#include "duckdb/common/unique_ptr.hpp"
41
#include "duckdb/common/vector.hpp"
42
#include "duckdb/common/vector/flat_vector.hpp"
43
#include "duckdb/common/vector/string_vector.hpp"
44
#include "duckdb/common/vector/unified_vector_format.hpp"
45
#include "duckdb/function/function.hpp"
46
#include "duckdb/function/scalar_function.hpp"
47
#include "duckdb/planner/expression.hpp"
48
#include "parquet_column_schema.hpp"
49
#include "parquet_types.h"
50
51
namespace duckdb {
52
class ClientContext;
53
struct ExpressionState;
54
55
0
//! Returns the minimum number of bytes needed to store 'value' as an unsigned integer.
//! The result is always at least 1 (a value of 0 still occupies one byte) and at most sizeof(idx_t).
//! The length is derived arithmetically instead of by inspecting the in-memory bytes of 'value',
//! so the result does not depend on the byte order of the host.
static idx_t CalculateByteLength(idx_t value) {
  idx_t byte_length = 1;
  //! Every iteration discards the current least significant byte;
  //! as long as something is left over, one more byte is required
  for (idx_t remaining = value >> 8; remaining != 0; remaining >>= 8) {
    byte_length++;
  }
  return byte_length;
}
67
68
0
//! Assembles the single header byte that starts a variant metadata blob.
//! 'byte_length' is the width (in bytes) of the offsets stored in the metadata, it is encoded as 'byte_length - 1'.
static uint8_t EncodeMetadataHeader(idx_t byte_length) {
  D_ASSERT(byte_length <= 4);

  //! 'version' is 1, placed in the lowest bits
  const auto version_bits = static_cast<uint8_t>(1);
  //! 'sorted_strings' is 1, placed at bit 4
  const auto sorted_strings_bits = static_cast<uint8_t>(1) << 4;
  //! 'offset_size_minus_one' is byte_length-1, placed at bit 6 and up
  const auto offset_size_bits = (static_cast<uint8_t>(byte_length) - 1) << 6;

  const auto header_byte = static_cast<uint8_t>(version_bits | sorted_strings_bits | offset_size_bits);

#ifdef DEBUG
  auto decoded_header = VariantMetadataHeader::FromHeaderByte(header_byte);
  D_ASSERT(decoded_header.offset_size == byte_length);
#endif

  return header_byte;
}
86
87
0
//! Serializes the key dictionary of the first 'count' rows of 'variant' into the 'metadata' vector.
//!
//! Layout written for every row:
//!   <header byte> <dictionary count> <dictionary count + 1 offsets> <key bytes>
//! The dictionary count and every offset are 'byte_length' bytes wide. Rows that are not valid get an empty
//! dictionary (count 0, a single offset of 0, no key bytes).
//!
//! Throws an InvalidInputException when the dictionary can't be described with 4 byte values.
static void CreateMetadata(UnifiedVariantVectorData &variant, Vector &metadata, idx_t count) {
  //! NOTE: the parquet variant is limited to a max dictionary size of NumericLimits<uint32_t>::Maximum()
  //! Whereas we can have NumericLimits<uint32_t>::Maximum() *per* string in DuckDB
  auto metadata_data = FlatVector::GetDataMutable<string_t>(metadata);
  for (idx_t row = 0; row < count; row++) {
    uint64_t dictionary_count = 0;
    if (variant.RowIsValid(row)) {
      dictionary_count = variant.GetKeysCount(row);
    }
    //! Total amount of bytes taken up by all keys of this row
    idx_t dictionary_size = 0;
    for (idx_t i = 0; i < dictionary_count; i++) {
      auto &key = variant.GetKey(row, i);
      dictionary_size += key.GetSize();
    }
    if (dictionary_size >= NumericLimits<uint32_t>::Maximum()) {
      throw InvalidInputException("The total length of the dictionary exceeds a 4 byte value (uint32_t), failed "
                                  "to export VARIANT to Parquet");
    }
    if (dictionary_count >= NumericLimits<uint32_t>::Maximum()) {
      throw InvalidInputException("The number of keys in the dictionary exceeds a 4 byte value (uint32_t), failed "
                                  "to export VARIANT to Parquet");
    }

    //! The same width is used for the dictionary count and for every offset, so it has to fit the largest of them.
    //! The largest offset is 'dictionary_size', but the count can exceed it (e.g. when an empty key is present)
    const idx_t widest_value = dictionary_count > dictionary_size ? dictionary_count : dictionary_size;
    auto byte_length = CalculateByteLength(widest_value);
    //! header byte + (count field + 'dictionary_count + 1' offsets) + key bytes
    auto total_length = 1 + (byte_length * (dictionary_count + 2)) + dictionary_size;

    metadata_data[row] = StringVector::EmptyString(metadata, total_length);
    auto &metadata_blob = metadata_data[row];
    auto metadata_blob_data = metadata_blob.GetDataWriteable();

    metadata_blob_data[0] = static_cast<char>(EncodeMetadataHeader(byte_length));
    //! Copies the first 'byte_length' bytes of the in-memory integer (the low-order bytes on a little-endian host)
    memcpy(metadata_blob_data + 1, const_data_ptr_cast(&dictionary_count), byte_length);

    auto offset_ptr = metadata_blob_data + 1 + byte_length;
    auto string_ptr =
        metadata_blob_data + 1 + byte_length + (NumericCast<idx_t>(dictionary_count + 1) * byte_length);
    idx_t total_offset = 0;
    for (idx_t i = 0; i < dictionary_count; i++) {
      //! Offset 'i' is the position of key 'i' relative to the start of the key bytes
      memcpy(offset_ptr + (i * byte_length), const_data_ptr_cast(&total_offset), byte_length);
      auto &key = variant.GetKey(row, i);

      memcpy(string_ptr + total_offset, key.GetData(), key.GetSize());
      total_offset += key.GetSize();
    }
    //! The final offset marks the end of the last key
    memcpy(offset_ptr + (NumericCast<idx_t>(dictionary_count) * byte_length), const_data_ptr_cast(&total_offset),
           byte_length);
    D_ASSERT(offset_ptr + (NumericCast<idx_t>(dictionary_count + 1) * byte_length) == string_ptr);
    D_ASSERT(string_ptr + total_offset == metadata_blob_data + total_length);
    metadata_blob.SetSizeAndFinalize(total_length, total_length);

#ifdef DEBUG
    auto decoded_metadata = VariantMetadata(metadata_blob);
    D_ASSERT(decoded_metadata.strings.size() == dictionary_count);
    for (idx_t i = 0; i < dictionary_count; i++) {
      D_ASSERT(decoded_metadata.strings[i] == variant.GetKey(row, i).GetString());
    }
#endif
  }
}
142
143
namespace {
144
145
0
//! Translates the logical type of a shredded column into the set of VARIANT types that can be stored in it.
//! ANY yields an empty set; a type without a VARIANT counterpart results in a BinderException.
static unordered_set<VariantLogicalType> GetVariantType(const LogicalType &type) {
  switch (type.id()) {
  case LogicalTypeId::ANY:
    return {};

  //! Nested types
  case LogicalTypeId::STRUCT:
    return {VariantLogicalType::OBJECT};
  case LogicalTypeId::LIST:
    return {VariantLogicalType::ARRAY};

  //! Booleans are represented by two separate VARIANT types
  case LogicalTypeId::BOOLEAN:
    return {VariantLogicalType::BOOL_TRUE, VariantLogicalType::BOOL_FALSE};

  //! Numerics
  case LogicalTypeId::TINYINT:
    return {VariantLogicalType::INT8};
  case LogicalTypeId::SMALLINT:
    return {VariantLogicalType::INT16};
  case LogicalTypeId::INTEGER:
    return {VariantLogicalType::INT32};
  case LogicalTypeId::BIGINT:
    return {VariantLogicalType::INT64};
  case LogicalTypeId::FLOAT:
    return {VariantLogicalType::FLOAT};
  case LogicalTypeId::DOUBLE:
    return {VariantLogicalType::DOUBLE};
  case LogicalTypeId::DECIMAL:
    return {VariantLogicalType::DECIMAL};

  //! Temporal types
  case LogicalTypeId::DATE:
    return {VariantLogicalType::DATE};
  case LogicalTypeId::TIME:
    return {VariantLogicalType::TIME_MICROS};
  case LogicalTypeId::TIMESTAMP_TZ:
    return {VariantLogicalType::TIMESTAMP_MICROS_TZ};
  case LogicalTypeId::TIMESTAMP_TZ_NS:
    return {VariantLogicalType::TIMESTAMP_NANOS_TZ};
  case LogicalTypeId::TIMESTAMP:
    return {VariantLogicalType::TIMESTAMP_MICROS};
  case LogicalTypeId::TIMESTAMP_NS:
    return {VariantLogicalType::TIMESTAMP_NANOS};

  //! Remaining scalar types
  case LogicalTypeId::BLOB:
    return {VariantLogicalType::BLOB};
  case LogicalTypeId::VARCHAR:
    return {VariantLogicalType::VARCHAR};
  case LogicalTypeId::UUID:
    return {VariantLogicalType::UUID};

  default:
    throw BinderException("Type '%s' can't be translated to a VARIANT type", type.ToString());
  }
}
192
193
struct ParquetVariantShreddingState : public VariantShreddingState {
194
public:
195
  ParquetVariantShreddingState(const LogicalType &type, idx_t total_count)
196
0
      : VariantShreddingState(type, total_count), variant_types(GetVariantType(type)) {
197
0
  }
198
199
public:
200
0
  const unordered_set<VariantLogicalType> &GetVariantTypes() override {
201
0
    return variant_types;
202
0
  }
203
204
private:
205
  unordered_set<VariantLogicalType> variant_types;
206
};
207
208
struct ParquetVariantShredding : public VariantShredding {
209
0
  ParquetVariantShredding() {
210
    // for parquet untyped ("value") comes before typed ("typed_value")
211
0
    untyped_value_index = 0;
212
0
    typed_value_index = 1;
213
0
  }
214
215
  void WriteVariantValues(UnifiedVariantVectorData &variant, Vector &result, optional_ptr<const SelectionVector> sel,
216
                          optional_ptr<const SelectionVector> value_index_sel,
217
                          optional_ptr<const SelectionVector> result_sel, idx_t count) override;
218
219
protected:
220
  void WriteMissingField(Vector &vector, idx_t index) override;
221
};
222
223
} // namespace
224
225
//! Collects the indices (relative to 'nested_data.children_idx') of the children of an object that have to be
//! written to the untyped value.
//! Without a shredding state, or when the shredded type is not a STRUCT, that is every child.
//! Otherwise the children whose key is one of the shredded fields are left out.
vector<idx_t> GetChildIndices(const UnifiedVariantVectorData &variant, idx_t row, const VariantNestedData &nested_data,
                              optional_ptr<ParquetVariantShreddingState> shredding_state) {
  vector<idx_t> kept_children;

  const bool shredded_on_struct = shredding_state && shredding_state->type.id() == LogicalTypeId::STRUCT;
  if (!shredded_on_struct) {
    //! Nothing can be omitted, keep all children
    for (idx_t child = 0; child < nested_data.child_count; child++) {
      kept_children.push_back(child);
    }
    return kept_children;
  }

  //! FIXME: The variant spec says that field names should be case-sensitive, not insensitive
  case_insensitive_string_set_t shredded_fields = shredding_state->ObjectFields();

  for (idx_t child = 0; child < nested_data.child_count; child++) {
    auto keys_index = variant.GetKeysIndex(row, child + nested_data.children_idx);
    auto &key = variant.GetKey(row, keys_index);

    //! Fields that are shredded on are omitted from the value, only the others are kept
    if (!shredded_fields.count(key)) {
      kept_children.push_back(child);
    }
  }
  return kept_children;
}
249
250
static idx_t AnalyzeValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index,
251
0
                              vector<uint32_t> &offsets, optional_ptr<ParquetVariantShreddingState> shredding_state) {
252
0
  idx_t total_size = 0;
253
  //! Every value has at least a value header
254
0
  total_size++;
255
256
0
  idx_t offset_size = offsets.size();
257
0
  VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL;
258
0
  if (variant.RowIsValid(row)) {
259
0
    type_id = variant.GetTypeId(row, values_index);
260
0
  }
261
0
  switch (type_id) {
262
0
  case VariantLogicalType::OBJECT: {
263
0
    auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);
264
265
    //! Calculate value and key offsets for all children
266
0
    idx_t total_offset = 0;
267
0
    uint32_t highest_keys_index = 0;
268
269
0
    auto child_indices = GetChildIndices(variant, row, nested_data, shredding_state);
270
0
    if (nested_data.child_count && child_indices.empty()) {
271
      //! All fields of the object are shredded, omit the object entirely
272
0
      return 0;
273
0
    }
274
275
0
    auto num_elements = child_indices.size();
276
0
    offsets.resize(offset_size + num_elements + 1);
277
278
0
    for (idx_t entry = 0; entry < child_indices.size(); entry++) {
279
0
      auto i = child_indices[entry];
280
0
      auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
281
0
      auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);
282
0
      offsets[offset_size + entry] = total_offset;
283
284
0
      total_offset += AnalyzeValueData(variant, row, values_index, offsets, nullptr);
285
0
      highest_keys_index = MaxValue(highest_keys_index, keys_index);
286
0
    }
287
0
    offsets[offset_size + num_elements] = total_offset;
288
289
    //! Calculate the sizes for the objects value data
290
0
    auto field_id_size = CalculateByteLength(highest_keys_index);
291
0
    auto field_offset_size = CalculateByteLength(total_offset);
292
0
    const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();
293
294
    //! Now add the sizes for the objects value data
295
0
    if (is_large) {
296
0
      total_size += sizeof(uint32_t);
297
0
    } else {
298
0
      total_size += sizeof(uint8_t);
299
0
    }
300
0
    total_size += num_elements * field_id_size;
301
0
    total_size += (num_elements + 1) * field_offset_size;
302
0
    total_size += total_offset;
303
0
    break;
304
0
  }
305
0
  case VariantLogicalType::ARRAY: {
306
0
    auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);
307
308
0
    idx_t total_offset = 0;
309
0
    offsets.resize(offset_size + nested_data.child_count + 1);
310
0
    for (idx_t i = 0; i < nested_data.child_count; i++) {
311
0
      auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);
312
0
      offsets[offset_size + i] = total_offset;
313
314
0
      total_offset += AnalyzeValueData(variant, row, values_index, offsets, nullptr);
315
0
    }
316
0
    offsets[offset_size + nested_data.child_count] = total_offset;
317
318
0
    auto field_offset_size = CalculateByteLength(total_offset);
319
0
    auto num_elements = nested_data.child_count;
320
0
    const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();
321
322
0
    if (is_large) {
323
0
      total_size += sizeof(uint32_t);
324
0
    } else {
325
0
      total_size += sizeof(uint8_t);
326
0
    }
327
0
    total_size += (num_elements + 1) * field_offset_size;
328
0
    total_size += total_offset;
329
0
    break;
330
0
  }
331
0
  case VariantLogicalType::BLOB:
332
0
  case VariantLogicalType::VARCHAR: {
333
0
    auto string_value = VariantUtils::DecodeStringData(variant, row, values_index);
334
0
    total_size += string_value.GetSize();
335
0
    if (type_id == VariantLogicalType::BLOB || string_value.GetSize() > 64) {
336
      //! Save as regular string value
337
0
      total_size += sizeof(uint32_t);
338
0
    }
339
0
    break;
340
0
  }
341
0
  case VariantLogicalType::VARIANT_NULL:
342
0
  case VariantLogicalType::BOOL_TRUE:
343
0
  case VariantLogicalType::BOOL_FALSE:
344
0
    break;
345
0
  case VariantLogicalType::INT8:
346
0
    total_size += sizeof(uint8_t);
347
0
    break;
348
0
  case VariantLogicalType::INT16:
349
0
    total_size += sizeof(uint16_t);
350
0
    break;
351
0
  case VariantLogicalType::INT32:
352
0
    total_size += sizeof(uint32_t);
353
0
    break;
354
0
  case VariantLogicalType::INT64:
355
0
    total_size += sizeof(uint64_t);
356
0
    break;
357
0
  case VariantLogicalType::FLOAT:
358
0
    total_size += sizeof(float);
359
0
    break;
360
0
  case VariantLogicalType::DOUBLE:
361
0
    total_size += sizeof(double);
362
0
    break;
363
0
  case VariantLogicalType::DECIMAL: {
364
0
    auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index);
365
0
    total_size += 1;
366
0
    if (decimal_data.width <= 9) {
367
0
      total_size += sizeof(int32_t);
368
0
    } else if (decimal_data.width <= 18) {
369
0
      total_size += sizeof(int64_t);
370
0
    } else if (decimal_data.width <= 38) {
371
0
      total_size += sizeof(uhugeint_t);
372
0
    } else {
373
0
      throw InvalidInputException("Can't convert VARIANT DECIMAL(%d, %d) to Parquet VARIANT", decimal_data.width,
374
0
                                  decimal_data.scale);
375
0
    }
376
0
    break;
377
0
  }
378
0
  case VariantLogicalType::UUID:
379
0
    total_size += sizeof(uhugeint_t);
380
0
    break;
381
0
  case VariantLogicalType::DATE:
382
0
    total_size += sizeof(uint32_t);
383
0
    break;
384
0
  case VariantLogicalType::TIME_MICROS:
385
0
  case VariantLogicalType::TIMESTAMP_MICROS:
386
0
  case VariantLogicalType::TIMESTAMP_NANOS:
387
0
  case VariantLogicalType::TIMESTAMP_MICROS_TZ:
388
0
    total_size += sizeof(uint64_t);
389
0
    break;
390
0
  case VariantLogicalType::UINT8:
391
    // store as int16_t
392
0
    total_size += sizeof(int16_t);
393
0
    break;
394
0
  case VariantLogicalType::UINT16:
395
    // store as int32_t
396
0
    total_size += sizeof(int32_t);
397
0
    break;
398
0
  case VariantLogicalType::UINT32:
399
0
  case VariantLogicalType::UINT64:
400
0
  case VariantLogicalType::UINT128:
401
0
  case VariantLogicalType::INT128:
402
    // try to store as int64_t - fail if it doesn't fit
403
0
    total_size += sizeof(int64_t);
404
0
    break;
405
0
  case VariantLogicalType::INTERVAL:
406
0
  case VariantLogicalType::BIGNUM:
407
0
  case VariantLogicalType::BITSTRING:
408
0
  case VariantLogicalType::TIMESTAMP_MILIS:
409
0
  case VariantLogicalType::TIMESTAMP_SEC:
410
0
  case VariantLogicalType::TIME_MICROS_TZ:
411
0
  case VariantLogicalType::TIME_NANOS:
412
0
  default:
413
0
    throw InvalidInputException("Can't convert VARIANT of type '%s' to Parquet VARIANT",
414
0
                                EnumUtil::ToString(type_id));
415
0
  }
416
417
0
  return total_size;
418
0
}
419
420
template <VariantPrimitiveType TYPE_ID>
421
0
void WritePrimitiveTypeHeader(data_ptr_t &value_data) {
422
0
  uint8_t value_header = 0;
423
0
  value_header |= static_cast<uint8_t>(VariantBasicType::PRIMITIVE);
424
0
  value_header |= static_cast<uint8_t>(TYPE_ID) << 2;
425
426
0
  *value_data = value_header;
427
0
  value_data++;
428
0
}
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)15>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)16>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)0>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)1>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)2>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)3>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)4>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)5>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)6>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)14>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)7>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)20>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)11>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)17>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)13>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)19>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)12>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)8>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)9>(unsigned char*&)
Unexecuted instantiation: void duckdb::WritePrimitiveTypeHeader<(duckdb::VariantPrimitiveType)10>(unsigned char*&)
429
430
struct VariantSimpleCopy {
431
  template <class T>
432
0
  static void CopyValue(const_data_ptr_t source, data_ptr_t target) {
433
0
    memcpy(target, source, sizeof(T));
434
0
  }
Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<signed char>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<short>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<int>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<long>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<float>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleCopy::CopyValue<double>(unsigned char const*, unsigned char*)
435
};
436
437
template <class SRC>
438
struct VariantSimpleConversion {
439
  template <class T>
440
0
  static void CopyValue(const_data_ptr_t source, data_ptr_t target) {
441
0
    auto src = Load<SRC>(source);
442
0
    Store(static_cast<T>(src), target);
443
0
  }
Unexecuted instantiation: void duckdb::VariantSimpleConversion<unsigned char>::CopyValue<short>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleConversion<unsigned short>::CopyValue<int>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantSimpleConversion<unsigned int>::CopyValue<long>(unsigned char const*, unsigned char*)
444
};
445
446
template <class SRC>
447
struct VariantTryConvert {
448
  template <class T>
449
0
  static void CopyValue(const_data_ptr_t source, data_ptr_t target) {
450
0
    auto src = Load<SRC>(source);
451
0
    Store(Cast::Operation<SRC, T>(src), target);
452
0
  }
Unexecuted instantiation: void duckdb::VariantTryConvert<unsigned long>::CopyValue<long>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantTryConvert<duckdb::uhugeint_t>::CopyValue<long>(unsigned char const*, unsigned char*)
Unexecuted instantiation: void duckdb::VariantTryConvert<duckdb::hugeint_t>::CopyValue<long>(unsigned char const*, unsigned char*)
453
};
454
455
template <class T, class OP = VariantSimpleCopy>
456
void CopySimplePrimitiveData(const UnifiedVariantVectorData &variant, data_ptr_t &value_data, idx_t row,
457
0
                             uint32_t values_index) {
458
0
  auto byte_offset = variant.GetByteOffset(row, values_index);
459
0
  auto data = const_data_ptr_cast(variant.GetData(row).GetData());
460
0
  auto ptr = data + byte_offset;
461
0
  OP::template CopyValue<T>(ptr, value_data);
462
0
  value_data += sizeof(T);
463
0
}
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<signed char, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<short, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<int, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<float, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<double, duckdb::VariantSimpleCopy>(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<short, duckdb::VariantSimpleConversion<unsigned char> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<int, duckdb::VariantSimpleConversion<unsigned short> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantSimpleConversion<unsigned int> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantTryConvert<unsigned long> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantTryConvert<duckdb::uhugeint_t> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
Unexecuted instantiation: void duckdb::CopySimplePrimitiveData<long, duckdb::VariantTryConvert<duckdb::hugeint_t> >(duckdb::UnifiedVariantVectorData const&, unsigned char*&, unsigned long, unsigned int)
464
465
0
void CopyUUIDData(const UnifiedVariantVectorData &variant, data_ptr_t &value_data, idx_t row, uint32_t values_index) {
466
0
  auto byte_offset = variant.GetByteOffset(row, values_index);
467
0
  auto data = const_data_ptr_cast(variant.GetData(row).GetData());
468
0
  auto ptr = data + byte_offset;
469
470
0
  auto uuid = Load<uhugeint_t>(ptr);
471
0
  BaseUUID::ToBlob(uuid, value_data);
472
0
  value_data += sizeof(uhugeint_t);
473
0
}
474
475
static void WritePrimitiveValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index,
476
0
                                    data_ptr_t &value_data, const vector<uint32_t> &offsets, idx_t &offset_index) {
477
0
  VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL;
478
0
  if (variant.RowIsValid(row)) {
479
0
    type_id = variant.GetTypeId(row, values_index);
480
0
  }
481
482
0
  D_ASSERT(type_id != VariantLogicalType::OBJECT && type_id != VariantLogicalType::ARRAY);
483
0
  switch (type_id) {
484
0
  case VariantLogicalType::BLOB:
485
0
  case VariantLogicalType::VARCHAR: {
486
0
    auto string_value = VariantUtils::DecodeStringData(variant, row, values_index);
487
0
    auto string_size = string_value.GetSize();
488
0
    if (type_id == VariantLogicalType::BLOB || string_size > 64) {
489
0
      if (type_id == VariantLogicalType::BLOB) {
490
0
        WritePrimitiveTypeHeader<VariantPrimitiveType::BINARY>(value_data);
491
0
      } else {
492
0
        WritePrimitiveTypeHeader<VariantPrimitiveType::STRING>(value_data);
493
0
      }
494
0
      Store<uint32_t>(string_size, value_data);
495
0
      value_data += sizeof(uint32_t);
496
0
    } else {
497
0
      uint8_t value_header = 0;
498
0
      value_header |= static_cast<uint8_t>(VariantBasicType::SHORT_STRING);
499
0
      value_header |= static_cast<uint8_t>(string_size) << 2;
500
501
0
      *value_data = value_header;
502
0
      value_data++;
503
0
    }
504
0
    memcpy(value_data, reinterpret_cast<const char *>(string_value.GetData()), string_size);
505
0
    value_data += string_size;
506
0
    break;
507
0
  }
508
0
  case VariantLogicalType::VARIANT_NULL:
509
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::NULL_TYPE>(value_data);
510
0
    break;
511
0
  case VariantLogicalType::BOOL_TRUE:
512
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::BOOLEAN_TRUE>(value_data);
513
0
    break;
514
0
  case VariantLogicalType::BOOL_FALSE:
515
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::BOOLEAN_FALSE>(value_data);
516
0
    break;
517
0
  case VariantLogicalType::INT8:
518
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT8>(value_data);
519
0
    CopySimplePrimitiveData<int8_t>(variant, value_data, row, values_index);
520
0
    break;
521
0
  case VariantLogicalType::INT16:
522
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT16>(value_data);
523
0
    CopySimplePrimitiveData<int16_t>(variant, value_data, row, values_index);
524
0
    break;
525
0
  case VariantLogicalType::INT32:
526
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT32>(value_data);
527
0
    CopySimplePrimitiveData<int32_t>(variant, value_data, row, values_index);
528
0
    break;
529
0
  case VariantLogicalType::INT64:
530
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data);
531
0
    CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
532
0
    break;
533
0
  case VariantLogicalType::FLOAT:
534
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::FLOAT>(value_data);
535
0
    CopySimplePrimitiveData<float>(variant, value_data, row, values_index);
536
0
    break;
537
0
  case VariantLogicalType::DOUBLE:
538
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::DOUBLE>(value_data);
539
0
    CopySimplePrimitiveData<double>(variant, value_data, row, values_index);
540
0
    break;
541
0
  case VariantLogicalType::UUID:
542
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::UUID>(value_data);
543
0
    CopyUUIDData(variant, value_data, row, values_index);
544
0
    break;
545
0
  case VariantLogicalType::DATE:
546
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::DATE>(value_data);
547
0
    CopySimplePrimitiveData<int32_t>(variant, value_data, row, values_index);
548
0
    break;
549
0
  case VariantLogicalType::TIME_MICROS:
550
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::TIME_NTZ_MICROS>(value_data);
551
0
    CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
552
0
    break;
553
0
  case VariantLogicalType::TIMESTAMP_MICROS:
554
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_NTZ_MICROS>(value_data);
555
0
    CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
556
0
    break;
557
0
  case VariantLogicalType::TIMESTAMP_NANOS:
558
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_NTZ_NANOS>(value_data);
559
0
    CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
560
0
    break;
561
0
  case VariantLogicalType::TIMESTAMP_MICROS_TZ:
562
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::TIMESTAMP_MICROS>(value_data);
563
0
    CopySimplePrimitiveData<int64_t>(variant, value_data, row, values_index);
564
0
    break;
565
0
  case VariantLogicalType::DECIMAL: {
566
0
    auto decimal_data = VariantUtils::DecodeDecimalData(variant, row, values_index);
567
568
0
    if (decimal_data.width > 38) {
569
0
      throw InvalidInputException("Can't convert VARIANT DECIMAL(%d, %d) to Parquet VARIANT", decimal_data.width,
570
0
                                  decimal_data.scale);
571
0
    } else if (decimal_data.width <= 4) {
572
      // DuckDB uses INT16 to store small decimals, but parquet only supports DECIMAL4 at minimum, so here we
573
      // promote to INT32.
574
0
      WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL4>(value_data);
575
0
      Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data);
576
0
      value_data++;
577
0
      const int32_t promoted = Load<int16_t>(decimal_data.value_ptr);
578
0
      Store<int32_t>(promoted, value_data);
579
0
      value_data += sizeof(int32_t);
580
0
    } else if (decimal_data.width <= 9) {
581
0
      WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL4>(value_data);
582
0
      Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data);
583
0
      value_data++;
584
0
      memcpy(value_data, decimal_data.value_ptr, sizeof(int32_t));
585
0
      value_data += sizeof(int32_t);
586
0
    } else if (decimal_data.width <= 18) {
587
0
      WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL8>(value_data);
588
0
      Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data);
589
0
      value_data++;
590
0
      memcpy(value_data, decimal_data.value_ptr, sizeof(int64_t));
591
0
      value_data += sizeof(int64_t);
592
0
    } else if (decimal_data.width <= 38) {
593
0
      WritePrimitiveTypeHeader<VariantPrimitiveType::DECIMAL16>(value_data);
594
0
      Store<int8_t>(NumericCast<int8_t>(decimal_data.scale), value_data);
595
0
      value_data++;
596
0
      memcpy(value_data, decimal_data.value_ptr, sizeof(hugeint_t));
597
0
      value_data += sizeof(hugeint_t);
598
0
    } else {
599
0
      throw InternalException(
600
0
          "Uncovered VARIANT(DECIMAL) -> Parquet VARIANT conversion for type 'DECIMAL(%d, %d)'",
601
0
          decimal_data.width, decimal_data.scale);
602
0
    }
603
0
    break;
604
0
  }
605
0
  case VariantLogicalType::UINT8:
606
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT16>(value_data);
607
0
    CopySimplePrimitiveData<int16_t, VariantSimpleConversion<uint8_t>>(variant, value_data, row, values_index);
608
0
    break;
609
0
  case VariantLogicalType::UINT16:
610
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT32>(value_data);
611
0
    CopySimplePrimitiveData<int32_t, VariantSimpleConversion<uint16_t>>(variant, value_data, row, values_index);
612
0
    break;
613
0
  case VariantLogicalType::UINT32:
614
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data);
615
0
    CopySimplePrimitiveData<int64_t, VariantSimpleConversion<uint32_t>>(variant, value_data, row, values_index);
616
0
    break;
617
0
  case VariantLogicalType::UINT64:
618
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data);
619
0
    CopySimplePrimitiveData<int64_t, VariantTryConvert<uint64_t>>(variant, value_data, row, values_index);
620
0
    break;
621
0
  case VariantLogicalType::UINT128:
622
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data);
623
0
    CopySimplePrimitiveData<int64_t, VariantTryConvert<uhugeint_t>>(variant, value_data, row, values_index);
624
0
    break;
625
0
  case VariantLogicalType::INT128:
626
0
    WritePrimitiveTypeHeader<VariantPrimitiveType::INT64>(value_data);
627
0
    CopySimplePrimitiveData<int64_t, VariantTryConvert<hugeint_t>>(variant, value_data, row, values_index);
628
0
    break;
629
0
  case VariantLogicalType::INTERVAL:
630
0
  case VariantLogicalType::BIGNUM:
631
0
  case VariantLogicalType::BITSTRING:
632
0
  case VariantLogicalType::TIMESTAMP_MILIS:
633
0
  case VariantLogicalType::TIMESTAMP_SEC:
634
0
  case VariantLogicalType::TIME_MICROS_TZ:
635
0
  case VariantLogicalType::TIME_NANOS:
636
0
  default:
637
0
    throw InvalidInputException("Can't convert VARIANT of type '%s' to Parquet VARIANT",
638
0
                                EnumUtil::ToString(type_id));
639
0
  }
640
0
}
641
642
static void WriteValueData(const UnifiedVariantVectorData &variant, idx_t row, uint32_t values_index,
643
                           data_ptr_t &value_data, const vector<uint32_t> &offsets, idx_t &offset_index,
644
0
                           optional_ptr<ParquetVariantShreddingState> shredding_state) {
645
0
  VariantLogicalType type_id = VariantLogicalType::VARIANT_NULL;
646
0
  if (variant.RowIsValid(row)) {
647
0
    type_id = variant.GetTypeId(row, values_index);
648
0
  }
649
0
  if (type_id == VariantLogicalType::OBJECT) {
650
0
    auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);
651
652
    //! -- Object value header --
653
654
0
    auto child_indices = GetChildIndices(variant, row, nested_data, shredding_state);
655
0
    if (nested_data.child_count && child_indices.empty()) {
656
0
      throw InternalException(
657
0
          "The entire should be omitted, should have been handled by the Analyze step already");
658
0
    }
659
0
    auto num_elements = child_indices.size();
660
661
    //! Determine the 'field_id_size'
662
0
    uint32_t highest_keys_index = 0;
663
0
    for (auto &i : child_indices) {
664
0
      auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
665
0
      highest_keys_index = MaxValue(highest_keys_index, keys_index);
666
0
    }
667
0
    auto field_id_size = CalculateByteLength(highest_keys_index);
668
669
0
    uint32_t last_offset = 0;
670
0
    if (num_elements) {
671
0
      last_offset = offsets[offset_index + num_elements];
672
0
    }
673
0
    offset_index += num_elements + 1;
674
0
    auto field_offset_size = CalculateByteLength(last_offset);
675
676
0
    const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();
677
678
0
    uint8_t value_header = 0;
679
0
    value_header |= static_cast<uint8_t>(VariantBasicType::OBJECT);
680
0
    value_header |= static_cast<uint8_t>(is_large) << 6;
681
0
    value_header |= (static_cast<uint8_t>(field_id_size) - 1) << 4;
682
0
    value_header |= (static_cast<uint8_t>(field_offset_size) - 1) << 2;
683
684
#ifdef DEBUG
685
    auto object_value_header = VariantValueMetadata::FromHeaderByte(value_header);
686
    D_ASSERT(object_value_header.basic_type == VariantBasicType::OBJECT);
687
    D_ASSERT(object_value_header.is_large == is_large);
688
    D_ASSERT(object_value_header.field_offset_size == field_offset_size);
689
    D_ASSERT(object_value_header.field_id_size == field_id_size);
690
#endif
691
692
0
    *value_data = value_header;
693
0
    value_data++;
694
695
    //! Write the 'num_elements'
696
0
    if (is_large) {
697
0
      Store<uint32_t>(static_cast<uint32_t>(num_elements), value_data);
698
0
      value_data += sizeof(uint32_t);
699
0
    } else {
700
0
      Store<uint8_t>(static_cast<uint8_t>(num_elements), value_data);
701
0
      value_data += sizeof(uint8_t);
702
0
    }
703
704
    //! Write the 'field_id' entries
705
0
    for (auto &i : child_indices) {
706
0
      auto keys_index = variant.GetKeysIndex(row, i + nested_data.children_idx);
707
0
      memcpy(value_data, reinterpret_cast<data_ptr_t>(&keys_index), field_id_size);
708
0
      value_data += field_id_size;
709
0
    }
710
711
    //! Write the 'field_offset' entries and the child 'value's
712
0
    auto children_ptr = value_data + ((num_elements + 1) * field_offset_size);
713
0
    idx_t total_offset = 0;
714
0
    for (auto &i : child_indices) {
715
0
      auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);
716
717
0
      memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
718
0
      value_data += field_offset_size;
719
0
      auto start_ptr = children_ptr;
720
0
      WriteValueData(variant, row, values_index, children_ptr, offsets, offset_index, nullptr);
721
0
      total_offset += (children_ptr - start_ptr);
722
0
    }
723
0
    memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
724
0
    value_data += field_offset_size;
725
0
    D_ASSERT(children_ptr - total_offset == value_data);
726
0
    value_data = children_ptr;
727
0
  } else if (type_id == VariantLogicalType::ARRAY) {
728
0
    auto nested_data = VariantUtils::DecodeNestedData(variant, row, values_index);
729
730
    //! -- Array value header --
731
732
0
    uint32_t last_offset = 0;
733
0
    if (nested_data.child_count) {
734
0
      last_offset = offsets[offset_index + nested_data.child_count];
735
0
    }
736
0
    offset_index += nested_data.child_count + 1;
737
0
    auto field_offset_size = CalculateByteLength(last_offset);
738
739
0
    auto num_elements = nested_data.child_count;
740
0
    const bool is_large = num_elements > NumericLimits<uint8_t>::Maximum();
741
742
0
    uint8_t value_header = 0;
743
0
    value_header |= static_cast<uint8_t>(VariantBasicType::ARRAY);
744
0
    value_header |= static_cast<uint8_t>(is_large) << 4;
745
0
    value_header |= (static_cast<uint8_t>(field_offset_size) - 1) << 2;
746
747
#ifdef DEBUG
748
    auto array_value_header = VariantValueMetadata::FromHeaderByte(value_header);
749
    D_ASSERT(array_value_header.basic_type == VariantBasicType::ARRAY);
750
    D_ASSERT(array_value_header.is_large == is_large);
751
    D_ASSERT(array_value_header.field_offset_size == field_offset_size);
752
#endif
753
754
0
    *value_data = value_header;
755
0
    value_data++;
756
757
    //! Write the 'num_elements'
758
0
    if (is_large) {
759
0
      Store<uint32_t>(static_cast<uint32_t>(num_elements), value_data);
760
0
      value_data += sizeof(uint32_t);
761
0
    } else {
762
0
      Store<uint8_t>(static_cast<uint8_t>(num_elements), value_data);
763
0
      value_data += sizeof(uint8_t);
764
0
    }
765
766
    //! Write the 'field_offset' entries and the child 'value's
767
0
    auto children_ptr = value_data + ((num_elements + 1) * field_offset_size);
768
0
    idx_t total_offset = 0;
769
0
    for (idx_t i = 0; i < nested_data.child_count; i++) {
770
0
      auto values_index = variant.GetValuesIndex(row, i + nested_data.children_idx);
771
772
0
      memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
773
0
      value_data += field_offset_size;
774
0
      auto start_ptr = children_ptr;
775
0
      WriteValueData(variant, row, values_index, children_ptr, offsets, offset_index, nullptr);
776
0
      total_offset += (children_ptr - start_ptr);
777
0
    }
778
0
    memcpy(value_data, reinterpret_cast<data_ptr_t>(&total_offset), field_offset_size);
779
0
    value_data += field_offset_size;
780
0
    D_ASSERT(children_ptr - total_offset == value_data);
781
0
    value_data = children_ptr;
782
0
  } else {
783
0
    WritePrimitiveValueData(variant, row, values_index, value_data, offsets, offset_index);
784
0
  }
785
0
}
786
787
static void CreateValues(UnifiedVariantVectorData &variant, Vector &value, optional_ptr<const SelectionVector> sel,
788
                         optional_ptr<const SelectionVector> value_index_sel,
789
                         optional_ptr<const SelectionVector> result_sel,
790
0
                         optional_ptr<ParquetVariantShreddingState> shredding_state, idx_t count) {
791
0
  auto &validity = FlatVector::ValidityMutable(value);
792
0
  auto value_data = FlatVector::GetDataMutable<string_t>(value);
793
794
0
  for (idx_t i = 0; i < count; i++) {
795
0
    idx_t value_index = 0;
796
0
    if (value_index_sel) {
797
0
      value_index = value_index_sel->get_index(i);
798
0
    }
799
800
0
    idx_t row = i;
801
0
    if (sel) {
802
0
      row = sel->get_index(i);
803
0
    }
804
805
0
    idx_t result_index = i;
806
0
    if (result_sel) {
807
0
      result_index = result_sel->get_index(i);
808
0
    }
809
810
0
    bool is_shredded = false;
811
0
    if (variant.RowIsValid(row) && shredding_state && shredding_state->ValueIsShredded(variant, row, value_index)) {
812
0
      shredding_state->SetShredded(row, value_index, result_index);
813
0
      is_shredded = true;
814
0
      if (shredding_state->type.id() != LogicalTypeId::STRUCT) {
815
        //! Value is shredded, directly write a NULL to the 'value' if the type is not an OBJECT
816
        //! When the type is OBJECT, all excess fields would still need to be written to the 'value'
817
0
        validity.SetInvalid(result_index);
818
0
        continue;
819
0
      }
820
0
    }
821
822
    //! The (relative) offsets for each value, in the case of nesting
823
0
    vector<uint32_t> offsets;
824
    //! Determine the size of this 'value' blob
825
0
    idx_t blob_length = AnalyzeValueData(variant, row, value_index, offsets, shredding_state);
826
0
    if (!blob_length) {
827
      //! This is only allowed to happen for a shredded OBJECT, where there are no excess fields to write for the
828
      //! OBJECT
829
0
      (void)is_shredded;
830
0
      D_ASSERT(is_shredded);
831
0
      validity.SetInvalid(result_index);
832
0
      continue;
833
0
    }
834
0
    value_data[result_index] = StringVector::EmptyString(value, blob_length);
835
0
    auto &value_blob = value_data[result_index];
836
0
    auto value_blob_data = reinterpret_cast<data_ptr_t>(value_blob.GetDataWriteable());
837
838
0
    idx_t offset_index = 0;
839
0
    WriteValueData(variant, row, value_index, value_blob_data, offsets, offset_index, shredding_state);
840
0
    D_ASSERT(data_ptr_cast(value_blob.GetDataWriteable() + blob_length) == value_blob_data);
841
0
    value_blob.SetSizeAndFinalize(blob_length, blob_length);
842
0
  }
843
0
}
844
845
0
//! Marks the entry at 'index' of 'vector' as NULL, which is how a missing field is written.
void ParquetVariantShredding::WriteMissingField(Vector &vector, idx_t index) {
  const bool is_null = true;
  FlatVector::SetNull(vector, index, is_null);
}
849
850
void ParquetVariantShredding::WriteVariantValues(UnifiedVariantVectorData &variant, Vector &result,
851
                                                 optional_ptr<const SelectionVector> sel,
852
                                                 optional_ptr<const SelectionVector> value_index_sel,
853
0
                                                 optional_ptr<const SelectionVector> result_sel, idx_t count) {
854
0
  optional_ptr<Vector> value;
855
0
  optional_ptr<Vector> typed_value;
856
857
0
  auto &result_type = result.GetType();
858
0
  D_ASSERT(result_type.id() == LogicalTypeId::STRUCT);
859
0
  auto &child_types = StructType::GetChildTypes(result_type);
860
0
  auto &child_vectors = StructVector::GetEntries(result);
861
0
  D_ASSERT(child_types.size() == child_vectors.size());
862
0
  for (idx_t i = 0; i < child_types.size(); i++) {
863
0
    auto &name = child_types[i].first;
864
0
    if (name == "value") {
865
0
      value = &child_vectors[i];
866
0
    } else if (name == "typed_value") {
867
0
      typed_value = &child_vectors[i];
868
0
    }
869
0
  }
870
871
0
  if (typed_value) {
872
0
    ParquetVariantShreddingState shredding_state(typed_value->GetType(), count);
873
0
    CreateValues(variant, *value, sel, value_index_sel, result_sel, &shredding_state, count);
874
875
0
    SelectionVector null_values;
876
0
    if (shredding_state.count) {
877
0
      WriteTypedValues(variant, *typed_value, shredding_state.shredded_sel, shredding_state.values_index_sel,
878
0
                       shredding_state.result_sel, shredding_state.count);
879
      //! 'shredding_state.result_sel' will always be a subset of 'result_sel', set the rows not in the subset to
880
      //! NULL
881
0
      idx_t sel_idx = 0;
882
0
      for (idx_t i = 0; i < count; i++) {
883
0
        auto original_index = result_sel ? result_sel->get_index(i) : i;
884
0
        if (sel_idx < shredding_state.count && shredding_state.result_sel[sel_idx] == original_index) {
885
0
          sel_idx++;
886
0
          continue;
887
0
        }
888
0
        FlatVector::SetNull(*typed_value, original_index, true);
889
0
      }
890
0
    } else {
891
      //! Set all rows of the typed_value to NULL, nothing is shredded on
892
0
      for (idx_t i = 0; i < count; i++) {
893
0
        FlatVector::SetNull(*typed_value, result_sel ? result_sel->get_index(i) : i, true);
894
0
      }
895
0
    }
896
0
  } else {
897
0
    CreateValues(variant, *value, sel, value_index_sel, result_sel, nullptr, count);
898
0
  }
899
0
}
900
901
0
//! Scalar function implementation: converts a DuckDB VARIANT (first column of 'input') into the Parquet VARIANT
//! layout stored in the STRUCT vector 'result'.
//!
//! DuckDB Variant:
//! - keys = VARCHAR[]
//! - children = STRUCT(keys_index UINTEGER, values_index UINTEGER)[]
//! - values = STRUCT(type_id UTINYINT, byte_offset UINTEGER)[]
//! - data = BLOB
//!
//! Parquet VARIANT:
//! - metadata = BLOB
//! - value = BLOB
static void ToParquetVariant(DataChunk &input, ExpressionState &state, Vector &result) {
  const auto &source = input.data[0];
  auto row_count = input.size();

  //! Bring the (nested) variant vector into a unified format so it can be read uniformly
  RecursiveUnifiedVectorFormat unified_source;
  Vector::RecursiveToUnifiedFormat(source, unified_source);
  UnifiedVariantVectorData variant(unified_source);

  //! The first child of the result holds the 'metadata'
  auto &result_children = StructVector::GetEntries(result);
  auto &metadata = result_children[0];
  CreateMetadata(variant, metadata, row_count);

  //! The 'value' (and optional 'typed_value') are produced for all rows, without any selection
  ParquetVariantShredding shredding;
  shredding.WriteVariantValues(variant, result, nullptr, nullptr, nullptr, row_count);
}
926
927
0
//! Appends the schema element of the variant group to 'schemas', followed by the elements of all child writers.
//! Returns the sum of the values returned by FinalizeSchema of the child writers.
idx_t VariantColumnWriter::FinalizeSchema(vector<duckdb_parquet::SchemaElement> &schemas) {
  //! The variant group is placed at the current end of 'schemas'
  const idx_t group_index = schemas.size();

  auto &schema = Schema();
  schema.SetSchemaIndex(group_index);

  auto &field_id = schema.field_id;

  // variant group
  duckdb_parquet::SchemaElement group;
  group.name = schema.name;

  group.repetition_type = schema.repetition_type;
  group.__isset.repetition_type = true;

  group.num_children = NumericCast<int32_t>(child_writers.size());
  group.__isset.num_children = true;

  group.logicalType.VARIANT.specification_version = 1;
  group.logicalType.VARIANT.__isset.specification_version = true;
  group.logicalType.__isset.VARIANT = true;
  group.__isset.logicalType = true;

  //! The field id is optional
  if (field_id.IsValid()) {
    group.field_id = NumericCast<int32_t>(field_id.GetIndex());
    group.__isset.field_id = true;
  }
  schemas.push_back(std::move(group));

  idx_t total_columns = 0;
  for (auto &writer : child_writers) {
    total_columns += writer->FinalizeSchema(schemas);
  }
  return total_columns;
}
960
961
0
//! Rewrites a shredding type into the type of the corresponding 'typed_value':
//! - STRUCT: every field is replaced by STRUCT(value BLOB [, typed_value <transformed field type>])
//! - LIST: the element is replaced by STRUCT(value BLOB [, typed_value <transformed element type>])
//! - UNION / MAP / VARIANT / ARRAY: not allowed, throws a BinderException
//! - anything else is returned unchanged
//! The 'typed_value' is left out of the wrapper when the wrapped type is VARIANT.
LogicalType VariantColumnWriter::TransformTypedValueRecursive(const LogicalType &type) {
  //! Builds the 'value' + optional 'typed_value' wrapper around a nested type
  auto wrap_nested = [](const LogicalType &inner) -> LogicalType {
    child_list_t<LogicalType> wrapper;
    wrapper.emplace_back("value", LogicalType::BLOB);
    if (inner.id() != LogicalTypeId::VARIANT) {
      wrapper.emplace_back("typed_value", TransformTypedValueRecursive(inner));
    }
    return LogicalType::STRUCT(wrapper);
  };

  switch (type.id()) {
  case LogicalTypeId::STRUCT: {
    //! Wrap all fields of the struct in a struct with 'value' and 'typed_value' fields
    child_list_t<LogicalType> wrapped_fields;
    for (auto &field : StructType::GetChildTypes(type)) {
      wrapped_fields.emplace_back(field.first, wrap_nested(field.second));
    }
    return LogicalType::STRUCT(wrapped_fields);
  }
  case LogicalTypeId::LIST:
    //! Wrap the element type of the list
    return LogicalType::LIST(wrap_nested(ListType::GetChildType(type)));
  case LogicalTypeId::UNION:
  case LogicalTypeId::MAP:
  case LogicalTypeId::VARIANT:
  case LogicalTypeId::ARRAY:
    throw BinderException("'%s' can't appear inside a 'typed_value' shredded type!", type.ToString());
  default:
    return type;
  }
}
995
996
0
//! Creates the STRUCT type (aliased 'PARQUET_VARIANT') with a 'metadata' and 'value' BLOB.
//! When 'shredding' is provided and is not VARIANT, a 'typed_value' field derived from it is added as well.
static LogicalType GetParquetVariantType(optional_ptr<LogicalType> shredding = nullptr) {
  child_list_t<LogicalType> fields;
  fields.emplace_back("metadata", LogicalType::BLOB);
  fields.emplace_back("value", LogicalType::BLOB);

  const bool has_typed_value = shredding && shredding->id() != LogicalTypeId::VARIANT;
  if (has_typed_value) {
    fields.emplace_back("typed_value", VariantColumnWriter::TransformTypedValueRecursive(*shredding));
  }

  auto parquet_variant = LogicalType::STRUCT(std::move(fields));
  parquet_variant.SetAlias("PARQUET_VARIANT");
  return parquet_variant;
}
1007
1008
0
//! Bind callback of the transform function: sets the return type of the bound function.
//! Without a second argument the plain (unshredded) Parquet VARIANT type is used. With a second argument,
//! it has to be a constant, non-NULL VARCHAR that is parsed into the type to shred on.
//! Never produces bind data (always returns nullptr).
static unique_ptr<FunctionData> BindTransform(BindScalarFunctionInput &input) {
  auto &context = input.GetClientContext();
  auto &bound_function = input.GetBoundFunction();
  auto &arguments = input.GetArguments();
  if (arguments.empty()) {
    return nullptr;
  }
  //! The result is not used below, the lookup itself is kept as-is
  auto input_type = ExpressionBinder::GetExpressionReturnType(*arguments[0]);
  (void)input_type;

  if (arguments.size() != 2) {
    //! No shredding requested
    bound_function.SetReturnType(GetParquetVariantType());
    return nullptr;
  }

  auto &shredding = *arguments[1];
  auto shredding_arg_type = LogicalType::NormalizeType(ExpressionBinder::GetExpressionReturnType(shredding));
  if (shredding_arg_type.id() != LogicalTypeId::VARCHAR) {
    throw BinderException("Optional second argument 'shredding' has to be of type VARCHAR, i.e: "
                          "'STRUCT(my_field BOOLEAN)', found type: '%s' instead",
                          shredding_arg_type);
  }
  if (!shredding.IsFoldable()) {
    throw BinderException("Optional second argument 'shredding' has to be a constant expression");
  }

  //! The argument is constant, so it can be evaluated at bind time
  Value type_str = ExpressionExecutor::EvaluateScalar(context, shredding);
  if (type_str.IsNull()) {
    throw BinderException("Optional second argument 'shredding' can not be NULL");
  }

  auto shredded_type = TransformStringToLogicalType(type_str.GetValue<string>(), context);
  bound_function.SetReturnType(GetParquetVariantType(shredded_type));
  return nullptr;
}
1041
1042
6.58k
//! Creates the 'variant_to_parquet_variant' scalar function.
//! It is registered with a single VARIANT argument and an ANY return type; the actual return type is set by
//! BindTransform. NULL handling is set to SPECIAL_HANDLING.
ScalarFunction VariantColumnWriter::GetTransformFunction() {
  ScalarFunction function("variant_to_parquet_variant", {LogicalType::VARIANT()}, LogicalType::ANY,
                          ToParquetVariant, BindTransform);
  function.SetNullHandling(FunctionNullHandling::SPECIAL_HANDLING);
  return function;
}
1048
1049
} // namespace duckdb