Coverage Report

Created: 2025-09-05 08:05

/src/duckdb/extension/parquet/include/column_writer.hpp
Line
Count
Source (jump to first uncovered line)
1
//===----------------------------------------------------------------------===//
2
//                         DuckDB
3
//
4
// column_writer.hpp
5
//
6
//
7
//===----------------------------------------------------------------------===//
8
9
#pragma once
10
11
#include "duckdb.hpp"
12
#include "parquet_types.h"
13
#include "parquet_column_schema.hpp"
14
15
namespace duckdb {
16
class MemoryStream;
17
class ParquetWriter;
18
class ColumnWriterPageState;
19
class PrimitiveColumnWriterState;
20
struct ChildFieldIDs;
21
class ResizeableBuffer;
22
class ParquetBloomFilter;
23
24
class ColumnWriterState {
25
public:
26
  virtual ~ColumnWriterState();
27
28
  unsafe_vector<uint16_t> definition_levels;
29
  unsafe_vector<uint16_t> repetition_levels;
30
  vector<bool> is_empty;
31
  idx_t parent_null_count = 0;
32
  idx_t null_count = 0;
33
34
public:
35
  template <class TARGET>
36
0
  TARGET &Cast() {
37
0
    DynamicCastCheck<TARGET>(this);
38
0
    return reinterpret_cast<TARGET &>(*this);
39
0
  }
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetGeometryOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetGeometryOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<signed char, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<signed char, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<short, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<short, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<int, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<int, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<long, long, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<long, long, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::dtime_tz_t, long, duckdb::ParquetTimeTZOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::dtime_tz_t, long, duckdb::ParquetTimeTZOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::hugeint_t, double, duckdb::ParquetHugeintOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::hugeint_t, double, duckdb::ParquetHugeintOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::uhugeint_t, double, duckdb::ParquetUhugeintOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::uhugeint_t, double, duckdb::ParquetUhugeintOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampNSOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampNSOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampSOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampSOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned char, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned char, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned short, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned short, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned int, unsigned int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned int, unsigned int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned long, unsigned long, duckdb::ParquetCastOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned long, unsigned long, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::float_na_equal, float, duckdb::FloatingPointOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::float_na_equal, float, duckdb::FloatingPointOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::double_na_equal, double, duckdb::FloatingPointOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::double_na_equal, double, duckdb::FloatingPointOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetBlobOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetBlobOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetStringOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetStringOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::hugeint_t, duckdb::ParquetUUIDTargetType, duckdb::ParquetUUIDOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::hugeint_t, duckdb::ParquetUUIDTargetType, duckdb::ParquetUUIDOperator> >()
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::interval_t, duckdb::ParquetIntervalTargetType, duckdb::ParquetIntervalOperator>& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::interval_t, duckdb::ParquetIntervalTargetType, duckdb::ParquetIntervalOperator> >()
Unexecuted instantiation: duckdb::ListColumnWriterState& duckdb::ColumnWriterState::Cast<duckdb::ListColumnWriterState>()
Unexecuted instantiation: duckdb::PrimitiveColumnWriterState& duckdb::ColumnWriterState::Cast<duckdb::PrimitiveColumnWriterState>()
Unexecuted instantiation: duckdb::StructColumnWriterState& duckdb::ColumnWriterState::Cast<duckdb::StructColumnWriterState>()
40
  template <class TARGET>
41
0
  const TARGET &Cast() const {
42
0
    D_ASSERT(dynamic_cast<const TARGET *>(this));
43
0
    return reinterpret_cast<const TARGET &>(*this);
44
0
  }
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetGeometryOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetGeometryOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<signed char, int, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<signed char, int, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<short, int, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<short, int, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<int, int, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<int, int, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<long, long, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<long, long, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::dtime_tz_t, long, duckdb::ParquetTimeTZOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::dtime_tz_t, long, duckdb::ParquetTimeTZOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::hugeint_t, double, duckdb::ParquetHugeintOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::hugeint_t, double, duckdb::ParquetHugeintOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::uhugeint_t, double, duckdb::ParquetUhugeintOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::uhugeint_t, double, duckdb::ParquetUhugeintOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampNSOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampNSOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampSOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<long, long, duckdb::ParquetTimestampSOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned char, int, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned char, int, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned short, int, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned short, int, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned int, unsigned int, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned int, unsigned int, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<unsigned long, unsigned long, duckdb::ParquetCastOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<unsigned long, unsigned long, duckdb::ParquetCastOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::float_na_equal, float, duckdb::FloatingPointOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::float_na_equal, float, duckdb::FloatingPointOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::double_na_equal, double, duckdb::FloatingPointOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::double_na_equal, double, duckdb::FloatingPointOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetBlobOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetBlobOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetStringOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::string_t, duckdb::string_t, duckdb::ParquetStringOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::hugeint_t, duckdb::ParquetUUIDTargetType, duckdb::ParquetUUIDOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::hugeint_t, duckdb::ParquetUUIDTargetType, duckdb::ParquetUUIDOperator> >() const
Unexecuted instantiation: duckdb::StandardColumnWriterState<duckdb::interval_t, duckdb::ParquetIntervalTargetType, duckdb::ParquetIntervalOperator> const& duckdb::ColumnWriterState::Cast<duckdb::StandardColumnWriterState<duckdb::interval_t, duckdb::ParquetIntervalTargetType, duckdb::ParquetIntervalOperator> >() const
45
};
46
47
class ColumnWriterPageState {
48
public:
49
0
  virtual ~ColumnWriterPageState() {
50
0
  }
51
52
public:
53
  template <class TARGET>
54
0
  TARGET &Cast() {
55
0
    DynamicCastCheck<TARGET>(this);
56
0
    return reinterpret_cast<TARGET &>(*this);
57
0
  }
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::string_t, duckdb::string_t, duckdb::ParquetGeometryOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::string_t, duckdb::string_t, duckdb::ParquetGeometryOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<signed char, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<signed char, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<short, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<short, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<int, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<int, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<long, long, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<long, long, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::dtime_tz_t, long, duckdb::ParquetTimeTZOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::dtime_tz_t, long, duckdb::ParquetTimeTZOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::hugeint_t, double, duckdb::ParquetHugeintOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::hugeint_t, double, duckdb::ParquetHugeintOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::uhugeint_t, double, duckdb::ParquetUhugeintOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::uhugeint_t, double, duckdb::ParquetUhugeintOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<long, long, duckdb::ParquetTimestampNSOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<long, long, duckdb::ParquetTimestampNSOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<long, long, duckdb::ParquetTimestampSOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<long, long, duckdb::ParquetTimestampSOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<unsigned char, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<unsigned char, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<unsigned short, int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<unsigned short, int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<unsigned int, unsigned int, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<unsigned int, unsigned int, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<unsigned long, unsigned long, duckdb::ParquetCastOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<unsigned long, unsigned long, duckdb::ParquetCastOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::float_na_equal, float, duckdb::FloatingPointOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::float_na_equal, float, duckdb::FloatingPointOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::double_na_equal, double, duckdb::FloatingPointOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::double_na_equal, double, duckdb::FloatingPointOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::string_t, duckdb::string_t, duckdb::ParquetBlobOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::string_t, duckdb::string_t, duckdb::ParquetBlobOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::string_t, duckdb::string_t, duckdb::ParquetStringOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::string_t, duckdb::string_t, duckdb::ParquetStringOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::hugeint_t, duckdb::ParquetUUIDTargetType, duckdb::ParquetUUIDOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::hugeint_t, duckdb::ParquetUUIDTargetType, duckdb::ParquetUUIDOperator> >()
Unexecuted instantiation: duckdb::StandardWriterPageState<duckdb::interval_t, duckdb::ParquetIntervalTargetType, duckdb::ParquetIntervalOperator>& duckdb::ColumnWriterPageState::Cast<duckdb::StandardWriterPageState<duckdb::interval_t, duckdb::ParquetIntervalTargetType, duckdb::ParquetIntervalOperator> >()
Unexecuted instantiation: duckdb::BooleanWriterPageState& duckdb::ColumnWriterPageState::Cast<duckdb::BooleanWriterPageState>()
Unexecuted instantiation: duckdb::EnumWriterPageState& duckdb::ColumnWriterPageState::Cast<duckdb::EnumWriterPageState>()
58
  template <class TARGET>
59
  const TARGET &Cast() const {
60
    D_ASSERT(dynamic_cast<const TARGET *>(this));
61
    return reinterpret_cast<const TARGET &>(*this);
62
  }
63
};
64
65
class ColumnWriter {
66
protected:
67
  static constexpr uint16_t PARQUET_DEFINE_VALID = UINT16_C(65535);
68
69
public:
70
  ColumnWriter(ParquetWriter &writer, const ParquetColumnSchema &column_schema, vector<string> schema_path,
71
               bool can_have_nulls);
72
  virtual ~ColumnWriter();
73
74
  ParquetWriter &writer;
75
  const ParquetColumnSchema &column_schema;
76
  vector<string> schema_path;
77
  bool can_have_nulls;
78
79
public:
80
0
  const LogicalType &Type() const {
81
0
    return column_schema.type;
82
0
  }
83
0
  const ParquetColumnSchema &Schema() const {
84
0
    return column_schema;
85
0
  }
86
0
  inline idx_t SchemaIndex() const {
87
0
    return column_schema.schema_index;
88
0
  }
89
0
  inline idx_t MaxDefine() const {
90
0
    return column_schema.max_define;
91
0
  }
92
0
  idx_t MaxRepeat() const {
93
0
    return column_schema.max_repeat;
94
0
  }
95
96
  static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
97
                                               const LogicalType &type, const string &name,
98
                                               optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat = 0,
99
                                               idx_t max_define = 1, bool can_have_nulls = true);
100
  //! Create the column writer for a specific type recursively
101
  static unique_ptr<ColumnWriter> CreateWriterRecursive(ClientContext &context, ParquetWriter &writer,
102
                                                        const vector<duckdb_parquet::SchemaElement> &parquet_schemas,
103
                                                        const ParquetColumnSchema &schema,
104
                                                        vector<string> path_in_schema);
105
106
  virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::RowGroup &row_group) = 0;
107
108
  //! Indicates whether the writer needs to analyze the data before preparing it
109
0
  virtual bool HasAnalyze() {
110
0
    return false;
111
0
  }
112
113
0
  virtual void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) {
114
0
    throw NotImplementedException("Writer does not need analysis");
115
0
  }
116
117
  //! Called after all data has been passed to Analyze
118
0
  virtual void FinalizeAnalyze(ColumnWriterState &state) {
119
0
    throw NotImplementedException("Writer does not need analysis");
120
0
  }
121
122
  virtual void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
123
                       bool vector_can_span_multiple_pages) = 0;
124
125
  virtual void BeginWrite(ColumnWriterState &state) = 0;
126
  virtual void Write(ColumnWriterState &state, Vector &vector, idx_t count) = 0;
127
  virtual void FinalizeWrite(ColumnWriterState &state) = 0;
128
129
protected:
130
  void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
131
                          const idx_t count, const uint16_t define_value, const uint16_t null_value) const;
132
  void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count, idx_t max_repeat) const;
133
134
  void CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
135
                    AllocatedData &compressed_buf);
136
};
137
138
} // namespace duckdb