/src/duckdb/extension/parquet/writer/boolean_column_writer.cpp
Line | Count | Source |
1 | | #include "writer/boolean_column_writer.hpp" |
2 | | |
3 | | #include <stdint.h> |
4 | | #include <utility> |
5 | | |
6 | | #include "duckdb/common/helper.hpp" |
7 | | #include "duckdb/common/serializer/write_stream.hpp" |
8 | | #include "duckdb/common/types/validity_mask.hpp" |
9 | | #include "duckdb/common/vector/flat_vector.hpp" |
10 | | #include "parquet_column_schema.hpp" |
11 | | |
12 | | namespace duckdb { |
13 | | class ParquetWriter; |
14 | | class Vector; |
15 | | |
16 | | class BooleanStatisticsState : public ColumnWriterStatistics { |
17 | | public: |
18 | 0 | BooleanStatisticsState() : min(true), max(false) { |
19 | 0 | } |
20 | | |
21 | | bool min; |
22 | | bool max; |
23 | | |
24 | | public: |
25 | 0 | bool HasStats() override { |
26 | 0 | return !(min && !max); |
27 | 0 | } |
28 | | |
29 | 0 | string GetMin() override { |
30 | 0 | return GetMinValue(); |
31 | 0 | } |
32 | 0 | string GetMax() override { |
33 | 0 | return GetMaxValue(); |
34 | 0 | } |
35 | 0 | string GetMinValue() override { |
36 | 0 | return HasStats() ? string(const_char_ptr_cast(&min), sizeof(bool)) : string(); |
37 | 0 | } |
38 | 0 | string GetMaxValue() override { |
39 | 0 | return HasStats() ? string(const_char_ptr_cast(&max), sizeof(bool)) : string(); |
40 | 0 | } |
41 | | }; |
42 | | |
43 | | class BooleanWriterPageState : public ColumnWriterPageState { |
44 | | public: |
45 | | uint8_t byte = 0; |
46 | | uint8_t byte_pos = 0; |
47 | | }; |
48 | | |
49 | | BooleanColumnWriter::BooleanColumnWriter(ParquetWriter &writer, ParquetColumnSchema &&column_schema, |
50 | | vector<Identifier> schema_path_p) |
51 | 0 | : PrimitiveColumnWriter(writer, std::move(column_schema), std::move(schema_path_p)) { |
52 | 0 | } |
53 | | |
54 | 0 | unique_ptr<ColumnWriterStatistics> BooleanColumnWriter::InitializeStatsState() { |
55 | 0 | return make_uniq<BooleanStatisticsState>(); |
56 | 0 | } |
57 | | |
58 | | void BooleanColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats_p, |
59 | | ColumnWriterPageState *state_p, Vector &input_column, idx_t chunk_start, |
60 | 0 | idx_t chunk_end) { |
61 | 0 | auto &stats = stats_p->Cast<BooleanStatisticsState>(); |
62 | 0 | auto &state = state_p->Cast<BooleanWriterPageState>(); |
63 | 0 | const auto &mask = FlatVector::Validity(input_column); |
64 | |
|
65 | 0 | const auto *const ptr = FlatVector::GetData<bool>(input_column); |
66 | 0 | if (stats.max && !stats.min && mask.CannotHaveNull()) { |
67 | | // Fast path: stats have already been set, and there's no NULLs |
68 | 0 | for (idx_t r = chunk_start; r < chunk_end; r++) { |
69 | 0 | const auto &val = ptr[r]; |
70 | 0 | state.byte |= val << state.byte_pos; |
71 | 0 | if (++state.byte_pos == 8) { |
72 | 0 | temp_writer.Write(state.byte); |
73 | 0 | state.byte = 0; |
74 | 0 | state.byte_pos = 0; |
75 | 0 | } |
76 | 0 | } |
77 | 0 | } else { |
78 | 0 | for (idx_t r = chunk_start; r < chunk_end; r++) { |
79 | 0 | if (!mask.RowIsValid(r)) { |
80 | 0 | continue; |
81 | 0 | } |
82 | 0 | const auto &val = ptr[r]; |
83 | |
|
84 | 0 | stats.max |= val; |
85 | 0 | stats.min &= val; |
86 | 0 | state.byte |= val << state.byte_pos; |
87 | |
|
88 | 0 | if (++state.byte_pos == 8) { |
89 | 0 | temp_writer.Write(state.byte); |
90 | 0 | state.byte = 0; |
91 | 0 | state.byte_pos = 0; |
92 | 0 | } |
93 | 0 | } |
94 | 0 | } |
95 | 0 | } |
96 | | |
97 | | unique_ptr<ColumnWriterPageState> BooleanColumnWriter::InitializePageState(PrimitiveColumnWriterState &state, |
98 | 0 | idx_t page_idx) { |
99 | 0 | return make_uniq<BooleanWriterPageState>(); |
100 | 0 | } |
101 | | |
102 | 0 | void BooleanColumnWriter::FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) { |
103 | 0 | auto &state = state_p->Cast<BooleanWriterPageState>(); |
104 | 0 | if (state.byte_pos > 0) { |
105 | 0 | temp_writer.Write<uint8_t>(state.byte); |
106 | 0 | state.byte = 0; |
107 | 0 | state.byte_pos = 0; |
108 | 0 | } |
109 | 0 | } |
110 | | |
111 | | idx_t BooleanColumnWriter::GetRowSize(const Vector &vector, const idx_t index, |
112 | 0 | const PrimitiveColumnWriterState &state) const { |
113 | 0 | return sizeof(bool); |
114 | 0 | } |
115 | | |
116 | | } // namespace duckdb |