/src/duckdb/extension/parquet/parquet_metadata.cpp
Line | Count | Source |
1 | | #include "duckdb/common/vector/list_vector.hpp" |
2 | | #include "duckdb/common/vector/map_vector.hpp" |
3 | | #include "duckdb/common/vector/struct_vector.hpp" |
4 | | #include "parquet_metadata.hpp" |
5 | | |
6 | | #include "parquet_statistics.hpp" |
7 | | |
8 | | #include <sstream> |
9 | | |
10 | | #include "duckdb/common/multi_file/multi_file_reader.hpp" |
11 | | #include "duckdb/common/types/blob.hpp" |
12 | | #include "duckdb/planner/filter/constant_filter.hpp" |
13 | | #include "duckdb/main/config.hpp" |
14 | | #include "duckdb/common/multi_file/multi_file_list.hpp" |
15 | | #include "parquet_reader.hpp" |
16 | | #include "duckdb/common/numeric_utils.hpp" |
17 | | |
18 | | namespace duckdb { |
19 | | |
20 | | struct ParquetMetadataFilePaths { |
21 | | MultiFileListScanData scan_data; |
22 | | shared_ptr<MultiFileList> file_list; |
23 | | mutex file_lock; |
24 | | |
25 | 0 | bool NextFile(OpenFileInfo &result) { |
26 | 0 | D_ASSERT(file_list); |
27 | 0 | unique_lock<mutex> lock(file_lock); |
28 | 0 | return file_list->Scan(scan_data, result); |
29 | 0 | } |
30 | | |
31 | 0 | FileExpandResult GetExpandResult() { |
32 | 0 | D_ASSERT(file_list); |
33 | 0 | unique_lock<mutex> lock(file_lock); |
34 | 0 | return file_list->GetExpandResult(); |
35 | 0 | } |
36 | | }; |
37 | | |
//! Bind data for the parquet metadata table functions: owns the shared file list.
struct ParquetMetaDataBindData : public TableFunctionData {
	unique_ptr<ParquetMetadataFilePaths> file_paths;
};
41 | | |
//! Bind data for the bloom-probe variant: adds the column and constant to probe.
struct ParquetBloomProbeBindData : public ParquetMetaDataBindData {
	//! Name of the column whose bloom filter is probed
	string probe_column_name;
	//! Constant value tested against the bloom filter
	Value probe_constant;
};
46 | | |
//! Which metadata-producing table function variant is being executed.
enum class ParquetMetadataOperatorType : uint8_t {
	META_DATA,           // per-(row group, column) metadata
	SCHEMA,              // one row per schema element
	KEY_VALUE_META_DATA, // file-level key/value metadata pairs
	FILE_META_DATA,      // one row of file-level metadata
	BLOOM_PROBE,         // bloom-filter probe results
	FULL_METADATA        // complete metadata dump
};
55 | | |
56 | | class ParquetMetadataFileProcessor { |
57 | | public: |
58 | 0 | ParquetMetadataFileProcessor() = default; |
59 | 0 | virtual ~ParquetMetadataFileProcessor() = default; |
60 | 0 | void Initialize(ClientContext &context, ParquetReader &reader) { |
61 | 0 | InitializeInternal(context, reader); |
62 | 0 | } |
63 | 0 | virtual void InitializeInternal(ClientContext &context, ParquetReader &reader) {}; |
64 | | virtual idx_t TotalRowCount(ParquetReader &reader) = 0; |
65 | | virtual void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) = 0; |
66 | 0 | virtual bool ForceFlush() { |
67 | 0 | return false; |
68 | 0 | } |
69 | | }; |
70 | | |
71 | | struct ParquetMetaDataBindData; |
72 | | |
//! Static entry points implementing the parquet metadata table functions,
//! templated on the operator type so all variants share bind/scan plumbing.
class ParquetMetaDataOperator {
public:
	//! Bind: resolves the file list and produces the result schema for OP_TYPE
	template <ParquetMetadataOperatorType OP_TYPE>
	static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input,
	                                     vector<LogicalType> &return_types, vector<string> &names);
	//! Create the global state (shared file cursor, thread count)
	static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context, TableFunctionInitInput &input);
	//! Create the per-thread state holding the OP_TYPE-specific processor
	template <ParquetMetadataOperatorType OP_TYPE>
	static unique_ptr<LocalTableFunctionState> InitLocal(ExecutionContext &context, TableFunctionInitInput &input,
	                                                     GlobalTableFunctionState *global_state);
	//! Main scan function: emits metadata rows into the output chunk
	static void Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output);
	//! Scan progress, as the fraction of files handed out
	static double Progress(ClientContext &context, const FunctionData *bind_data_p,
	                       const GlobalTableFunctionState *global_state);

	//! Fill in the output column names/types for OP_TYPE
	template <ParquetMetadataOperatorType OP_TYPE>
	static void BindSchema(vector<LogicalType> &return_types, vector<string> &names);

	static OperatorPartitionData GetPartitionData(ClientContext &context, TableFunctionGetPartitionInput &input);
};
91 | | |
92 | | struct ParquetMetadataGlobalState : public GlobalTableFunctionState { |
93 | | ParquetMetadataGlobalState(unique_ptr<ParquetMetadataFilePaths> file_paths_p, ClientContext &context) |
94 | 0 | : file_paths(std::move(file_paths_p)) { |
95 | 0 | auto expand_result = file_paths->GetExpandResult(); |
96 | 0 | if (expand_result == FileExpandResult::MULTIPLE_FILES) { |
97 | 0 | max_threads = TaskScheduler::GetScheduler(context).NumberOfThreads(); |
98 | 0 | } else { |
99 | 0 | max_threads = 1; |
100 | 0 | } |
101 | 0 | } |
102 | | |
103 | 0 | idx_t MaxThreads() const override { |
104 | 0 | return max_threads; |
105 | 0 | } |
106 | | |
107 | 0 | bool NextFile(ClientContext &context, OpenFileInfo &result) { |
108 | 0 | return file_paths->NextFile(result); |
109 | 0 | } |
110 | | |
111 | 0 | double GetProgress() const { |
112 | | // Not the most accurate, instantly assumes all files are done and equal |
113 | 0 | unique_lock<mutex> lock(file_paths->file_lock); |
114 | 0 | return static_cast<double>(file_paths->scan_data.current_file_idx) / |
115 | 0 | static_cast<double>(file_paths->file_list->GetTotalFileCount()); |
116 | 0 | } |
117 | | |
118 | | unique_ptr<ParquetMetadataFilePaths> file_paths; |
119 | | idx_t max_threads; |
120 | | mutex lock; |
121 | | idx_t current_file = 0; |
122 | | }; |
123 | | |
//! Per-thread scan state: the reader and processor for the file currently being scanned.
struct ParquetMetadataLocalState : public LocalTableFunctionState {
	unique_ptr<ParquetReader> reader;
	unique_ptr<ParquetMetadataFileProcessor> processor;
	// True when no file is open or the current file has been fully emitted
	bool file_exhausted = true;
	// Next row (within the current file) to emit
	idx_t row_idx = 0;
	// Total rows the current file will produce
	idx_t total_rows = 0;
	// Index of the current file within the file list (unset before the first file)
	optional_idx file_idx;

	//! Open the given file and reset the per-file cursor.
	//! The processor must already be set; it is initialized against the new reader
	//! before TotalRowCount is queried.
	void Initialize(ClientContext &context, OpenFileInfo &file_info, idx_t next_file_idx) {
		ParquetOptions parquet_options(context);
		reader = make_uniq<ParquetReader>(context, file_info, parquet_options);
		processor->Initialize(context, *reader);
		total_rows = processor->TotalRowCount(*reader);
		row_idx = 0;
		file_idx = next_file_idx;
		file_exhausted = false;
	}
};
142 | | |
//! Stringify a thrift-generated parquet element (enum, algorithm, ...) via its
//! stream output operator.
template <class T>
static string ConvertParquetElementToString(T &&entry) {
	duckdb::stringstream ss;
	ss << entry;
	return ss.str();
}
149 | | |
//! Stringify a thrift-generated parquet struct via its generated printTo method
//! (used for logical type payloads, which have no stream operator).
template <class T>
static string PrintParquetElementToString(T &&entry) {
	duckdb::stringstream ss;
	entry.printTo(ss);
	return ss.str();
}
156 | | |
157 | | template <class T> |
158 | 0 | static Value ParquetElementString(T &&value, bool is_set) { |
159 | 0 | if (!is_set) { |
160 | 0 | return Value(); |
161 | 0 | } |
162 | 0 | return Value(ConvertParquetElementToString(value)); |
163 | 0 | } Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::Type::type const&>(duckdb_parquet::Type::type const&, bool) Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::FieldRepetitionType::type const&>(duckdb_parquet::FieldRepetitionType::type const&, bool) Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::ConvertedType::type const&>(duckdb_parquet::ConvertedType::type const&, bool) Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::EncryptionAlgorithm const&>(duckdb_parquet::EncryptionAlgorithm const&, bool) |
164 | | |
165 | 0 | static Value ParquetElementStringVal(const string &value, bool is_set) { |
166 | 0 | if (!is_set) { |
167 | 0 | return Value(); |
168 | 0 | } |
169 | 0 | return Value(value); |
170 | 0 | } |
171 | | |
172 | | template <class T> |
173 | 0 | static Value ParquetElementInteger(T &&value, bool is_iset) { |
174 | 0 | if (!is_iset) { |
175 | 0 | return Value(); |
176 | 0 | } |
177 | 0 | return Value::INTEGER(value); |
178 | 0 | } |
179 | | |
180 | | template <class T> |
181 | 0 | static Value ParquetElementBigint(T &&value, bool is_iset) { |
182 | 0 | if (!is_iset) { |
183 | 0 | return Value(); |
184 | 0 | } |
185 | 0 | return Value::BIGINT(value); |
186 | 0 | } Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementBigint<long const&>(long const&, bool) Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementBigint<int const&>(int const&, bool) |
187 | | |
188 | 0 | static Value ParquetElementBoolean(bool value, bool is_iset) { |
189 | 0 | if (!is_iset) { |
190 | 0 | return Value(); |
191 | 0 | } |
192 | 0 | return Value::BOOLEAN(value); |
193 | 0 | } |
194 | | |
195 | | //===--------------------------------------------------------------------===// |
196 | | // Row Group Meta Data |
197 | | //===--------------------------------------------------------------------===// |
198 | | |
//! Processor for parquet_metadata(): emits one row per (row group, leaf column).
class ParquetRowGroupMetadataProcessor : public ParquetMetadataFileProcessor {
public:
	void InitializeInternal(ClientContext &context, ParquetReader &reader) override;
	idx_t TotalRowCount(ParquetReader &reader) override;
	void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;

private:
	//! Derived logical type per leaf column, in schema order (filled in InitializeInternal)
	vector<ParquetColumnSchema> column_schemas;
};
208 | | |
209 | | template <> |
210 | | void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::META_DATA>(vector<LogicalType> &return_types, |
211 | 0 | vector<string> &names) { |
212 | 0 | names.emplace_back("file_name"); |
213 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
214 | |
|
215 | 0 | names.emplace_back("row_group_id"); |
216 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
217 | |
|
218 | 0 | names.emplace_back("row_group_num_rows"); |
219 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
220 | |
|
221 | 0 | names.emplace_back("row_group_num_columns"); |
222 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
223 | |
|
224 | 0 | names.emplace_back("row_group_bytes"); |
225 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
226 | |
|
227 | 0 | names.emplace_back("column_id"); |
228 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
229 | |
|
230 | 0 | names.emplace_back("file_offset"); |
231 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
232 | |
|
233 | 0 | names.emplace_back("num_values"); |
234 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
235 | |
|
236 | 0 | names.emplace_back("path_in_schema"); |
237 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
238 | |
|
239 | 0 | names.emplace_back("type"); |
240 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
241 | |
|
242 | 0 | names.emplace_back("stats_min"); |
243 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
244 | |
|
245 | 0 | names.emplace_back("stats_max"); |
246 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
247 | |
|
248 | 0 | names.emplace_back("stats_null_count"); |
249 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
250 | |
|
251 | 0 | names.emplace_back("stats_distinct_count"); |
252 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
253 | |
|
254 | 0 | names.emplace_back("stats_min_value"); |
255 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
256 | |
|
257 | 0 | names.emplace_back("stats_max_value"); |
258 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
259 | |
|
260 | 0 | names.emplace_back("compression"); |
261 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
262 | |
|
263 | 0 | names.emplace_back("encodings"); |
264 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
265 | |
|
266 | 0 | names.emplace_back("index_page_offset"); |
267 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
268 | |
|
269 | 0 | names.emplace_back("dictionary_page_offset"); |
270 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
271 | |
|
272 | 0 | names.emplace_back("data_page_offset"); |
273 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
274 | |
|
275 | 0 | names.emplace_back("total_compressed_size"); |
276 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
277 | |
|
278 | 0 | names.emplace_back("total_uncompressed_size"); |
279 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
280 | |
|
281 | 0 | names.emplace_back("key_value_metadata"); |
282 | 0 | return_types.emplace_back(LogicalType::MAP(LogicalType::BLOB, LogicalType::BLOB)); |
283 | |
|
284 | 0 | names.emplace_back("bloom_filter_offset"); |
285 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
286 | |
|
287 | 0 | names.emplace_back("bloom_filter_length"); |
288 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
289 | |
|
290 | 0 | names.emplace_back("min_is_exact"); |
291 | 0 | return_types.emplace_back(LogicalType::BOOLEAN); |
292 | |
|
293 | 0 | names.emplace_back("max_is_exact"); |
294 | 0 | return_types.emplace_back(LogicalType::BOOLEAN); |
295 | |
|
296 | 0 | names.emplace_back("row_group_compressed_bytes"); |
297 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
298 | |
|
299 | 0 | names.emplace_back("geo_bbox"); |
300 | 0 | return_types.emplace_back(LogicalType::STRUCT({ |
301 | 0 | {"xmin", LogicalType::DOUBLE}, |
302 | 0 | {"xmax", LogicalType::DOUBLE}, |
303 | 0 | {"ymin", LogicalType::DOUBLE}, |
304 | 0 | {"ymax", LogicalType::DOUBLE}, |
305 | 0 | {"zmin", LogicalType::DOUBLE}, |
306 | 0 | {"zmax", LogicalType::DOUBLE}, |
307 | 0 | {"mmin", LogicalType::DOUBLE}, |
308 | 0 | {"mmax", LogicalType::DOUBLE}, |
309 | 0 | })); |
310 | |
|
311 | 0 | names.emplace_back("geo_types"); |
312 | 0 | return_types.emplace_back(LogicalType::LIST(LogicalType::VARCHAR)); |
313 | 0 | } |
314 | | |
315 | | static Value ConvertParquetStats(const LogicalType &type, const ParquetColumnSchema &schema_ele, bool stats_is_set, |
316 | 0 | const std::string &stats) { |
317 | 0 | if (!stats_is_set) { |
318 | 0 | return Value(LogicalType::VARCHAR); |
319 | 0 | } |
320 | 0 | return ParquetStatisticsUtils::ConvertValue(type, schema_ele, stats).DefaultCastAs(LogicalType::VARCHAR); |
321 | 0 | } |
322 | | |
323 | 0 | static Value ConvertParquetGeoStatsBBOX(const duckdb_parquet::GeospatialStatistics &stats) { |
324 | 0 | if (!stats.__isset.bbox) { |
325 | 0 | return Value(LogicalType::STRUCT({ |
326 | 0 | {"xmin", LogicalType::DOUBLE}, |
327 | 0 | {"xmax", LogicalType::DOUBLE}, |
328 | 0 | {"ymin", LogicalType::DOUBLE}, |
329 | 0 | {"ymax", LogicalType::DOUBLE}, |
330 | 0 | {"zmin", LogicalType::DOUBLE}, |
331 | 0 | {"zmax", LogicalType::DOUBLE}, |
332 | 0 | {"mmin", LogicalType::DOUBLE}, |
333 | 0 | {"mmax", LogicalType::DOUBLE}, |
334 | 0 | })); |
335 | 0 | } |
336 | | |
337 | 0 | return Value::STRUCT({ |
338 | 0 | {"xmin", Value::DOUBLE(stats.bbox.xmin)}, |
339 | 0 | {"xmax", Value::DOUBLE(stats.bbox.xmax)}, |
340 | 0 | {"ymin", Value::DOUBLE(stats.bbox.ymin)}, |
341 | 0 | {"ymax", Value::DOUBLE(stats.bbox.ymax)}, |
342 | 0 | {"zmin", stats.bbox.__isset.zmin ? Value::DOUBLE(stats.bbox.zmin) : Value(LogicalTypeId::DOUBLE)}, |
343 | 0 | {"zmax", stats.bbox.__isset.zmax ? Value::DOUBLE(stats.bbox.zmax) : Value(LogicalTypeId::DOUBLE)}, |
344 | 0 | {"mmin", stats.bbox.__isset.mmin ? Value::DOUBLE(stats.bbox.mmin) : Value(LogicalTypeId::DOUBLE)}, |
345 | 0 | {"mmax", stats.bbox.__isset.mmax ? Value::DOUBLE(stats.bbox.mmax) : Value(LogicalTypeId::DOUBLE)}, |
346 | 0 | }); |
347 | 0 | } |
348 | | |
349 | 0 | static Value ConvertParquetGeoStatsTypes(const duckdb_parquet::GeospatialStatistics &stats) { |
350 | 0 | if (!stats.__isset.geospatial_types) { |
351 | 0 | return Value(LogicalType::LIST(LogicalType::VARCHAR)); |
352 | 0 | } |
353 | 0 | vector<Value> types; |
354 | 0 | types.reserve(stats.geospatial_types.size()); |
355 | |
|
356 | 0 | GeometryTypeSet type_set = GeometryTypeSet::Empty(); |
357 | 0 | for (auto &type : stats.geospatial_types) { |
358 | 0 | const auto geom_type = (type % 1000); |
359 | 0 | const auto vert_type = (type / 1000); |
360 | 0 | if (geom_type < 1 || geom_type > 7) { |
361 | 0 | throw InvalidInputException("Unsupported geometry type in Parquet geo metadata"); |
362 | 0 | } |
363 | 0 | if (vert_type < 0 || vert_type > 3) { |
364 | 0 | throw InvalidInputException("Unsupported geometry vertex type in Parquet geo metadata"); |
365 | 0 | } |
366 | 0 | type_set.Add(static_cast<GeometryType>(geom_type), static_cast<VertexType>(vert_type)); |
367 | 0 | } |
368 | | |
369 | 0 | for (auto &type_name : type_set.ToString(true)) { |
370 | 0 | types.push_back(Value(type_name)); |
371 | 0 | } |
372 | 0 | return Value::LIST(LogicalType::VARCHAR, types); |
373 | 0 | } |
374 | | |
375 | 0 | void ParquetRowGroupMetadataProcessor::InitializeInternal(ClientContext &context, ParquetReader &reader) { |
376 | 0 | auto meta_data = reader.GetFileMetadata(); |
377 | 0 | column_schemas.clear(); |
378 | 0 | for (idx_t schema_idx = 0; schema_idx < meta_data->schema.size(); schema_idx++) { |
379 | 0 | auto &schema_element = meta_data->schema[schema_idx]; |
380 | 0 | if (schema_element.num_children > 0) { |
381 | 0 | continue; |
382 | 0 | } |
383 | 0 | ParquetColumnSchema column_schema; |
384 | 0 | column_schema.type = reader.DeriveLogicalType(schema_element, column_schema); |
385 | 0 | column_schemas.push_back(std::move(column_schema)); |
386 | 0 | } |
387 | 0 | } |
388 | | |
389 | 0 | idx_t ParquetRowGroupMetadataProcessor::TotalRowCount(ParquetReader &reader) { |
390 | 0 | auto meta_data = reader.GetFileMetadata(); |
391 | 0 | return meta_data->row_groups.size() * column_schemas.size(); |
392 | 0 | } |
393 | | |
//! Materialize one (row group, column) metadata row. row_idx is decoded as
//! row_group_idx * column_count + col_idx, mirroring TotalRowCount. The output
//! indices below are positional and must match BindSchema<META_DATA>.
void ParquetRowGroupMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
                                               ParquetReader &reader) {
	auto meta_data = reader.GetFileMetadata();
	idx_t col_idx = row_idx % column_schemas.size();
	idx_t row_group_idx = row_idx / column_schemas.size();

	auto &row_group = meta_data->row_groups[row_group_idx];

	auto &column = row_group.columns[col_idx];
	auto &column_schema = column_schemas[col_idx];
	auto &col_meta = column.meta_data;
	auto &stats = col_meta.statistics;
	auto &column_type = column_schema.type;

	// file_name
	output[0].get().SetValue(output_idx, reader.file.path);
	// row_group_id
	output[1].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group_idx)));
	// row_group_num_rows
	output[2].get().SetValue(output_idx, Value::BIGINT(row_group.num_rows));
	// row_group_num_columns
	output[3].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group.columns.size())));
	// row_group_bytes
	output[4].get().SetValue(output_idx, Value::BIGINT(row_group.total_byte_size));
	// column_id
	output[5].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(col_idx)));
	// file_offset (NULL when the thrift field is unset)
	output[6].get().SetValue(output_idx, ParquetElementBigint(column.file_offset, row_group.__isset.file_offset));
	// num_values
	output[7].get().SetValue(output_idx, Value::BIGINT(col_meta.num_values));
	// path_in_schema, rendered as a comma-separated path
	output[8].get().SetValue(output_idx, StringUtil::Join(col_meta.path_in_schema, ", "));
	// type
	output[9].get().SetValue(output_idx, ConvertParquetElementToString(col_meta.type));
	// stats_min (deprecated parquet field; NULL when unset)
	output[10].get().SetValue(output_idx,
	                          ConvertParquetStats(column_type, column_schema, stats.__isset.min, stats.min));
	// stats_max (deprecated parquet field; NULL when unset)
	output[11].get().SetValue(output_idx,
	                          ConvertParquetStats(column_type, column_schema, stats.__isset.max, stats.max));
	// stats_null_count
	output[12].get().SetValue(output_idx, ParquetElementBigint(stats.null_count, stats.__isset.null_count));
	// stats_distinct_count
	output[13].get().SetValue(output_idx, ParquetElementBigint(stats.distinct_count, stats.__isset.distinct_count));
	// stats_min_value
	output[14].get().SetValue(
	    output_idx, ConvertParquetStats(column_type, column_schema, stats.__isset.min_value, stats.min_value));
	// stats_max_value
	output[15].get().SetValue(
	    output_idx, ConvertParquetStats(column_type, column_schema, stats.__isset.max_value, stats.max_value));
	// compression
	output[16].get().SetValue(output_idx, ConvertParquetElementToString(col_meta.codec));
	// encodings, rendered as a comma-separated list
	vector<string> encoding_string;
	encoding_string.reserve(col_meta.encodings.size());
	for (auto &encoding : col_meta.encodings) {
		encoding_string.push_back(ConvertParquetElementToString(encoding));
	}
	output[17].get().SetValue(output_idx, Value(StringUtil::Join(encoding_string, ", ")));
	// index_page_offset
	output[18].get().SetValue(output_idx,
	                          ParquetElementBigint(col_meta.index_page_offset, col_meta.__isset.index_page_offset));
	// dictionary_page_offset
	output[19].get().SetValue(
	    output_idx, ParquetElementBigint(col_meta.dictionary_page_offset, col_meta.__isset.dictionary_page_offset));
	// data_page_offset
	output[20].get().SetValue(output_idx, Value::BIGINT(col_meta.data_page_offset));
	// total_compressed_size
	output[21].get().SetValue(output_idx, Value::BIGINT(col_meta.total_compressed_size));
	// total_uncompressed_size
	output[22].get().SetValue(output_idx, Value::BIGINT(col_meta.total_uncompressed_size));
	// key_value_metadata, as a BLOB->BLOB map (keys/values are not guaranteed UTF-8)
	vector<Value> map_keys, map_values;
	for (auto &entry : col_meta.key_value_metadata) {
		map_keys.push_back(Value::BLOB_RAW(entry.key));
		map_values.push_back(Value::BLOB_RAW(entry.value));
	}
	output[23].get().SetValue(
	    output_idx, Value::MAP(LogicalType::BLOB, LogicalType::BLOB, std::move(map_keys), std::move(map_values)));
	// bloom_filter_offset
	output[24].get().SetValue(output_idx,
	                          ParquetElementBigint(col_meta.bloom_filter_offset, col_meta.__isset.bloom_filter_offset));
	// bloom_filter_length
	output[25].get().SetValue(output_idx,
	                          ParquetElementBigint(col_meta.bloom_filter_length, col_meta.__isset.bloom_filter_length));
	// min_is_exact
	output[26].get().SetValue(output_idx,
	                          ParquetElementBoolean(stats.is_min_value_exact, stats.__isset.is_min_value_exact));
	// max_is_exact
	output[27].get().SetValue(output_idx,
	                          ParquetElementBoolean(stats.is_max_value_exact, stats.__isset.is_max_value_exact));
	// row_group_compressed_bytes
	output[28].get().SetValue(
	    output_idx, ParquetElementBigint(row_group.total_compressed_size, row_group.__isset.total_compressed_size));
	// geo_bbox, LogicalType::STRUCT(...)
	output[29].get().SetValue(output_idx, ConvertParquetGeoStatsBBOX(col_meta.geospatial_statistics));

	// geo_types, LogicalType::LIST(LogicalType::VARCHAR)
	output[30].get().SetValue(output_idx, ConvertParquetGeoStatsTypes(col_meta.geospatial_statistics));
}
494 | | |
495 | | //===--------------------------------------------------------------------===// |
496 | | // Schema Data |
497 | | //===--------------------------------------------------------------------===// |
498 | | |
//! Processor for parquet_schema(): emits one row per schema element
//! (including non-leaf group nodes).
class ParquetSchemaProcessor : public ParquetMetadataFileProcessor {
public:
	idx_t TotalRowCount(ParquetReader &reader) override;
	void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
};
504 | | |
505 | | template <> |
506 | | void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::SCHEMA>(vector<LogicalType> &return_types, |
507 | 0 | vector<string> &names) { |
508 | 0 | names.emplace_back("file_name"); |
509 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
510 | |
|
511 | 0 | names.emplace_back("name"); |
512 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
513 | |
|
514 | 0 | names.emplace_back("type"); |
515 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
516 | |
|
517 | 0 | names.emplace_back("type_length"); |
518 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
519 | |
|
520 | 0 | names.emplace_back("repetition_type"); |
521 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
522 | |
|
523 | 0 | names.emplace_back("num_children"); |
524 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
525 | |
|
526 | 0 | names.emplace_back("converted_type"); |
527 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
528 | |
|
529 | 0 | names.emplace_back("scale"); |
530 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
531 | |
|
532 | 0 | names.emplace_back("precision"); |
533 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
534 | |
|
535 | 0 | names.emplace_back("field_id"); |
536 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
537 | |
|
538 | 0 | names.emplace_back("logical_type"); |
539 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
540 | |
|
541 | 0 | names.emplace_back("duckdb_type"); |
542 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
543 | |
|
544 | 0 | names.emplace_back("column_id"); |
545 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
546 | 0 | } |
547 | | |
//! Render a parquet LogicalType union as a string. The thrift union sets exactly
//! one member; the checks below are first-match and return NULL when no member
//! (or an unrecognized one) is set, or when the logical type itself is unset.
static Value ParquetLogicalTypeToString(const duckdb_parquet::LogicalType &type, bool is_set) {
	if (!is_set) {
		return Value();
	}
	if (type.__isset.STRING) {
		return Value(PrintParquetElementToString(type.STRING));
	}
	if (type.__isset.MAP) {
		return Value(PrintParquetElementToString(type.MAP));
	}
	if (type.__isset.LIST) {
		return Value(PrintParquetElementToString(type.LIST));
	}
	if (type.__isset.ENUM) {
		return Value(PrintParquetElementToString(type.ENUM));
	}
	if (type.__isset.DECIMAL) {
		return Value(PrintParquetElementToString(type.DECIMAL));
	}
	if (type.__isset.DATE) {
		return Value(PrintParquetElementToString(type.DATE));
	}
	if (type.__isset.TIME) {
		return Value(PrintParquetElementToString(type.TIME));
	}
	if (type.__isset.TIMESTAMP) {
		return Value(PrintParquetElementToString(type.TIMESTAMP));
	}
	if (type.__isset.INTEGER) {
		return Value(PrintParquetElementToString(type.INTEGER));
	}
	if (type.__isset.UNKNOWN) {
		return Value(PrintParquetElementToString(type.UNKNOWN));
	}
	if (type.__isset.JSON) {
		return Value(PrintParquetElementToString(type.JSON));
	}
	if (type.__isset.BSON) {
		return Value(PrintParquetElementToString(type.BSON));
	}
	if (type.__isset.UUID) {
		return Value(PrintParquetElementToString(type.UUID));
	}
	if (type.__isset.FLOAT16) {
		return Value(PrintParquetElementToString(type.FLOAT16));
	}
	if (type.__isset.GEOMETRY) {
		return Value(PrintParquetElementToString(type.GEOMETRY));
	}
	if (type.__isset.GEOGRAPHY) {
		return Value(PrintParquetElementToString(type.GEOGRAPHY));
	}
	return Value();
}
602 | | |
603 | 0 | idx_t ParquetSchemaProcessor::TotalRowCount(ParquetReader &reader) { |
604 | 0 | return reader.GetFileMetadata()->schema.size(); |
605 | 0 | } |
606 | | |
607 | | void ParquetSchemaProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, |
608 | 0 | ParquetReader &reader) { |
609 | 0 | auto meta_data = reader.GetFileMetadata(); |
610 | 0 | const auto &column = meta_data->schema[row_idx]; |
611 | | |
612 | | // file_name |
613 | 0 | output[0].get().SetValue(output_idx, reader.file.path); |
614 | | // name |
615 | 0 | output[1].get().SetValue(output_idx, column.name); |
616 | | // type |
617 | 0 | output[2].get().SetValue(output_idx, ParquetElementString(column.type, column.__isset.type)); |
618 | | // type_length |
619 | 0 | output[3].get().SetValue(output_idx, ParquetElementInteger(column.type_length, column.__isset.type_length)); |
620 | | // repetition_type |
621 | 0 | output[4].get().SetValue(output_idx, ParquetElementString(column.repetition_type, column.__isset.repetition_type)); |
622 | | // num_children |
623 | 0 | output[5].get().SetValue(output_idx, ParquetElementBigint(column.num_children, column.__isset.num_children)); |
624 | | // converted_type |
625 | 0 | output[6].get().SetValue(output_idx, ParquetElementString(column.converted_type, column.__isset.converted_type)); |
626 | | // scale |
627 | 0 | output[7].get().SetValue(output_idx, ParquetElementBigint(column.scale, column.__isset.scale)); |
628 | | // precision |
629 | 0 | output[8].get().SetValue(output_idx, ParquetElementBigint(column.precision, column.__isset.precision)); |
630 | | // field_id |
631 | 0 | output[9].get().SetValue(output_idx, ParquetElementBigint(column.field_id, column.__isset.field_id)); |
632 | | // logical_type |
633 | 0 | output[10].get().SetValue(output_idx, ParquetLogicalTypeToString(column.logicalType, column.__isset.logicalType)); |
634 | | // duckdb_type |
635 | 0 | ParquetColumnSchema column_schema; |
636 | 0 | Value duckdb_type; |
637 | 0 | if (column.__isset.type) { |
638 | 0 | duckdb_type = reader.DeriveLogicalType(column, column_schema).ToString(); |
639 | 0 | } |
640 | 0 | output[11].get().SetValue(output_idx, duckdb_type); |
641 | | // column_id |
642 | 0 | output[12].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_idx))); |
643 | 0 | } |
644 | | |
645 | | //===--------------------------------------------------------------------===// |
646 | | // KV Meta Data |
647 | | //===--------------------------------------------------------------------===// |
648 | | |
//! Processor backing parquet_kv_metadata(): emits one row per key/value pair
//! stored in the parquet file footer.
class ParquetKeyValueMetadataProcessor : public ParquetMetadataFileProcessor {
public:
	idx_t TotalRowCount(ParquetReader &reader) override;
	void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
};
654 | | |
655 | | template <> |
656 | | void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>( |
657 | 0 | vector<LogicalType> &return_types, vector<string> &names) { |
658 | 0 | names.emplace_back("file_name"); |
659 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
660 | |
|
661 | 0 | names.emplace_back("key"); |
662 | 0 | return_types.emplace_back(LogicalType::BLOB); |
663 | |
|
664 | 0 | names.emplace_back("value"); |
665 | 0 | return_types.emplace_back(LogicalType::BLOB); |
666 | 0 | } |
667 | | |
668 | 0 | idx_t ParquetKeyValueMetadataProcessor::TotalRowCount(ParquetReader &reader) { |
669 | 0 | return reader.GetFileMetadata()->key_value_metadata.size(); |
670 | 0 | } |
671 | | |
672 | | void ParquetKeyValueMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, |
673 | 0 | ParquetReader &reader) { |
674 | 0 | auto meta_data = reader.GetFileMetadata(); |
675 | 0 | auto &entry = meta_data->key_value_metadata[row_idx]; |
676 | |
|
677 | 0 | output[0].get().SetValue(output_idx, Value(reader.file.path)); |
678 | 0 | output[1].get().SetValue(output_idx, Value::BLOB_RAW(entry.key)); |
679 | 0 | output[2].get().SetValue(output_idx, Value::BLOB_RAW(entry.value)); |
680 | 0 | } |
681 | | |
682 | | //===--------------------------------------------------------------------===// |
683 | | // File Meta Data |
684 | | //===--------------------------------------------------------------------===// |
685 | | |
//! Processor backing parquet_file_metadata(): emits a single row of
//! footer-level information (row counts, version, encryption, sizes) per file.
class ParquetFileMetadataProcessor : public ParquetMetadataFileProcessor {
public:
	idx_t TotalRowCount(ParquetReader &reader) override;
	void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
};
691 | | |
692 | | template <> |
693 | | void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FILE_META_DATA>(vector<LogicalType> &return_types, |
694 | 0 | vector<string> &names) { |
695 | 0 | names.emplace_back("file_name"); |
696 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
697 | |
|
698 | 0 | names.emplace_back("created_by"); |
699 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
700 | |
|
701 | 0 | names.emplace_back("num_rows"); |
702 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
703 | |
|
704 | 0 | names.emplace_back("num_row_groups"); |
705 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
706 | |
|
707 | 0 | names.emplace_back("format_version"); |
708 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
709 | |
|
710 | 0 | names.emplace_back("encryption_algorithm"); |
711 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
712 | |
|
713 | 0 | names.emplace_back("footer_signing_key_metadata"); |
714 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
715 | |
|
716 | 0 | names.emplace_back("file_size_bytes"); |
717 | 0 | return_types.emplace_back(LogicalType::UBIGINT); |
718 | |
|
719 | 0 | names.emplace_back("footer_size"); |
720 | 0 | return_types.emplace_back(LogicalType::UBIGINT); |
721 | |
|
722 | 0 | names.emplace_back("column_orders"); |
723 | 0 | return_types.emplace_back(LogicalType::LIST(LogicalType::VARCHAR)); |
724 | 0 | } |
725 | | |
idx_t ParquetFileMetadataProcessor::TotalRowCount(ParquetReader &reader) {
	// File-level metadata is emitted as exactly one row per parquet file.
	return 1;
}
729 | | |
730 | | void ParquetFileMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, |
731 | 0 | ParquetReader &reader) { |
732 | 0 | auto meta_data = reader.GetFileMetadata(); |
733 | | |
734 | | // file_name |
735 | 0 | output[0].get().SetValue(output_idx, Value(reader.file.path)); |
736 | | // created_by |
737 | 0 | output[1].get().SetValue(output_idx, ParquetElementStringVal(meta_data->created_by, meta_data->__isset.created_by)); |
738 | | // num_rows |
739 | 0 | output[2].get().SetValue(output_idx, Value::BIGINT(meta_data->num_rows)); |
740 | | // num_row_groups |
741 | 0 | output[3].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(meta_data->row_groups.size()))); |
742 | | // format_version |
743 | 0 | output[4].get().SetValue(output_idx, Value::BIGINT(meta_data->version)); |
744 | | // encryption_algorithm |
745 | 0 | output[5].get().SetValue( |
746 | 0 | output_idx, ParquetElementString(meta_data->encryption_algorithm, meta_data->__isset.encryption_algorithm)); |
747 | | // footer_signing_key_metadata |
748 | 0 | output[6].get().SetValue(output_idx, ParquetElementStringVal(meta_data->footer_signing_key_metadata, |
749 | 0 | meta_data->__isset.footer_signing_key_metadata)); |
750 | | // file_size_bytes |
751 | 0 | output[7].get().SetValue(output_idx, Value::UBIGINT(reader.GetHandle().GetFileSize())); |
752 | | // footer_size |
753 | 0 | output[8].get().SetValue(output_idx, Value::UBIGINT(reader.metadata->footer_size)); |
754 | | // column_orders |
755 | 0 | Value column_orders_value; |
756 | 0 | if (meta_data->__isset.column_orders) { |
757 | 0 | vector<Value> column_orders; |
758 | 0 | column_orders.reserve(meta_data->column_orders.size()); |
759 | 0 | for (auto &column_order : meta_data->column_orders) { |
760 | 0 | column_orders.push_back(Value(ConvertParquetElementToString(column_order))); |
761 | 0 | } |
762 | 0 | column_orders_value = Value::LIST(LogicalType::VARCHAR, column_orders); |
763 | 0 | } |
764 | 0 | output[9].get().SetValue(output_idx, column_orders_value); |
765 | 0 | } |
766 | | |
767 | | //===--------------------------------------------------------------------===// |
768 | | // Bloom Probe |
769 | | //===--------------------------------------------------------------------===// |
770 | | |
//! Processor backing parquet_bloom_probe(): for every row group, tests whether
//! the bloom filter of the probe column can exclude the probe constant.
class ParquetBloomProbeProcessor : public ParquetMetadataFileProcessor {
public:
	ParquetBloomProbeProcessor(const string &probe_column, const Value &probe_value);

	void InitializeInternal(ClientContext &context, ParquetReader &reader) override;
	idx_t TotalRowCount(ParquetReader &reader) override;
	void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;

private:
	// Name of the column whose bloom filters are probed
	string probe_column_name;
	// The constant to probe for (cast to the column type during initialization)
	Value probe_constant;
	// Index of the probe column within the reader's column list; resolved per file
	optional_idx probe_column_idx;

	// Thrift protocol over the current file, used to read bloom filter data
	unique_ptr<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>> protocol;
	optional_ptr<Allocator> allocator;
	// Equality filter (probe_column = probe_constant) handed to the bloom filter check
	unique_ptr<ConstantFilter> filter;
};
788 | | |
789 | | template <> |
790 | | void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::BLOOM_PROBE>(vector<LogicalType> &return_types, |
791 | 0 | vector<string> &names) { |
792 | 0 | names.emplace_back("file_name"); |
793 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
794 | |
|
795 | 0 | names.emplace_back("row_group_id"); |
796 | 0 | return_types.emplace_back(LogicalType::BIGINT); |
797 | |
|
798 | 0 | names.emplace_back("bloom_filter_excludes"); |
799 | 0 | return_types.emplace_back(LogicalType::BOOLEAN); |
800 | 0 | } |
801 | | |
// Stores the probe target; the column index and filter are resolved per file in
// InitializeInternal.
ParquetBloomProbeProcessor::ParquetBloomProbeProcessor(const string &probe_column, const Value &probe_value)
    : probe_column_name(probe_column), probe_constant(probe_value) {
}
805 | | |
806 | 0 | void ParquetBloomProbeProcessor::InitializeInternal(ClientContext &context, ParquetReader &reader) { |
807 | 0 | probe_column_idx = optional_idx::Invalid(); |
808 | |
|
809 | 0 | for (idx_t column_idx = 0; column_idx < reader.columns.size(); column_idx++) { |
810 | 0 | if (reader.columns[column_idx].name == probe_column_name) { |
811 | 0 | probe_column_idx = column_idx; |
812 | 0 | break; |
813 | 0 | } |
814 | 0 | } |
815 | |
|
816 | 0 | if (!probe_column_idx.IsValid()) { |
817 | 0 | throw InvalidInputException("Column %s not found in %s", probe_column_name, reader.file.path); |
818 | 0 | } |
819 | | |
820 | 0 | auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(reader.GetHandle(), false); |
821 | 0 | protocol = make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport)); |
822 | 0 | allocator = &BufferAllocator::Get(context); |
823 | 0 | filter = make_uniq<ConstantFilter>( |
824 | 0 | ExpressionType::COMPARE_EQUAL, |
825 | 0 | probe_constant.CastAs(context, reader.GetColumns()[probe_column_idx.GetIndex()].type)); |
826 | 0 | } |
827 | | |
828 | 0 | idx_t ParquetBloomProbeProcessor::TotalRowCount(ParquetReader &reader) { |
829 | 0 | return reader.GetFileMetadata()->row_groups.size(); |
830 | 0 | } |
831 | | |
832 | | void ParquetBloomProbeProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, |
833 | 0 | ParquetReader &reader) { |
834 | 0 | auto meta_data = reader.GetFileMetadata(); |
835 | 0 | auto &row_group = meta_data->row_groups[row_idx]; |
836 | 0 | auto &column = row_group.columns[probe_column_idx.GetIndex()]; |
837 | |
|
838 | 0 | D_ASSERT(!probe_constant.IsNull()); |
839 | |
|
840 | 0 | auto bloom_excludes = ParquetStatisticsUtils::BloomFilterExcludes(*filter, column.meta_data, *protocol, *allocator); |
841 | |
|
842 | 0 | output[0].get().SetValue(output_idx, Value(reader.file.path)); |
843 | 0 | output[1].get().SetValue(output_idx, Value::BIGINT(NumericCast<int64_t>(row_idx))); |
844 | 0 | output[2].get().SetValue(output_idx, Value::BOOLEAN(bloom_excludes)); |
845 | 0 | } |
846 | | |
847 | | //===--------------------------------------------------------------------===// |
848 | | // Full Metadata |
849 | | //===--------------------------------------------------------------------===// |
850 | | |
//! Processor backing the combined full-metadata table function: emits one row
//! per file where each column is a LIST(STRUCT) holding the complete output of
//! one of the other metadata processors.
class FullMetadataProcessor : public ParquetMetadataFileProcessor {
public:
	FullMetadataProcessor() = default;

	void InitializeInternal(ClientContext &context, ParquetReader &reader) override;
	idx_t TotalRowCount(ParquetReader &reader) override;
	void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
	// Each file produces exactly one (potentially large) row; flush after every
	// file so list child vectors are not resized/overwritten across files.
	bool ForceFlush() override {
		return true;
	}

private:
	// Runs `processor` over the whole file and materializes its rows as one
	// list entry at `output_idx` of the LIST(STRUCT) vector `output`.
	void PopulateMetadata(ParquetMetadataFileProcessor &processor, Vector &output, idx_t output_idx,
	                      ParquetReader &reader);

	// Sub-processors whose outputs are nested into the four list columns
	ParquetFileMetadataProcessor file_processor;
	ParquetRowGroupMetadataProcessor row_group_processor;
	ParquetSchemaProcessor schema_processor;
	ParquetKeyValueMetadataProcessor kv_processor;
};
871 | | |
// Materializes all rows of `processor` for the current file into one list entry
// of the LIST(STRUCT) vector `output`, at list position `output_idx`.
// The struct children are handed to the sub-processor's ReadRow as if they were
// top-level output vectors.
void FullMetadataProcessor::PopulateMetadata(ParquetMetadataFileProcessor &processor, Vector &output, idx_t output_idx,
                                             ParquetReader &reader) {
	auto count = processor.TotalRowCount(reader);
	auto *result_data = FlatVector::GetData<list_entry_t>(output);
	auto &result_struct = ListVector::GetEntry(output);
	auto &result_struct_entries = StructVector::GetEntries(result_struct);

	// Size the list child to hold every row of the sub-processor's output
	ListVector::SetListSize(output, count);
	ListVector::Reserve(output, count);

	// The single list entry covers the entire child vector
	result_data[output_idx].offset = 0;
	result_data[output_idx].length = count;

	FlatVector::Validity(output).SetValid(output_idx);

	// Prepare the struct children as flat vectors and collect references so the
	// sub-processor can write into them row by row
	vector<reference<Vector>> vectors;
	for (auto &entry : result_struct_entries) {
		vectors.push_back(std::ref(entry));
		entry.SetVectorType(VectorType::FLAT_VECTOR);
		auto &validity = FlatVector::Validity(entry);
		validity.Initialize(count);
	}
	for (idx_t i = 0; i < count; i++) {
		processor.ReadRow(vectors, i, i, reader);
	}
}
898 | | |
899 | | template <> |
900 | | void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FULL_METADATA>(vector<LogicalType> &return_types, |
901 | 0 | vector<string> &names) { |
902 | 0 | names.emplace_back("parquet_file_metadata"); |
903 | 0 | vector<LogicalType> file_meta_types; |
904 | 0 | vector<string> file_meta_names; |
905 | 0 | ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FILE_META_DATA>(file_meta_types, file_meta_names); |
906 | 0 | child_list_t<LogicalType> file_meta_children; |
907 | 0 | for (idx_t i = 0; i < file_meta_types.size(); i++) { |
908 | 0 | file_meta_children.push_back(make_pair(file_meta_names[i], file_meta_types[i])); |
909 | 0 | } |
910 | 0 | return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(file_meta_children)))); |
911 | |
|
912 | 0 | names.emplace_back("parquet_metadata"); |
913 | 0 | vector<LogicalType> row_group_types; |
914 | 0 | vector<string> row_group_names; |
915 | 0 | ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::META_DATA>(row_group_types, row_group_names); |
916 | 0 | child_list_t<LogicalType> row_group_children; |
917 | 0 | for (idx_t i = 0; i < row_group_types.size(); i++) { |
918 | 0 | row_group_children.push_back(make_pair(row_group_names[i], row_group_types[i])); |
919 | 0 | } |
920 | 0 | return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(row_group_children)))); |
921 | |
|
922 | 0 | names.emplace_back("parquet_schema"); |
923 | 0 | vector<LogicalType> schema_types; |
924 | 0 | vector<string> schema_names; |
925 | 0 | ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::SCHEMA>(schema_types, schema_names); |
926 | 0 | child_list_t<LogicalType> schema_children; |
927 | 0 | for (idx_t i = 0; i < schema_types.size(); i++) { |
928 | 0 | schema_children.push_back(make_pair(schema_names[i], schema_types[i])); |
929 | 0 | } |
930 | 0 | return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(schema_children)))); |
931 | |
|
932 | 0 | names.emplace_back("parquet_kv_metadata"); |
933 | 0 | vector<LogicalType> kv_types; |
934 | 0 | vector<string> kv_names; |
935 | 0 | ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>(kv_types, kv_names); |
936 | 0 | child_list_t<LogicalType> kv_children; |
937 | 0 | for (idx_t i = 0; i < kv_types.size(); i++) { |
938 | 0 | kv_children.push_back(make_pair(kv_names[i], kv_types[i])); |
939 | 0 | } |
940 | 0 | return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(kv_children)))); |
941 | 0 | } |
942 | | |
// Per-file setup: initializes each nested sub-processor on the same reader.
void FullMetadataProcessor::InitializeInternal(ClientContext &context, ParquetReader &reader) {
	file_processor.Initialize(context, reader);
	row_group_processor.Initialize(context, reader);
	schema_processor.Initialize(context, reader);
	kv_processor.Initialize(context, reader);
}
949 | | |
idx_t FullMetadataProcessor::TotalRowCount(ParquetReader &reader) {
	// The combined metadata is emitted as a single row per file; the nested
	// lists carry the per-row-group/per-element detail.
	return 1;
}
953 | | |
// Fills the four LIST(STRUCT) output columns, one per sub-processor, in the
// same order as BindSchema<FULL_METADATA> declares them.
void FullMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
                                    ParquetReader &reader) {
	PopulateMetadata(file_processor, output[0].get(), output_idx, reader);
	PopulateMetadata(row_group_processor, output[1].get(), output_idx, reader);
	PopulateMetadata(schema_processor, output[2].get(), output_idx, reader);
	PopulateMetadata(kv_processor, output[3].get(), output_idx, reader);
}
961 | | |
962 | | //===--------------------------------------------------------------------===// |
963 | | // Template Function Implementation |
964 | | //===--------------------------------------------------------------------===// |
965 | | |
// Shared bind function for all parquet metadata table functions.
// Resolves the file list from the first argument, builds operator-specific bind
// data where needed (bloom probe), and binds the result schema for OP_TYPE.
template <ParquetMetadataOperatorType OP_TYPE>
unique_ptr<FunctionData> ParquetMetaDataOperator::Bind(ClientContext &context, TableFunctionBindInput &input,
                                                       vector<LogicalType> &return_types, vector<string> &names) {
	// Extract file paths from input using MultiFileReader (handles both single files and arrays)
	auto multi_file_reader = MultiFileReader::CreateDefault("ParquetMetadata");
	auto glob_input = FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "parquet");

	auto result = make_uniq<ParquetMetaDataBindData>();
	// Bind schema based on operation type
	if (OP_TYPE == ParquetMetadataOperatorType::BLOOM_PROBE) {
		// parquet_bloom_probe takes two extra arguments: the column name and
		// the constant to probe for; neither may be NULL
		auto probe_bind_data = make_uniq<ParquetBloomProbeBindData>();
		D_ASSERT(input.inputs.size() == 3);
		if (input.inputs[1].IsNull() || input.inputs[2].IsNull()) {
			throw InvalidInputException("Can't have NULL parameters for parquet_bloom_probe");
		}
		probe_bind_data->probe_column_name = input.inputs[1].CastAs(context, LogicalType::VARCHAR).GetValue<string>();
		probe_bind_data->probe_constant = input.inputs[2];
		// Replace the plain bind data with the derived probe bind data
		result = std::move(probe_bind_data);
	}

	result->file_paths = make_uniq<ParquetMetadataFilePaths>();
	result->file_paths->file_list = multi_file_reader->CreateFileList(context, input.inputs[0], glob_input);
	D_ASSERT(!result->file_paths->file_list->IsEmpty());
	result->file_paths->file_list->InitializeScan(result->file_paths->scan_data);

	BindSchema<OP_TYPE>(return_types, names);

	return std::move(result);
}
995 | | |
// Moves the file list out of the bind data into the shared global state.
// CastNoConst is needed because bind data is normally immutable after binding;
// after this the bind data's file_paths is empty.
unique_ptr<GlobalTableFunctionState> ParquetMetaDataOperator::InitGlobal(ClientContext &context,
                                                                        TableFunctionInitInput &input) {
	auto &bind_data = input.bind_data->CastNoConst<ParquetMetaDataBindData>();
	return make_uniq<ParquetMetadataGlobalState>(std::move(bind_data.file_paths), context);
}
1001 | | |
// Creates the per-thread local state with the processor implementation matching
// OP_TYPE. The bloom probe variant additionally pulls its probe target out of
// the derived bind data.
template <ParquetMetadataOperatorType OP_TYPE>
unique_ptr<LocalTableFunctionState> ParquetMetaDataOperator::InitLocal(ExecutionContext &context,
                                                                       TableFunctionInitInput &input,
                                                                       GlobalTableFunctionState *global_state) {
	auto &bind_data = input.bind_data->Cast<ParquetMetaDataBindData>();
	auto res = make_uniq<ParquetMetadataLocalState>();
	switch (OP_TYPE) {
	case ParquetMetadataOperatorType::META_DATA:
		res->processor = make_uniq<ParquetRowGroupMetadataProcessor>();
		break;
	case ParquetMetadataOperatorType::SCHEMA:
		res->processor = make_uniq<ParquetSchemaProcessor>();
		break;
	case ParquetMetadataOperatorType::KEY_VALUE_META_DATA:
		res->processor = make_uniq<ParquetKeyValueMetadataProcessor>();
		break;
	case ParquetMetadataOperatorType::FILE_META_DATA:
		res->processor = make_uniq<ParquetFileMetadataProcessor>();
		break;
	case ParquetMetadataOperatorType::BLOOM_PROBE: {
		// Only the bloom probe carries extra bind state (column name + constant)
		const auto &probe_bind_data = static_cast<const ParquetBloomProbeBindData &>(bind_data);
		res->processor =
		    make_uniq<ParquetBloomProbeProcessor>(probe_bind_data.probe_column_name, probe_bind_data.probe_constant);
		break;
	}
	case ParquetMetadataOperatorType::FULL_METADATA: {
		res->processor = make_uniq<FullMetadataProcessor>();
		break;
	}
	default:
		throw InternalException("Unsupported ParquetMetadataOperatorType");
	}
	return unique_ptr_cast<LocalTableFunctionState, ParquetMetadataLocalState>(std::move(res));
}
1036 | | |
// Main scan function shared by all metadata table functions. Fills `output`
// with up to STANDARD_VECTOR_SIZE rows, pulling rows from the current file's
// processor and advancing to the next file (under the global lock) as files
// are exhausted.
void ParquetMetaDataOperator::Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
	auto &global_state = data_p.global_state->Cast<ParquetMetadataGlobalState>();
	auto &local_state = data_p.local_state->Cast<ParquetMetadataLocalState>();

	idx_t output_count = 0;

	// Collect the output vectors once so processors can address columns by index
	vector<reference<Vector>> output_vectors;
	for (idx_t i = 0; i < output.ColumnCount(); i++) {
		output_vectors.push_back(std::ref(output.data[i]));
	}

	while (output_count < STANDARD_VECTOR_SIZE) {
		// Check if we need a new file
		if (local_state.file_exhausted) {
			if (output_count > 0) {
				// we already have rows - emit them first
				break;
			}
			idx_t next_file_idx;
			OpenFileInfo next_file;
			{
				// Claim the next file and its ordinal under the global lock
				lock_guard<mutex> guard(global_state.lock);
				if (!global_state.file_paths->NextFile(next_file)) {
					break; // No more files to process
				}
				next_file_idx = global_state.current_file++;
			}

			local_state.Initialize(context, next_file, next_file_idx);
		}

		// Emit as many rows as fit: the remainder of this file or the remainder
		// of the vector, whichever is smaller
		idx_t left_in_vector = STANDARD_VECTOR_SIZE - output_count;
		idx_t left_in_file = local_state.total_rows - local_state.row_idx;
		idx_t rows_to_output = 0;
		if (left_in_file <= left_in_vector) {
			local_state.file_exhausted = true;
			rows_to_output = left_in_file;
		} else {
			rows_to_output = left_in_vector;
		}

		output.SetCardinality(output_count + rows_to_output);

		for (idx_t i = 0; i < rows_to_output; ++i) {
			local_state.processor->ReadRow(output_vectors, output_count + i, local_state.row_idx + i,
			                               *local_state.reader);
		}
		output_count += rows_to_output;
		local_state.row_idx += rows_to_output;

		// Some processors (full metadata) require one chunk per file
		if (local_state.processor->ForceFlush()) {
			break;
		}
	}
}
1092 | | |
// Partitions output by the ordinal of the file currently being scanned, so all
// rows of one file land in the same partition.
OperatorPartitionData ParquetMetaDataOperator::GetPartitionData(ClientContext &context,
                                                                TableFunctionGetPartitionInput &input) {
	auto &local_state = input.local_state->Cast<ParquetMetadataLocalState>();
	return OperatorPartitionData(local_state.file_idx.GetIndex());
}
1098 | | |
// Reports scan progress as a percentage (0-100) based on the global state's
// fractional progress.
double ParquetMetaDataOperator::Progress(ClientContext &context, const FunctionData *bind_data_p,
                                         const GlobalTableFunctionState *global_state) {
	auto &global_data = global_state->Cast<ParquetMetadataGlobalState>();
	return global_data.GetProgress() * 100.0;
}
1104 | | |
1105 | | ParquetMetaDataFunction::ParquetMetaDataFunction() |
1106 | 8.91k | : TableFunction("parquet_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function, |
1107 | 8.91k | ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::META_DATA>, |
1108 | 8.91k | ParquetMetaDataOperator::InitGlobal, |
1109 | 8.91k | ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::META_DATA>) { |
1110 | 8.91k | table_scan_progress = ParquetMetaDataOperator::Progress; |
1111 | 8.91k | get_partition_data = ParquetMetaDataOperator::GetPartitionData; |
1112 | 8.91k | } |
1113 | | |
1114 | | ParquetSchemaFunction::ParquetSchemaFunction() |
1115 | 8.91k | : TableFunction("parquet_schema", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function, |
1116 | 8.91k | ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::SCHEMA>, |
1117 | 8.91k | ParquetMetaDataOperator::InitGlobal, |
1118 | 8.91k | ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::SCHEMA>) { |
1119 | 8.91k | table_scan_progress = ParquetMetaDataOperator::Progress; |
1120 | 8.91k | get_partition_data = ParquetMetaDataOperator::GetPartitionData; |
1121 | 8.91k | } |
1122 | | |
1123 | | ParquetKeyValueMetadataFunction::ParquetKeyValueMetadataFunction() |
1124 | 8.91k | : TableFunction("parquet_kv_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function, |
1125 | 8.91k | ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>, |
1126 | 8.91k | ParquetMetaDataOperator::InitGlobal, |
1127 | 8.91k | ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>) { |
1128 | 8.91k | table_scan_progress = ParquetMetaDataOperator::Progress; |
1129 | 8.91k | get_partition_data = ParquetMetaDataOperator::GetPartitionData; |
1130 | 8.91k | } |
1131 | | |
1132 | | ParquetFileMetadataFunction::ParquetFileMetadataFunction() |
1133 | 8.91k | : TableFunction("parquet_file_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function, |
1134 | 8.91k | ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::FILE_META_DATA>, |
1135 | 8.91k | ParquetMetaDataOperator::InitGlobal, |
1136 | 8.91k | ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::FILE_META_DATA>) { |
1137 | 8.91k | table_scan_progress = ParquetMetaDataOperator::Progress; |
1138 | 8.91k | get_partition_data = ParquetMetaDataOperator::GetPartitionData; |
1139 | 8.91k | } |
1140 | | |
1141 | | ParquetBloomProbeFunction::ParquetBloomProbeFunction() |
1142 | 8.91k | : TableFunction("parquet_bloom_probe", {LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::ANY}, |
1143 | 8.91k | ParquetMetaDataOperator::Function, |
1144 | 8.91k | ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::BLOOM_PROBE>, |
1145 | 8.91k | ParquetMetaDataOperator::InitGlobal, |
1146 | 8.91k | ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::BLOOM_PROBE>) { |
1147 | 8.91k | table_scan_progress = ParquetMetaDataOperator::Progress; |
1148 | 8.91k | get_partition_data = ParquetMetaDataOperator::GetPartitionData; |
1149 | 8.91k | } |
1150 | | |
1151 | | ParquetFullMetadataFunction::ParquetFullMetadataFunction() |
1152 | 8.91k | : TableFunction("parquet_full_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function, |
1153 | 8.91k | ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::FULL_METADATA>, |
1154 | 8.91k | ParquetMetaDataOperator::InitGlobal, |
1155 | 8.91k | ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::FULL_METADATA>) { |
1156 | 8.91k | table_scan_progress = ParquetMetaDataOperator::Progress; |
1157 | 8.91k | get_partition_data = ParquetMetaDataOperator::GetPartitionData; |
1158 | 8.91k | } |
1159 | | } // namespace duckdb |