Coverage Report

Created: 2026-03-31 07:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/extension/parquet/parquet_metadata.cpp
Line
Count
Source
1
#include "duckdb/common/vector/list_vector.hpp"
2
#include "duckdb/common/vector/map_vector.hpp"
3
#include "duckdb/common/vector/struct_vector.hpp"
4
#include "parquet_metadata.hpp"
5
6
#include "parquet_statistics.hpp"
7
8
#include <sstream>
9
10
#include "duckdb/common/multi_file/multi_file_reader.hpp"
11
#include "duckdb/common/types/blob.hpp"
12
#include "duckdb/planner/filter/constant_filter.hpp"
13
#include "duckdb/main/config.hpp"
14
#include "duckdb/common/multi_file/multi_file_list.hpp"
15
#include "parquet_reader.hpp"
16
#include "duckdb/common/numeric_utils.hpp"
17
18
namespace duckdb {
19
20
struct ParquetMetadataFilePaths {
21
  MultiFileListScanData scan_data;
22
  shared_ptr<MultiFileList> file_list;
23
  mutex file_lock;
24
25
0
  bool NextFile(OpenFileInfo &result) {
26
0
    D_ASSERT(file_list);
27
0
    unique_lock<mutex> lock(file_lock);
28
0
    return file_list->Scan(scan_data, result);
29
0
  }
30
31
0
  FileExpandResult GetExpandResult() {
32
0
    D_ASSERT(file_list);
33
0
    unique_lock<mutex> lock(file_lock);
34
0
    return file_list->GetExpandResult();
35
0
  }
36
};
37
38
struct ParquetMetaDataBindData : public TableFunctionData {
39
  unique_ptr<ParquetMetadataFilePaths> file_paths;
40
};
41
42
struct ParquetBloomProbeBindData : public ParquetMetaDataBindData {
43
  string probe_column_name;
44
  Value probe_constant;
45
};
46
47
enum class ParquetMetadataOperatorType : uint8_t {
48
  META_DATA,
49
  SCHEMA,
50
  KEY_VALUE_META_DATA,
51
  FILE_META_DATA,
52
  BLOOM_PROBE,
53
  FULL_METADATA
54
};
55
56
class ParquetMetadataFileProcessor {
57
public:
58
0
  ParquetMetadataFileProcessor() = default;
59
0
  virtual ~ParquetMetadataFileProcessor() = default;
60
0
  void Initialize(ClientContext &context, ParquetReader &reader) {
61
0
    InitializeInternal(context, reader);
62
0
  }
63
0
  virtual void InitializeInternal(ClientContext &context, ParquetReader &reader) {};
64
  virtual idx_t TotalRowCount(ParquetReader &reader) = 0;
65
  virtual void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) = 0;
66
0
  virtual bool ForceFlush() {
67
0
    return false;
68
0
  }
69
};
70
71
struct ParquetMetaDataBindData;
72
73
class ParquetMetaDataOperator {
74
public:
75
  template <ParquetMetadataOperatorType OP_TYPE>
76
  static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input,
77
                                       vector<LogicalType> &return_types, vector<string> &names);
78
  static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context, TableFunctionInitInput &input);
79
  template <ParquetMetadataOperatorType OP_TYPE>
80
  static unique_ptr<LocalTableFunctionState> InitLocal(ExecutionContext &context, TableFunctionInitInput &input,
81
                                                       GlobalTableFunctionState *global_state);
82
  static void Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output);
83
  static double Progress(ClientContext &context, const FunctionData *bind_data_p,
84
                         const GlobalTableFunctionState *global_state);
85
86
  template <ParquetMetadataOperatorType OP_TYPE>
87
  static void BindSchema(vector<LogicalType> &return_types, vector<string> &names);
88
89
  static OperatorPartitionData GetPartitionData(ClientContext &context, TableFunctionGetPartitionInput &input);
90
};
91
92
struct ParquetMetadataGlobalState : public GlobalTableFunctionState {
93
  ParquetMetadataGlobalState(unique_ptr<ParquetMetadataFilePaths> file_paths_p, ClientContext &context)
94
0
      : file_paths(std::move(file_paths_p)) {
95
0
    auto expand_result = file_paths->GetExpandResult();
96
0
    if (expand_result == FileExpandResult::MULTIPLE_FILES) {
97
0
      max_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
98
0
    } else {
99
0
      max_threads = 1;
100
0
    }
101
0
  }
102
103
0
  idx_t MaxThreads() const override {
104
0
    return max_threads;
105
0
  }
106
107
0
  bool NextFile(ClientContext &context, OpenFileInfo &result) {
108
0
    return file_paths->NextFile(result);
109
0
  }
110
111
0
  double GetProgress() const {
112
    // Not the most accurate, instantly assumes all files are done and equal
113
0
    unique_lock<mutex> lock(file_paths->file_lock);
114
0
    return static_cast<double>(file_paths->scan_data.current_file_idx) /
115
0
           static_cast<double>(file_paths->file_list->GetTotalFileCount());
116
0
  }
117
118
  unique_ptr<ParquetMetadataFilePaths> file_paths;
119
  idx_t max_threads;
120
  mutex lock;
121
  idx_t current_file = 0;
122
};
123
124
struct ParquetMetadataLocalState : public LocalTableFunctionState {
125
  unique_ptr<ParquetReader> reader;
126
  unique_ptr<ParquetMetadataFileProcessor> processor;
127
  bool file_exhausted = true;
128
  idx_t row_idx = 0;
129
  idx_t total_rows = 0;
130
  optional_idx file_idx;
131
132
0
  void Initialize(ClientContext &context, OpenFileInfo &file_info, idx_t next_file_idx) {
133
0
    ParquetOptions parquet_options(context);
134
0
    reader = make_uniq<ParquetReader>(context, file_info, parquet_options);
135
0
    processor->Initialize(context, *reader);
136
0
    total_rows = processor->TotalRowCount(*reader);
137
0
    row_idx = 0;
138
0
    file_idx = next_file_idx;
139
0
    file_exhausted = false;
140
0
  }
141
};
142
143
template <class T>
144
0
static string ConvertParquetElementToString(T &&entry) {
145
0
  duckdb::stringstream ss;
146
0
  ss << entry;
147
0
  return ss.str();
148
0
}
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::Type::type const&>(duckdb_parquet::Type::type const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::CompressionCodec::type const&>(duckdb_parquet::CompressionCodec::type const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::Encoding::type const&>(duckdb_parquet::Encoding::type const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::FieldRepetitionType::type const&>(duckdb_parquet::FieldRepetitionType::type const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::ConvertedType::type const&>(duckdb_parquet::ConvertedType::type const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::EncryptionAlgorithm const&>(duckdb_parquet::EncryptionAlgorithm const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::ConvertParquetElementToString<duckdb_parquet::ColumnOrder const&>(duckdb_parquet::ColumnOrder const&)
149
150
template <class T>
151
0
static string PrintParquetElementToString(T &&entry) {
152
0
  duckdb::stringstream ss;
153
0
  entry.printTo(ss);
154
0
  return ss.str();
155
0
}
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::StringType const&>(duckdb_parquet::StringType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::MapType const&>(duckdb_parquet::MapType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::ListType const&>(duckdb_parquet::ListType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::EnumType const&>(duckdb_parquet::EnumType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::DecimalType const&>(duckdb_parquet::DecimalType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::DateType const&>(duckdb_parquet::DateType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::TimeType const&>(duckdb_parquet::TimeType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::TimestampType const&>(duckdb_parquet::TimestampType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::IntType const&>(duckdb_parquet::IntType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::NullType const&>(duckdb_parquet::NullType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::JsonType const&>(duckdb_parquet::JsonType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::BsonType const&>(duckdb_parquet::BsonType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::UUIDType const&>(duckdb_parquet::UUIDType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::Float16Type const&>(duckdb_parquet::Float16Type const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::GeometryType const&>(duckdb_parquet::GeometryType const&)
Unexecuted instantiation: parquet_metadata.cpp:std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > duckdb::PrintParquetElementToString<duckdb_parquet::GeographyType const&>(duckdb_parquet::GeographyType const&)
156
157
template <class T>
158
0
static Value ParquetElementString(T &&value, bool is_set) {
159
0
  if (!is_set) {
160
0
    return Value();
161
0
  }
162
0
  return Value(ConvertParquetElementToString(value));
163
0
}
Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::Type::type const&>(duckdb_parquet::Type::type const&, bool)
Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::FieldRepetitionType::type const&>(duckdb_parquet::FieldRepetitionType::type const&, bool)
Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::ConvertedType::type const&>(duckdb_parquet::ConvertedType::type const&, bool)
Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementString<duckdb_parquet::EncryptionAlgorithm const&>(duckdb_parquet::EncryptionAlgorithm const&, bool)
164
165
0
static Value ParquetElementStringVal(const string &value, bool is_set) {
166
0
  if (!is_set) {
167
0
    return Value();
168
0
  }
169
0
  return Value(value);
170
0
}
171
172
template <class T>
173
0
static Value ParquetElementInteger(T &&value, bool is_iset) {
174
0
  if (!is_iset) {
175
0
    return Value();
176
0
  }
177
0
  return Value::INTEGER(value);
178
0
}
179
180
template <class T>
181
0
static Value ParquetElementBigint(T &&value, bool is_iset) {
182
0
  if (!is_iset) {
183
0
    return Value();
184
0
  }
185
0
  return Value::BIGINT(value);
186
0
}
Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementBigint<long const&>(long const&, bool)
Unexecuted instantiation: parquet_metadata.cpp:duckdb::Value duckdb::ParquetElementBigint<int const&>(int const&, bool)
187
188
0
static Value ParquetElementBoolean(bool value, bool is_iset) {
189
0
  if (!is_iset) {
190
0
    return Value();
191
0
  }
192
0
  return Value::BOOLEAN(value);
193
0
}
194
195
//===--------------------------------------------------------------------===//
196
// Row Group Meta Data
197
//===--------------------------------------------------------------------===//
198
199
class ParquetRowGroupMetadataProcessor : public ParquetMetadataFileProcessor {
200
public:
201
  void InitializeInternal(ClientContext &context, ParquetReader &reader) override;
202
  idx_t TotalRowCount(ParquetReader &reader) override;
203
  void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
204
205
private:
206
  vector<ParquetColumnSchema> column_schemas;
207
};
208
209
template <>
210
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::META_DATA>(vector<LogicalType> &return_types,
211
0
                                                                                 vector<string> &names) {
212
0
  names.emplace_back("file_name");
213
0
  return_types.emplace_back(LogicalType::VARCHAR);
214
215
0
  names.emplace_back("row_group_id");
216
0
  return_types.emplace_back(LogicalType::BIGINT);
217
218
0
  names.emplace_back("row_group_num_rows");
219
0
  return_types.emplace_back(LogicalType::BIGINT);
220
221
0
  names.emplace_back("row_group_num_columns");
222
0
  return_types.emplace_back(LogicalType::BIGINT);
223
224
0
  names.emplace_back("row_group_bytes");
225
0
  return_types.emplace_back(LogicalType::BIGINT);
226
227
0
  names.emplace_back("column_id");
228
0
  return_types.emplace_back(LogicalType::BIGINT);
229
230
0
  names.emplace_back("file_offset");
231
0
  return_types.emplace_back(LogicalType::BIGINT);
232
233
0
  names.emplace_back("num_values");
234
0
  return_types.emplace_back(LogicalType::BIGINT);
235
236
0
  names.emplace_back("path_in_schema");
237
0
  return_types.emplace_back(LogicalType::VARCHAR);
238
239
0
  names.emplace_back("type");
240
0
  return_types.emplace_back(LogicalType::VARCHAR);
241
242
0
  names.emplace_back("stats_min");
243
0
  return_types.emplace_back(LogicalType::VARCHAR);
244
245
0
  names.emplace_back("stats_max");
246
0
  return_types.emplace_back(LogicalType::VARCHAR);
247
248
0
  names.emplace_back("stats_null_count");
249
0
  return_types.emplace_back(LogicalType::BIGINT);
250
251
0
  names.emplace_back("stats_distinct_count");
252
0
  return_types.emplace_back(LogicalType::BIGINT);
253
254
0
  names.emplace_back("stats_min_value");
255
0
  return_types.emplace_back(LogicalType::VARCHAR);
256
257
0
  names.emplace_back("stats_max_value");
258
0
  return_types.emplace_back(LogicalType::VARCHAR);
259
260
0
  names.emplace_back("compression");
261
0
  return_types.emplace_back(LogicalType::VARCHAR);
262
263
0
  names.emplace_back("encodings");
264
0
  return_types.emplace_back(LogicalType::VARCHAR);
265
266
0
  names.emplace_back("index_page_offset");
267
0
  return_types.emplace_back(LogicalType::BIGINT);
268
269
0
  names.emplace_back("dictionary_page_offset");
270
0
  return_types.emplace_back(LogicalType::BIGINT);
271
272
0
  names.emplace_back("data_page_offset");
273
0
  return_types.emplace_back(LogicalType::BIGINT);
274
275
0
  names.emplace_back("total_compressed_size");
276
0
  return_types.emplace_back(LogicalType::BIGINT);
277
278
0
  names.emplace_back("total_uncompressed_size");
279
0
  return_types.emplace_back(LogicalType::BIGINT);
280
281
0
  names.emplace_back("key_value_metadata");
282
0
  return_types.emplace_back(LogicalType::MAP(LogicalType::BLOB, LogicalType::BLOB));
283
284
0
  names.emplace_back("bloom_filter_offset");
285
0
  return_types.emplace_back(LogicalType::BIGINT);
286
287
0
  names.emplace_back("bloom_filter_length");
288
0
  return_types.emplace_back(LogicalType::BIGINT);
289
290
0
  names.emplace_back("min_is_exact");
291
0
  return_types.emplace_back(LogicalType::BOOLEAN);
292
293
0
  names.emplace_back("max_is_exact");
294
0
  return_types.emplace_back(LogicalType::BOOLEAN);
295
296
0
  names.emplace_back("row_group_compressed_bytes");
297
0
  return_types.emplace_back(LogicalType::BIGINT);
298
299
0
  names.emplace_back("geo_bbox");
300
0
  return_types.emplace_back(LogicalType::STRUCT({
301
0
      {"xmin", LogicalType::DOUBLE},
302
0
      {"xmax", LogicalType::DOUBLE},
303
0
      {"ymin", LogicalType::DOUBLE},
304
0
      {"ymax", LogicalType::DOUBLE},
305
0
      {"zmin", LogicalType::DOUBLE},
306
0
      {"zmax", LogicalType::DOUBLE},
307
0
      {"mmin", LogicalType::DOUBLE},
308
0
      {"mmax", LogicalType::DOUBLE},
309
0
  }));
310
311
0
  names.emplace_back("geo_types");
312
0
  return_types.emplace_back(LogicalType::LIST(LogicalType::VARCHAR));
313
0
}
314
315
static Value ConvertParquetStats(const LogicalType &type, const ParquetColumnSchema &schema_ele, bool stats_is_set,
316
0
                                 const std::string &stats) {
317
0
  if (!stats_is_set) {
318
0
    return Value(LogicalType::VARCHAR);
319
0
  }
320
0
  return ParquetStatisticsUtils::ConvertValue(type, schema_ele, stats).DefaultCastAs(LogicalType::VARCHAR);
321
0
}
322
323
0
static Value ConvertParquetGeoStatsBBOX(const duckdb_parquet::GeospatialStatistics &stats) {
324
0
  if (!stats.__isset.bbox) {
325
0
    return Value(LogicalType::STRUCT({
326
0
        {"xmin", LogicalType::DOUBLE},
327
0
        {"xmax", LogicalType::DOUBLE},
328
0
        {"ymin", LogicalType::DOUBLE},
329
0
        {"ymax", LogicalType::DOUBLE},
330
0
        {"zmin", LogicalType::DOUBLE},
331
0
        {"zmax", LogicalType::DOUBLE},
332
0
        {"mmin", LogicalType::DOUBLE},
333
0
        {"mmax", LogicalType::DOUBLE},
334
0
    }));
335
0
  }
336
337
0
  return Value::STRUCT({
338
0
      {"xmin", Value::DOUBLE(stats.bbox.xmin)},
339
0
      {"xmax", Value::DOUBLE(stats.bbox.xmax)},
340
0
      {"ymin", Value::DOUBLE(stats.bbox.ymin)},
341
0
      {"ymax", Value::DOUBLE(stats.bbox.ymax)},
342
0
      {"zmin", stats.bbox.__isset.zmin ? Value::DOUBLE(stats.bbox.zmin) : Value(LogicalTypeId::DOUBLE)},
343
0
      {"zmax", stats.bbox.__isset.zmax ? Value::DOUBLE(stats.bbox.zmax) : Value(LogicalTypeId::DOUBLE)},
344
0
      {"mmin", stats.bbox.__isset.mmin ? Value::DOUBLE(stats.bbox.mmin) : Value(LogicalTypeId::DOUBLE)},
345
0
      {"mmax", stats.bbox.__isset.mmax ? Value::DOUBLE(stats.bbox.mmax) : Value(LogicalTypeId::DOUBLE)},
346
0
  });
347
0
}
348
349
0
static Value ConvertParquetGeoStatsTypes(const duckdb_parquet::GeospatialStatistics &stats) {
350
0
  if (!stats.__isset.geospatial_types) {
351
0
    return Value(LogicalType::LIST(LogicalType::VARCHAR));
352
0
  }
353
0
  vector<Value> types;
354
0
  types.reserve(stats.geospatial_types.size());
355
356
0
  GeometryTypeSet type_set = GeometryTypeSet::Empty();
357
0
  for (auto &type : stats.geospatial_types) {
358
0
    const auto geom_type = (type % 1000);
359
0
    const auto vert_type = (type / 1000);
360
0
    if (geom_type < 1 || geom_type > 7) {
361
0
      throw InvalidInputException("Unsupported geometry type in Parquet geo metadata");
362
0
    }
363
0
    if (vert_type < 0 || vert_type > 3) {
364
0
      throw InvalidInputException("Unsupported geometry vertex type in Parquet geo metadata");
365
0
    }
366
0
    type_set.Add(static_cast<GeometryType>(geom_type), static_cast<VertexType>(vert_type));
367
0
  }
368
369
0
  for (auto &type_name : type_set.ToString(true)) {
370
0
    types.push_back(Value(type_name));
371
0
  }
372
0
  return Value::LIST(LogicalType::VARCHAR, types);
373
0
}
374
375
0
void ParquetRowGroupMetadataProcessor::InitializeInternal(ClientContext &context, ParquetReader &reader) {
376
0
  auto meta_data = reader.GetFileMetadata();
377
0
  column_schemas.clear();
378
0
  for (idx_t schema_idx = 0; schema_idx < meta_data->schema.size(); schema_idx++) {
379
0
    auto &schema_element = meta_data->schema[schema_idx];
380
0
    if (schema_element.num_children > 0) {
381
0
      continue;
382
0
    }
383
0
    ParquetColumnSchema column_schema;
384
0
    column_schema.type = reader.DeriveLogicalType(schema_element, column_schema);
385
0
    column_schemas.push_back(std::move(column_schema));
386
0
  }
387
0
}
388
389
0
idx_t ParquetRowGroupMetadataProcessor::TotalRowCount(ParquetReader &reader) {
390
0
  auto meta_data = reader.GetFileMetadata();
391
0
  return meta_data->row_groups.size() * column_schemas.size();
392
0
}
393
394
void ParquetRowGroupMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
395
0
                                               ParquetReader &reader) {
396
0
  auto meta_data = reader.GetFileMetadata();
397
0
  idx_t col_idx = row_idx % column_schemas.size();
398
0
  idx_t row_group_idx = row_idx / column_schemas.size();
399
400
0
  auto &row_group = meta_data->row_groups[row_group_idx];
401
402
0
  auto &column = row_group.columns[col_idx];
403
0
  auto &column_schema = column_schemas[col_idx];
404
0
  auto &col_meta = column.meta_data;
405
0
  auto &stats = col_meta.statistics;
406
0
  auto &column_type = column_schema.type;
407
408
  // file_name
409
0
  output[0].get().SetValue(output_idx, reader.file.path);
410
  // row_group_id
411
0
  output[1].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group_idx)));
412
  // row_group_num_rows
413
0
  output[2].get().SetValue(output_idx, Value::BIGINT(row_group.num_rows));
414
  // row_group_num_columns
415
0
  output[3].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group.columns.size())));
416
  // row_group_bytes
417
0
  output[4].get().SetValue(output_idx, Value::BIGINT(row_group.total_byte_size));
418
  // column_id
419
0
  output[5].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(col_idx)));
420
  // file_offset
421
0
  output[6].get().SetValue(output_idx, ParquetElementBigint(column.file_offset, row_group.__isset.file_offset));
422
  // num_values
423
0
  output[7].get().SetValue(output_idx, Value::BIGINT(col_meta.num_values));
424
  // path_in_schema
425
0
  output[8].get().SetValue(output_idx, StringUtil::Join(col_meta.path_in_schema, ", "));
426
  // type
427
0
  output[9].get().SetValue(output_idx, ConvertParquetElementToString(col_meta.type));
428
  // stats_min
429
0
  output[10].get().SetValue(output_idx,
430
0
                            ConvertParquetStats(column_type, column_schema, stats.__isset.min, stats.min));
431
  // stats_max
432
0
  output[11].get().SetValue(output_idx,
433
0
                            ConvertParquetStats(column_type, column_schema, stats.__isset.max, stats.max));
434
  // stats_null_count
435
0
  output[12].get().SetValue(output_idx, ParquetElementBigint(stats.null_count, stats.__isset.null_count));
436
  // stats_distinct_count
437
0
  output[13].get().SetValue(output_idx, ParquetElementBigint(stats.distinct_count, stats.__isset.distinct_count));
438
  // stats_min_value
439
0
  output[14].get().SetValue(
440
0
      output_idx, ConvertParquetStats(column_type, column_schema, stats.__isset.min_value, stats.min_value));
441
  // stats_max_value
442
0
  output[15].get().SetValue(
443
0
      output_idx, ConvertParquetStats(column_type, column_schema, stats.__isset.max_value, stats.max_value));
444
  // compression
445
0
  output[16].get().SetValue(output_idx, ConvertParquetElementToString(col_meta.codec));
446
  // encodings
447
0
  vector<string> encoding_string;
448
0
  encoding_string.reserve(col_meta.encodings.size());
449
0
  for (auto &encoding : col_meta.encodings) {
450
0
    encoding_string.push_back(ConvertParquetElementToString(encoding));
451
0
  }
452
0
  output[17].get().SetValue(output_idx, Value(StringUtil::Join(encoding_string, ", ")));
453
  // index_page_offset
454
0
  output[18].get().SetValue(output_idx,
455
0
                            ParquetElementBigint(col_meta.index_page_offset, col_meta.__isset.index_page_offset));
456
  // dictionary_page_offset
457
0
  output[19].get().SetValue(
458
0
      output_idx, ParquetElementBigint(col_meta.dictionary_page_offset, col_meta.__isset.dictionary_page_offset));
459
  // data_page_offset
460
0
  output[20].get().SetValue(output_idx, Value::BIGINT(col_meta.data_page_offset));
461
  // total_compressed_size
462
0
  output[21].get().SetValue(output_idx, Value::BIGINT(col_meta.total_compressed_size));
463
  // total_uncompressed_size
464
0
  output[22].get().SetValue(output_idx, Value::BIGINT(col_meta.total_uncompressed_size));
465
  // key_value_metadata
466
0
  vector<Value> map_keys, map_values;
467
0
  for (auto &entry : col_meta.key_value_metadata) {
468
0
    map_keys.push_back(Value::BLOB_RAW(entry.key));
469
0
    map_values.push_back(Value::BLOB_RAW(entry.value));
470
0
  }
471
0
  output[23].get().SetValue(
472
0
      output_idx, Value::MAP(LogicalType::BLOB, LogicalType::BLOB, std::move(map_keys), std::move(map_values)));
473
  // bloom_filter_offset
474
0
  output[24].get().SetValue(output_idx,
475
0
                            ParquetElementBigint(col_meta.bloom_filter_offset, col_meta.__isset.bloom_filter_offset));
476
  // bloom_filter_length
477
0
  output[25].get().SetValue(output_idx,
478
0
                            ParquetElementBigint(col_meta.bloom_filter_length, col_meta.__isset.bloom_filter_length));
479
  // min_is_exact
480
0
  output[26].get().SetValue(output_idx,
481
0
                            ParquetElementBoolean(stats.is_min_value_exact, stats.__isset.is_min_value_exact));
482
  // max_is_exact
483
0
  output[27].get().SetValue(output_idx,
484
0
                            ParquetElementBoolean(stats.is_max_value_exact, stats.__isset.is_max_value_exact));
485
  // row_group_compressed_bytes
486
0
  output[28].get().SetValue(
487
0
      output_idx, ParquetElementBigint(row_group.total_compressed_size, row_group.__isset.total_compressed_size));
488
  // geo_stats_bbox, LogicalType::STRUCT(...)
489
0
  output[29].get().SetValue(output_idx, ConvertParquetGeoStatsBBOX(col_meta.geospatial_statistics));
490
491
  // geo_stats_types, LogicalType::LIST(LogicalType::VARCHAR)
492
0
  output[30].get().SetValue(output_idx, ConvertParquetGeoStatsTypes(col_meta.geospatial_statistics));
493
0
}
494
495
//===--------------------------------------------------------------------===//
496
// Schema Data
497
//===--------------------------------------------------------------------===//
498
499
class ParquetSchemaProcessor : public ParquetMetadataFileProcessor {
500
public:
501
  idx_t TotalRowCount(ParquetReader &reader) override;
502
  void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
503
};
504
505
template <>
506
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::SCHEMA>(vector<LogicalType> &return_types,
507
0
                                                                              vector<string> &names) {
508
0
  names.emplace_back("file_name");
509
0
  return_types.emplace_back(LogicalType::VARCHAR);
510
511
0
  names.emplace_back("name");
512
0
  return_types.emplace_back(LogicalType::VARCHAR);
513
514
0
  names.emplace_back("type");
515
0
  return_types.emplace_back(LogicalType::VARCHAR);
516
517
0
  names.emplace_back("type_length");
518
0
  return_types.emplace_back(LogicalType::VARCHAR);
519
520
0
  names.emplace_back("repetition_type");
521
0
  return_types.emplace_back(LogicalType::VARCHAR);
522
523
0
  names.emplace_back("num_children");
524
0
  return_types.emplace_back(LogicalType::BIGINT);
525
526
0
  names.emplace_back("converted_type");
527
0
  return_types.emplace_back(LogicalType::VARCHAR);
528
529
0
  names.emplace_back("scale");
530
0
  return_types.emplace_back(LogicalType::BIGINT);
531
532
0
  names.emplace_back("precision");
533
0
  return_types.emplace_back(LogicalType::BIGINT);
534
535
0
  names.emplace_back("field_id");
536
0
  return_types.emplace_back(LogicalType::BIGINT);
537
538
0
  names.emplace_back("logical_type");
539
0
  return_types.emplace_back(LogicalType::VARCHAR);
540
541
0
  names.emplace_back("duckdb_type");
542
0
  return_types.emplace_back(LogicalType::VARCHAR);
543
544
0
  names.emplace_back("column_id");
545
0
  return_types.emplace_back(LogicalType::BIGINT);
546
0
}
547
548
0
static Value ParquetLogicalTypeToString(const duckdb_parquet::LogicalType &type, bool is_set) {
549
0
  if (!is_set) {
550
0
    return Value();
551
0
  }
552
0
  if (type.__isset.STRING) {
553
0
    return Value(PrintParquetElementToString(type.STRING));
554
0
  }
555
0
  if (type.__isset.MAP) {
556
0
    return Value(PrintParquetElementToString(type.MAP));
557
0
  }
558
0
  if (type.__isset.LIST) {
559
0
    return Value(PrintParquetElementToString(type.LIST));
560
0
  }
561
0
  if (type.__isset.ENUM) {
562
0
    return Value(PrintParquetElementToString(type.ENUM));
563
0
  }
564
0
  if (type.__isset.DECIMAL) {
565
0
    return Value(PrintParquetElementToString(type.DECIMAL));
566
0
  }
567
0
  if (type.__isset.DATE) {
568
0
    return Value(PrintParquetElementToString(type.DATE));
569
0
  }
570
0
  if (type.__isset.TIME) {
571
0
    return Value(PrintParquetElementToString(type.TIME));
572
0
  }
573
0
  if (type.__isset.TIMESTAMP) {
574
0
    return Value(PrintParquetElementToString(type.TIMESTAMP));
575
0
  }
576
0
  if (type.__isset.INTEGER) {
577
0
    return Value(PrintParquetElementToString(type.INTEGER));
578
0
  }
579
0
  if (type.__isset.UNKNOWN) {
580
0
    return Value(PrintParquetElementToString(type.UNKNOWN));
581
0
  }
582
0
  if (type.__isset.JSON) {
583
0
    return Value(PrintParquetElementToString(type.JSON));
584
0
  }
585
0
  if (type.__isset.BSON) {
586
0
    return Value(PrintParquetElementToString(type.BSON));
587
0
  }
588
0
  if (type.__isset.UUID) {
589
0
    return Value(PrintParquetElementToString(type.UUID));
590
0
  }
591
0
  if (type.__isset.FLOAT16) {
592
0
    return Value(PrintParquetElementToString(type.FLOAT16));
593
0
  }
594
0
  if (type.__isset.GEOMETRY) {
595
0
    return Value(PrintParquetElementToString(type.GEOMETRY));
596
0
  }
597
0
  if (type.__isset.GEOGRAPHY) {
598
0
    return Value(PrintParquetElementToString(type.GEOGRAPHY));
599
0
  }
600
0
  return Value();
601
0
}
602
603
0
idx_t ParquetSchemaProcessor::TotalRowCount(ParquetReader &reader) {
604
0
  return reader.GetFileMetadata()->schema.size();
605
0
}
606
607
void ParquetSchemaProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
608
0
                                     ParquetReader &reader) {
609
0
  auto meta_data = reader.GetFileMetadata();
610
0
  const auto &column = meta_data->schema[row_idx];
611
612
  // file_name
613
0
  output[0].get().SetValue(output_idx, reader.file.path);
614
  // name
615
0
  output[1].get().SetValue(output_idx, column.name);
616
  // type
617
0
  output[2].get().SetValue(output_idx, ParquetElementString(column.type, column.__isset.type));
618
  // type_length
619
0
  output[3].get().SetValue(output_idx, ParquetElementInteger(column.type_length, column.__isset.type_length));
620
  // repetition_type
621
0
  output[4].get().SetValue(output_idx, ParquetElementString(column.repetition_type, column.__isset.repetition_type));
622
  // num_children
623
0
  output[5].get().SetValue(output_idx, ParquetElementBigint(column.num_children, column.__isset.num_children));
624
  // converted_type
625
0
  output[6].get().SetValue(output_idx, ParquetElementString(column.converted_type, column.__isset.converted_type));
626
  // scale
627
0
  output[7].get().SetValue(output_idx, ParquetElementBigint(column.scale, column.__isset.scale));
628
  // precision
629
0
  output[8].get().SetValue(output_idx, ParquetElementBigint(column.precision, column.__isset.precision));
630
  // field_id
631
0
  output[9].get().SetValue(output_idx, ParquetElementBigint(column.field_id, column.__isset.field_id));
632
  // logical_type
633
0
  output[10].get().SetValue(output_idx, ParquetLogicalTypeToString(column.logicalType, column.__isset.logicalType));
634
  // duckdb_type
635
0
  ParquetColumnSchema column_schema;
636
0
  Value duckdb_type;
637
0
  if (column.__isset.type) {
638
0
    duckdb_type = reader.DeriveLogicalType(column, column_schema).ToString();
639
0
  }
640
0
  output[11].get().SetValue(output_idx, duckdb_type);
641
  // column_id
642
0
  output[12].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(row_idx)));
643
0
}
644
645
//===--------------------------------------------------------------------===//
646
// KV Meta Data
647
//===--------------------------------------------------------------------===//
648
649
// Emits the footer's application-defined key/value metadata, one row per entry.
class ParquetKeyValueMetadataProcessor : public ParquetMetadataFileProcessor {
public:
  idx_t TotalRowCount(ParquetReader &reader) override;
  void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
};
654
655
template <>
656
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>(
657
0
    vector<LogicalType> &return_types, vector<string> &names) {
658
0
  names.emplace_back("file_name");
659
0
  return_types.emplace_back(LogicalType::VARCHAR);
660
661
0
  names.emplace_back("key");
662
0
  return_types.emplace_back(LogicalType::BLOB);
663
664
0
  names.emplace_back("value");
665
0
  return_types.emplace_back(LogicalType::BLOB);
666
0
}
667
668
0
// One output row per key/value entry stored in the file footer.
idx_t ParquetKeyValueMetadataProcessor::TotalRowCount(ParquetReader &reader) {
  auto &kv_entries = reader.GetFileMetadata()->key_value_metadata;
  return kv_entries.size();
}
671
672
void ParquetKeyValueMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
673
0
                                               ParquetReader &reader) {
674
0
  auto meta_data = reader.GetFileMetadata();
675
0
  auto &entry = meta_data->key_value_metadata[row_idx];
676
677
0
  output[0].get().SetValue(output_idx, Value(reader.file.path));
678
0
  output[1].get().SetValue(output_idx, Value::BLOB_RAW(entry.key));
679
0
  output[2].get().SetValue(output_idx, Value::BLOB_RAW(entry.value));
680
0
}
681
682
//===--------------------------------------------------------------------===//
683
// File Meta Data
684
//===--------------------------------------------------------------------===//
685
686
// Emits file-level footer metadata (created_by, row counts, encryption, ...);
// always produces exactly one row per file.
class ParquetFileMetadataProcessor : public ParquetMetadataFileProcessor {
public:
  idx_t TotalRowCount(ParquetReader &reader) override;
  void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
};
691
692
template <>
693
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FILE_META_DATA>(vector<LogicalType> &return_types,
694
0
                                                                                      vector<string> &names) {
695
0
  names.emplace_back("file_name");
696
0
  return_types.emplace_back(LogicalType::VARCHAR);
697
698
0
  names.emplace_back("created_by");
699
0
  return_types.emplace_back(LogicalType::VARCHAR);
700
701
0
  names.emplace_back("num_rows");
702
0
  return_types.emplace_back(LogicalType::BIGINT);
703
704
0
  names.emplace_back("num_row_groups");
705
0
  return_types.emplace_back(LogicalType::BIGINT);
706
707
0
  names.emplace_back("format_version");
708
0
  return_types.emplace_back(LogicalType::BIGINT);
709
710
0
  names.emplace_back("encryption_algorithm");
711
0
  return_types.emplace_back(LogicalType::VARCHAR);
712
713
0
  names.emplace_back("footer_signing_key_metadata");
714
0
  return_types.emplace_back(LogicalType::VARCHAR);
715
716
0
  names.emplace_back("file_size_bytes");
717
0
  return_types.emplace_back(LogicalType::UBIGINT);
718
719
0
  names.emplace_back("footer_size");
720
0
  return_types.emplace_back(LogicalType::UBIGINT);
721
722
0
  names.emplace_back("column_orders");
723
0
  return_types.emplace_back(LogicalType::LIST(LogicalType::VARCHAR));
724
0
}
725
726
0
// File-level metadata collapses each file into a single output row.
idx_t ParquetFileMetadataProcessor::TotalRowCount(ParquetReader &reader) {
  constexpr idx_t ROWS_PER_FILE = 1;
  return ROWS_PER_FILE;
}
729
730
// Fills the single file-level metadata row. row_idx is always 0 here (one row
// per file); column order must match BindSchema<FILE_META_DATA>.
void ParquetFileMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
                                           ParquetReader &reader) {
  auto meta_data = reader.GetFileMetadata();

  // file_name
  output[0].get().SetValue(output_idx, Value(reader.file.path));
  // created_by (optional footer field; NULL when unset)
  output[1].get().SetValue(output_idx, ParquetElementStringVal(meta_data->created_by, meta_data->__isset.created_by));
  // num_rows
  output[2].get().SetValue(output_idx, Value::BIGINT(meta_data->num_rows));
  // num_row_groups
  output[3].get().SetValue(output_idx, Value::BIGINT(UnsafeNumericCast<int64_t>(meta_data->row_groups.size())));
  // format_version
  output[4].get().SetValue(output_idx, Value::BIGINT(meta_data->version));
  // encryption_algorithm (only set for encrypted files)
  output[5].get().SetValue(
      output_idx, ParquetElementString(meta_data->encryption_algorithm, meta_data->__isset.encryption_algorithm));
  // footer_signing_key_metadata
  output[6].get().SetValue(output_idx, ParquetElementStringVal(meta_data->footer_signing_key_metadata,
                                                               meta_data->__isset.footer_signing_key_metadata));
  // file_size_bytes (taken from the open file handle, not the footer)
  output[7].get().SetValue(output_idx, Value::UBIGINT(reader.GetHandle().GetFileSize()));
  // footer_size
  output[8].get().SetValue(output_idx, Value::UBIGINT(reader.metadata->footer_size));
  // column_orders: stays SQL NULL when the footer does not carry them
  Value column_orders_value;
  if (meta_data->__isset.column_orders) {
    vector<Value> column_orders;
    column_orders.reserve(meta_data->column_orders.size());
    for (auto &column_order : meta_data->column_orders) {
      column_orders.push_back(Value(ConvertParquetElementToString(column_order)));
    }
    column_orders_value = Value::LIST(LogicalType::VARCHAR, column_orders);
  }
  output[9].get().SetValue(output_idx, column_orders_value);
}
766
767
//===--------------------------------------------------------------------===//
768
// Bloom Probe
769
//===--------------------------------------------------------------------===//
770
771
// Probes each row group's bloom filter (when present) for a constant value and
// reports whether the filter can exclude that value; one row per row group.
class ParquetBloomProbeProcessor : public ParquetMetadataFileProcessor {
public:
  ParquetBloomProbeProcessor(const string &probe_column, const Value &probe_value);

  void InitializeInternal(ClientContext &context, ParquetReader &reader) override;
  idx_t TotalRowCount(ParquetReader &reader) override;
  void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;

private:
  // Name of the column whose bloom filters are probed.
  string probe_column_name;
  // Constant to probe for; cast to the column's type during initialization.
  Value probe_constant;
  // Index of the probed column within the reader's columns; resolved in InitializeInternal.
  optional_idx probe_column_idx;

  // Thrift protocol over the file, used to deserialize bloom filter data on demand.
  unique_ptr<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>> protocol;
  // Allocator for bloom filter buffers (from the client context's buffer allocator).
  optional_ptr<Allocator> allocator;
  // Equality filter wrapping the cast probe constant.
  unique_ptr<ConstantFilter> filter;
};
788
789
template <>
790
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::BLOOM_PROBE>(vector<LogicalType> &return_types,
791
0
                                                                                   vector<string> &names) {
792
0
  names.emplace_back("file_name");
793
0
  return_types.emplace_back(LogicalType::VARCHAR);
794
795
0
  names.emplace_back("row_group_id");
796
0
  return_types.emplace_back(LogicalType::BIGINT);
797
798
0
  names.emplace_back("bloom_filter_excludes");
799
0
  return_types.emplace_back(LogicalType::BOOLEAN);
800
0
}
801
802
// Stores the probe target; column resolution and protocol/filter setup are
// deferred to InitializeInternal (which needs a context and an open reader).
ParquetBloomProbeProcessor::ParquetBloomProbeProcessor(const string &probe_column, const Value &probe_value)
    : probe_column_name(probe_column), probe_constant(probe_value) {
}
805
806
0
// Resolves the probed column by name, then prepares the thrift protocol and the
// equality filter used to probe each row group's bloom filter.
void ParquetBloomProbeProcessor::InitializeInternal(ClientContext &context, ParquetReader &reader) {
  // Locate the probed column; error out if the file does not contain it.
  probe_column_idx = optional_idx::Invalid();
  const idx_t column_count = reader.columns.size();
  for (idx_t i = 0; i < column_count; i++) {
    if (reader.columns[i].name == probe_column_name) {
      probe_column_idx = i;
      break;
    }
  }
  if (!probe_column_idx.IsValid()) {
    throw InvalidInputException("Column %s not found in %s", probe_column_name, reader.file.path);
  }

  // Set up a thrift transport/protocol over the file so bloom filter data can be
  // deserialized lazily, and pre-cast the probe constant to the column's type.
  auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(reader.GetHandle(), false);
  protocol = make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
  allocator = &BufferAllocator::Get(context);
  auto cast_constant = probe_constant.CastAs(context, reader.GetColumns()[probe_column_idx.GetIndex()].type);
  filter = make_uniq<ConstantFilter>(ExpressionType::COMPARE_EQUAL, std::move(cast_constant));
}
827
828
0
// One probe result per row group in the file.
idx_t ParquetBloomProbeProcessor::TotalRowCount(ParquetReader &reader) {
  auto metadata = reader.GetFileMetadata();
  return metadata->row_groups.size();
}
831
832
// Probes the bloom filter of the row group at row_idx for the probe constant
// and emits whether the filter proves the value is absent.
void ParquetBloomProbeProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
                                         ParquetReader &reader) {
  auto meta_data = reader.GetFileMetadata();
  auto &row_group = meta_data->row_groups[row_idx];
  // probe_column_idx was validated in InitializeInternal
  auto &column = row_group.columns[probe_column_idx.GetIndex()];

  // NULL probe values are rejected at bind time
  D_ASSERT(!probe_constant.IsNull());

  // true means the bloom filter guarantees the value does not occur in this row group
  auto bloom_excludes = ParquetStatisticsUtils::BloomFilterExcludes(*filter, column.meta_data, *protocol, *allocator);

  output[0].get().SetValue(output_idx, Value(reader.file.path));
  output[1].get().SetValue(output_idx, Value::BIGINT(NumericCast<int64_t>(row_idx)));
  output[2].get().SetValue(output_idx, Value::BOOLEAN(bloom_excludes));
}
846
847
//===--------------------------------------------------------------------===//
848
// Full Metadata
849
//===--------------------------------------------------------------------===//
850
851
// Combines file/row-group/schema/kv metadata into a single row per file, where
// each output column is a LIST(STRUCT(...)) holding one category's rows.
class FullMetadataProcessor : public ParquetMetadataFileProcessor {
public:
  FullMetadataProcessor() = default;

  void InitializeInternal(ClientContext &context, ParquetReader &reader) override;
  idx_t TotalRowCount(ParquetReader &reader) override;
  void ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx, ParquetReader &reader) override;
  // Force one output chunk per file: list children are rebuilt per row, so rows
  // from different files cannot share a chunk.
  bool ForceFlush() override {
    return true;
  }

private:
  // Runs `processor` over the whole file and materializes its rows as one list
  // entry (at output_idx) inside the given LIST(STRUCT) vector.
  void PopulateMetadata(ParquetMetadataFileProcessor &processor, Vector &output, idx_t output_idx,
                        ParquetReader &reader);

  // One sub-processor per metadata category; initialized together in InitializeInternal.
  ParquetFileMetadataProcessor file_processor;
  ParquetRowGroupMetadataProcessor row_group_processor;
  ParquetSchemaProcessor schema_processor;
  ParquetKeyValueMetadataProcessor kv_processor;
};
871
872
// Materializes all of `processor`'s rows for this file into the list entry at
// output_idx of `output` (a LIST(STRUCT) vector). NOTE(review): the list offset
// is always 0, i.e. each list vector is assumed to hold a single file's data —
// consistent with ForceFlush() returning true.
void FullMetadataProcessor::PopulateMetadata(ParquetMetadataFileProcessor &processor, Vector &output, idx_t output_idx,
                                             ParquetReader &reader) {
  auto count = processor.TotalRowCount(reader);
  auto *result_data = FlatVector::GetData<list_entry_t>(output);
  auto &result_struct = ListVector::GetEntry(output);
  auto &result_struct_entries = StructVector::GetEntries(result_struct);

  // Size the list child to hold exactly `count` struct rows
  ListVector::SetListSize(output, count);
  ListVector::Reserve(output, count);

  result_data[output_idx].offset = 0;
  result_data[output_idx].length = count;

  FlatVector::Validity(output).SetValid(output_idx);

  // Hand the struct's child vectors to the sub-processor in the same
  // vector-of-references shape that the top-level scan uses
  vector<reference<Vector>> vectors;
  for (auto &entry : result_struct_entries) {
    vectors.push_back(std::ref(entry));
    entry.SetVectorType(VectorType::FLAT_VECTOR);
    auto &validity = FlatVector::Validity(entry);
    validity.Initialize(count);
  }
  for (idx_t i = 0; i < count; i++) {
    // output index and source row index coincide since the list starts at offset 0
    processor.ReadRow(vectors, i, i, reader);
  }
}
898
899
template <>
900
void ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FULL_METADATA>(vector<LogicalType> &return_types,
901
0
                                                                                     vector<string> &names) {
902
0
  names.emplace_back("parquet_file_metadata");
903
0
  vector<LogicalType> file_meta_types;
904
0
  vector<string> file_meta_names;
905
0
  ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::FILE_META_DATA>(file_meta_types, file_meta_names);
906
0
  child_list_t<LogicalType> file_meta_children;
907
0
  for (idx_t i = 0; i < file_meta_types.size(); i++) {
908
0
    file_meta_children.push_back(make_pair(file_meta_names[i], file_meta_types[i]));
909
0
  }
910
0
  return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(file_meta_children))));
911
912
0
  names.emplace_back("parquet_metadata");
913
0
  vector<LogicalType> row_group_types;
914
0
  vector<string> row_group_names;
915
0
  ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::META_DATA>(row_group_types, row_group_names);
916
0
  child_list_t<LogicalType> row_group_children;
917
0
  for (idx_t i = 0; i < row_group_types.size(); i++) {
918
0
    row_group_children.push_back(make_pair(row_group_names[i], row_group_types[i]));
919
0
  }
920
0
  return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(row_group_children))));
921
922
0
  names.emplace_back("parquet_schema");
923
0
  vector<LogicalType> schema_types;
924
0
  vector<string> schema_names;
925
0
  ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::SCHEMA>(schema_types, schema_names);
926
0
  child_list_t<LogicalType> schema_children;
927
0
  for (idx_t i = 0; i < schema_types.size(); i++) {
928
0
    schema_children.push_back(make_pair(schema_names[i], schema_types[i]));
929
0
  }
930
0
  return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(schema_children))));
931
932
0
  names.emplace_back("parquet_kv_metadata");
933
0
  vector<LogicalType> kv_types;
934
0
  vector<string> kv_names;
935
0
  ParquetMetaDataOperator::BindSchema<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>(kv_types, kv_names);
936
0
  child_list_t<LogicalType> kv_children;
937
0
  for (idx_t i = 0; i < kv_types.size(); i++) {
938
0
    kv_children.push_back(make_pair(kv_names[i], kv_types[i]));
939
0
  }
940
0
  return_types.emplace_back(LogicalType::LIST(LogicalType::STRUCT(std::move(kv_children))));
941
0
}
942
943
0
// Initializes every wrapped per-category processor against the same reader.
void FullMetadataProcessor::InitializeInternal(ClientContext &context, ParquetReader &reader) {
  ParquetMetadataFileProcessor *processors[] = {&file_processor, &row_group_processor, &schema_processor,
                                                &kv_processor};
  for (auto *processor : processors) {
    processor->Initialize(context, reader);
  }
}
949
950
0
// The combined view collapses a whole file into a single output row.
idx_t FullMetadataProcessor::TotalRowCount(ParquetReader &reader) {
  constexpr idx_t ROWS_PER_FILE = 1;
  return ROWS_PER_FILE;
}
953
954
void FullMetadataProcessor::ReadRow(vector<reference<Vector>> &output, idx_t output_idx, idx_t row_idx,
955
0
                                    ParquetReader &reader) {
956
0
  PopulateMetadata(file_processor, output[0].get(), output_idx, reader);
957
0
  PopulateMetadata(row_group_processor, output[1].get(), output_idx, reader);
958
0
  PopulateMetadata(schema_processor, output[2].get(), output_idx, reader);
959
0
  PopulateMetadata(kv_processor, output[3].get(), output_idx, reader);
960
0
}
961
962
//===--------------------------------------------------------------------===//
963
// Template Function Implementation
964
//===--------------------------------------------------------------------===//
965
966
// Bind for all parquet metadata table functions: resolves the file list from
// input[0] (single path, glob, or array) and binds the result schema for OP_TYPE.
// For BLOOM_PROBE, additionally validates and captures the probe column/value.
template <ParquetMetadataOperatorType OP_TYPE>
unique_ptr<FunctionData> ParquetMetaDataOperator::Bind(ClientContext &context, TableFunctionBindInput &input,
                                                       vector<LogicalType> &return_types, vector<string> &names) {
  // Extract file paths from input using MultiFileReader (handles both single files and arrays)
  auto multi_file_reader = MultiFileReader::CreateDefault("ParquetMetadata");
  auto glob_input = FileGlobInput(FileGlobOptions::FALLBACK_GLOB, "parquet");

  auto result = make_uniq<ParquetMetaDataBindData>();
  // Bind schema based on operation type
  if (OP_TYPE == ParquetMetadataOperatorType::BLOOM_PROBE) {
    // bloom probe takes two extra arguments: column name and probe constant;
    // the derived bind data replaces the plain one
    auto probe_bind_data = make_uniq<ParquetBloomProbeBindData>();
    D_ASSERT(input.inputs.size() == 3);
    if (input.inputs[1].IsNull() || input.inputs[2].IsNull()) {
      throw InvalidInputException("Can't have NULL parameters for parquet_bloom_probe");
    }
    probe_bind_data->probe_column_name = input.inputs[1].CastAs(context, LogicalType::VARCHAR).GetValue<string>();
    probe_bind_data->probe_constant = input.inputs[2];
    result = std::move(probe_bind_data);
  }

  // Expand the input into a scannable file list shared with the global state
  result->file_paths = make_uniq<ParquetMetadataFilePaths>();
  result->file_paths->file_list = multi_file_reader->CreateFileList(context, input.inputs[0], glob_input);
  D_ASSERT(!result->file_paths->file_list->IsEmpty());
  result->file_paths->file_list->InitializeScan(result->file_paths->scan_data);

  BindSchema<OP_TYPE>(return_types, names);

  return std::move(result);
}
Unexecuted instantiation: duckdb::unique_ptr<duckdb::FunctionData, std::__1::default_delete<duckdb::FunctionData>, true> duckdb::ParquetMetaDataOperator::Bind<(duckdb::ParquetMetadataOperatorType)0>(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType> >&, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >&)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::FunctionData, std::__1::default_delete<duckdb::FunctionData>, true> duckdb::ParquetMetaDataOperator::Bind<(duckdb::ParquetMetadataOperatorType)1>(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType> >&, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >&)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::FunctionData, std::__1::default_delete<duckdb::FunctionData>, true> duckdb::ParquetMetaDataOperator::Bind<(duckdb::ParquetMetadataOperatorType)2>(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType> >&, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >&)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::FunctionData, std::__1::default_delete<duckdb::FunctionData>, true> duckdb::ParquetMetaDataOperator::Bind<(duckdb::ParquetMetadataOperatorType)3>(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType> >&, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >&)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::FunctionData, std::__1::default_delete<duckdb::FunctionData>, true> duckdb::ParquetMetaDataOperator::Bind<(duckdb::ParquetMetadataOperatorType)4>(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType> >&, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >&)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::FunctionData, std::__1::default_delete<duckdb::FunctionData>, true> duckdb::ParquetMetaDataOperator::Bind<(duckdb::ParquetMetadataOperatorType)5>(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType> >&, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > >&)
995
996
// Creates the global scan state; ownership of the file list built at bind time
// is transferred into it (so the bind data's file_paths is consumed here).
unique_ptr<GlobalTableFunctionState> ParquetMetaDataOperator::InitGlobal(ClientContext &context,
                                                                         TableFunctionInitInput &input) {
  auto &bind_data = input.bind_data->CastNoConst<ParquetMetaDataBindData>();
  auto file_paths = std::move(bind_data.file_paths);
  return make_uniq<ParquetMetadataGlobalState>(std::move(file_paths), context);
}
1001
1002
// Creates the per-thread local state and the processor implementation matching
// the compile-time operator type.
template <ParquetMetadataOperatorType OP_TYPE>
unique_ptr<LocalTableFunctionState> ParquetMetaDataOperator::InitLocal(ExecutionContext &context,
                                                                       TableFunctionInitInput &input,
                                                                       GlobalTableFunctionState *global_state) {
  auto &bind_data = input.bind_data->Cast<ParquetMetaDataBindData>();
  auto res = make_uniq<ParquetMetadataLocalState>();
  // OP_TYPE is a template parameter, so this switch resolves per instantiation
  switch (OP_TYPE) {
  case ParquetMetadataOperatorType::META_DATA:
    res->processor = make_uniq<ParquetRowGroupMetadataProcessor>();
    break;
  case ParquetMetadataOperatorType::SCHEMA:
    res->processor = make_uniq<ParquetSchemaProcessor>();
    break;
  case ParquetMetadataOperatorType::KEY_VALUE_META_DATA:
    res->processor = make_uniq<ParquetKeyValueMetadataProcessor>();
    break;
  case ParquetMetadataOperatorType::FILE_META_DATA:
    res->processor = make_uniq<ParquetFileMetadataProcessor>();
    break;
  case ParquetMetadataOperatorType::BLOOM_PROBE: {
    // only the bloom probe carries extra bind state (probe column + constant)
    const auto &probe_bind_data = static_cast<const ParquetBloomProbeBindData &>(bind_data);
    res->processor =
        make_uniq<ParquetBloomProbeProcessor>(probe_bind_data.probe_column_name, probe_bind_data.probe_constant);
    break;
  }
  case ParquetMetadataOperatorType::FULL_METADATA: {
    res->processor = make_uniq<FullMetadataProcessor>();
    break;
  }
  default:
    throw InternalException("Unsupported ParquetMetadataOperatorType");
  }
  return unique_ptr_cast<LocalTableFunctionState, ParquetMetadataLocalState>(std::move(res));
}
Unexecuted instantiation: duckdb::unique_ptr<duckdb::LocalTableFunctionState, std::__1::default_delete<duckdb::LocalTableFunctionState>, true> duckdb::ParquetMetaDataOperator::InitLocal<(duckdb::ParquetMetadataOperatorType)0>(duckdb::ExecutionContext&, duckdb::TableFunctionInitInput&, duckdb::GlobalTableFunctionState*)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::LocalTableFunctionState, std::__1::default_delete<duckdb::LocalTableFunctionState>, true> duckdb::ParquetMetaDataOperator::InitLocal<(duckdb::ParquetMetadataOperatorType)1>(duckdb::ExecutionContext&, duckdb::TableFunctionInitInput&, duckdb::GlobalTableFunctionState*)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::LocalTableFunctionState, std::__1::default_delete<duckdb::LocalTableFunctionState>, true> duckdb::ParquetMetaDataOperator::InitLocal<(duckdb::ParquetMetadataOperatorType)2>(duckdb::ExecutionContext&, duckdb::TableFunctionInitInput&, duckdb::GlobalTableFunctionState*)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::LocalTableFunctionState, std::__1::default_delete<duckdb::LocalTableFunctionState>, true> duckdb::ParquetMetaDataOperator::InitLocal<(duckdb::ParquetMetadataOperatorType)3>(duckdb::ExecutionContext&, duckdb::TableFunctionInitInput&, duckdb::GlobalTableFunctionState*)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::LocalTableFunctionState, std::__1::default_delete<duckdb::LocalTableFunctionState>, true> duckdb::ParquetMetaDataOperator::InitLocal<(duckdb::ParquetMetadataOperatorType)4>(duckdb::ExecutionContext&, duckdb::TableFunctionInitInput&, duckdb::GlobalTableFunctionState*)
Unexecuted instantiation: duckdb::unique_ptr<duckdb::LocalTableFunctionState, std::__1::default_delete<duckdb::LocalTableFunctionState>, true> duckdb::ParquetMetaDataOperator::InitLocal<(duckdb::ParquetMetadataOperatorType)5>(duckdb::ExecutionContext&, duckdb::TableFunctionInitInput&, duckdb::GlobalTableFunctionState*)
1036
1037
0
// Main scan function shared by all metadata table functions: repeatedly pulls
// files from the shared file list and emits their rows until the output chunk
// is full, a processor forces a flush, or the file list is exhausted.
void ParquetMetaDataOperator::Function(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
  auto &global_state = data_p.global_state->Cast<ParquetMetadataGlobalState>();
  auto &local_state = data_p.local_state->Cast<ParquetMetadataLocalState>();

  idx_t output_count = 0;

  // processors take the output columns as a vector of references
  vector<reference<Vector>> output_vectors;
  for (idx_t i = 0; i < output.ColumnCount(); i++) {
    output_vectors.push_back(std::ref(output.data[i]));
  }

  while (output_count < STANDARD_VECTOR_SIZE) {
    // Check if we need a new file
    if (local_state.file_exhausted) {
      if (output_count > 0) {
        // we already have rows - emit them first
        break;
      }
      idx_t next_file_idx;
      OpenFileInfo next_file;
      {
        // claim the next file and its index under the global lock
        lock_guard<mutex> guard(global_state.lock);
        if (!global_state.file_paths->NextFile(next_file)) {
          break; // No more files to process
        }
        next_file_idx = global_state.current_file++;
      }

      // opens the reader and computes total_rows for this file
      local_state.Initialize(context, next_file, next_file_idx);
    }

    // emit as many of this file's remaining rows as fit in the chunk
    idx_t left_in_vector = STANDARD_VECTOR_SIZE - output_count;
    idx_t left_in_file = local_state.total_rows - local_state.row_idx;
    idx_t rows_to_output = 0;
    if (left_in_file <= left_in_vector) {
      local_state.file_exhausted = true;
      rows_to_output = left_in_file;
    } else {
      rows_to_output = left_in_vector;
    }

    output.SetCardinality(output_count + rows_to_output);

    for (idx_t i = 0; i < rows_to_output; ++i) {
      local_state.processor->ReadRow(output_vectors, output_count + i, local_state.row_idx + i,
                                     *local_state.reader);
    }
    output_count += rows_to_output;
    local_state.row_idx += rows_to_output;

    // some processors (e.g. FULL_METADATA) require one chunk per file
    if (local_state.processor->ForceFlush()) {
      break;
    }
  }
}
1092
1093
// Partitions output by the index of the file the local state is processing.
OperatorPartitionData ParquetMetaDataOperator::GetPartitionData(ClientContext &context,
                                                                TableFunctionGetPartitionInput &input) {
  auto &local_state = input.local_state->Cast<ParquetMetadataLocalState>();
  auto file_index = local_state.file_idx.GetIndex();
  return OperatorPartitionData(file_index);
}
1098
1099
// Report scan progress as a percentage in [0, 100], scaling the global
// state's fractional progress value.
double ParquetMetaDataOperator::Progress(ClientContext &context, const FunctionData *bind_data_p,
                                         const GlobalTableFunctionState *global_state) {
  auto &gstate = global_state->Cast<ParquetMetadataGlobalState>();
  const double fraction = gstate.GetProgress();
  return fraction * 100.0;
}
1104
1105
ParquetMetaDataFunction::ParquetMetaDataFunction()
1106
8.91k
    : TableFunction("parquet_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function,
1107
8.91k
                    ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::META_DATA>,
1108
8.91k
                    ParquetMetaDataOperator::InitGlobal,
1109
8.91k
                    ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::META_DATA>) {
1110
8.91k
  table_scan_progress = ParquetMetaDataOperator::Progress;
1111
8.91k
  get_partition_data = ParquetMetaDataOperator::GetPartitionData;
1112
8.91k
}
1113
1114
ParquetSchemaFunction::ParquetSchemaFunction()
1115
8.91k
    : TableFunction("parquet_schema", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function,
1116
8.91k
                    ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::SCHEMA>,
1117
8.91k
                    ParquetMetaDataOperator::InitGlobal,
1118
8.91k
                    ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::SCHEMA>) {
1119
8.91k
  table_scan_progress = ParquetMetaDataOperator::Progress;
1120
8.91k
  get_partition_data = ParquetMetaDataOperator::GetPartitionData;
1121
8.91k
}
1122
1123
ParquetKeyValueMetadataFunction::ParquetKeyValueMetadataFunction()
1124
8.91k
    : TableFunction("parquet_kv_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function,
1125
8.91k
                    ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>,
1126
8.91k
                    ParquetMetaDataOperator::InitGlobal,
1127
8.91k
                    ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::KEY_VALUE_META_DATA>) {
1128
8.91k
  table_scan_progress = ParquetMetaDataOperator::Progress;
1129
8.91k
  get_partition_data = ParquetMetaDataOperator::GetPartitionData;
1130
8.91k
}
1131
1132
ParquetFileMetadataFunction::ParquetFileMetadataFunction()
1133
8.91k
    : TableFunction("parquet_file_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function,
1134
8.91k
                    ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::FILE_META_DATA>,
1135
8.91k
                    ParquetMetaDataOperator::InitGlobal,
1136
8.91k
                    ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::FILE_META_DATA>) {
1137
8.91k
  table_scan_progress = ParquetMetaDataOperator::Progress;
1138
8.91k
  get_partition_data = ParquetMetaDataOperator::GetPartitionData;
1139
8.91k
}
1140
1141
ParquetBloomProbeFunction::ParquetBloomProbeFunction()
1142
8.91k
    : TableFunction("parquet_bloom_probe", {LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::ANY},
1143
8.91k
                    ParquetMetaDataOperator::Function,
1144
8.91k
                    ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::BLOOM_PROBE>,
1145
8.91k
                    ParquetMetaDataOperator::InitGlobal,
1146
8.91k
                    ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::BLOOM_PROBE>) {
1147
8.91k
  table_scan_progress = ParquetMetaDataOperator::Progress;
1148
8.91k
  get_partition_data = ParquetMetaDataOperator::GetPartitionData;
1149
8.91k
}
1150
1151
ParquetFullMetadataFunction::ParquetFullMetadataFunction()
1152
8.91k
    : TableFunction("parquet_full_metadata", {LogicalType::VARCHAR}, ParquetMetaDataOperator::Function,
1153
8.91k
                    ParquetMetaDataOperator::Bind<ParquetMetadataOperatorType::FULL_METADATA>,
1154
8.91k
                    ParquetMetaDataOperator::InitGlobal,
1155
8.91k
                    ParquetMetaDataOperator::InitLocal<ParquetMetadataOperatorType::FULL_METADATA>) {
1156
8.91k
  table_scan_progress = ParquetMetaDataOperator::Progress;
1157
8.91k
  get_partition_data = ParquetMetaDataOperator::GetPartitionData;
1158
8.91k
}
1159
} // namespace duckdb