Coverage Report

Created: 2026-06-30 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/extension/parquet/parquet_statistics.cpp
Line
Count
Source
1
#include "parquet_statistics.hpp"
2
3
#include <cmath>
4
#include <memory>
5
#include <utility>
6
#include <vector>
7
8
#include "parquet_decimal_utils.hpp"
9
#include "parquet_timestamp.hpp"
10
#include "parquet_float16.hpp"
11
#include "reader/string_column_reader.hpp"
12
#include "reader/variant_column_reader.hpp"
13
#include "zstd/common/xxhash.hpp"
14
#include "duckdb/common/types/blob.hpp"
15
#include "duckdb/common/types/time.hpp"
16
#include "duckdb/common/types/value.hpp"
17
#include "duckdb/common/helper.hpp"
18
#include "duckdb/storage/statistics/struct_stats.hpp"
19
#include "duckdb/storage/statistics/list_stats.hpp"
20
#include "duckdb/planner/filter/expression_filter.hpp"
21
#include "duckdb/planner/expression/bound_comparison_expression.hpp"
22
#include "duckdb/planner/expression/bound_conjunction_expression.hpp"
23
#include "duckdb/planner/expression/bound_constant_expression.hpp"
24
#include "reader/uuid_column_reader.hpp"
25
#include "duckdb/common/type_visitor.hpp"
26
#include "column_reader.hpp"
27
#include "duckdb/common/allocator.hpp"
28
#include "duckdb/common/constants.hpp"
29
#include "duckdb/common/enums/expression_type.hpp"
30
#include "duckdb/common/exception.hpp"
31
#include "duckdb/common/helper.hpp"
32
#include "duckdb/common/hugeint.hpp"
33
#include "duckdb/common/numeric_utils.hpp"
34
#include "duckdb/common/optional_ptr.hpp"
35
#include "duckdb/common/string.hpp"
36
#include "duckdb/common/types.hpp"
37
#include "duckdb/common/types/date.hpp"
38
#include "duckdb/common/types/datetime.hpp"
39
#include "duckdb/common/types/geometry.hpp"
40
#include "duckdb/common/types/string_type.hpp"
41
#include "duckdb/common/types/timestamp.hpp"
42
#include "duckdb/planner/table_filter.hpp"
43
#include "duckdb/storage/statistics/geometry_stats.hpp"
44
#include "duckdb/storage/statistics/numeric_stats.hpp"
45
#include "duckdb/storage/statistics/string_stats.hpp"
46
#include "duckdb/storage/statistics/variant_stats.hpp"
47
#include "parquet_column_schema.hpp"
48
#include "parquet_types.h"
49
#include "thrift/protocol/TProtocol.h"
50
#include "thrift_tools.hpp"
51
52
namespace duckdb {
53
54
using duckdb_parquet::ConvertedType;
55
using duckdb_parquet::Type;
56
57
unique_ptr<BaseStatistics> ParquetStatisticsUtils::CreateNumericStats(const LogicalType &type,
58
                                                                      const ParquetColumnSchema &schema_ele,
59
0
                                                                      const duckdb_parquet::Statistics &parquet_stats) {
60
0
  auto stats = NumericStats::CreateUnknown(type);
61
62
  // for reasons unknown to science, Parquet defines *both* `min` and `min_value` as well as `max` and
63
  // `max_value`. All are optional. such elegance.
64
0
  Value min;
65
0
  Value max;
66
0
  if (parquet_stats.__isset.min_value) {
67
0
    min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value);
68
0
  } else if (parquet_stats.__isset.min) {
69
0
    min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min);
70
0
  } else {
71
0
    min = Value(type);
72
0
  }
73
0
  if (parquet_stats.__isset.max_value) {
74
0
    max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value);
75
0
  } else if (parquet_stats.__isset.max) {
76
0
    max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max);
77
0
  } else {
78
0
    max = Value(type);
79
0
  }
80
0
  NumericStats::SetMin(stats, min);
81
0
  NumericStats::SetMax(stats, max);
82
0
  return stats.ToUnique();
83
0
}
84
85
static unique_ptr<BaseStatistics> CreateFloatingPointStats(const LogicalType &type,
86
                                                           const ParquetColumnSchema &schema_ele,
87
0
                                                           const duckdb_parquet::Statistics &parquet_stats) {
88
0
  auto stats = NumericStats::CreateUnknown(type);
89
90
  // floating point values can always have NaN values - hence we cannot use the max value from the file
91
0
  Value min;
92
0
  Value max;
93
0
  if (parquet_stats.__isset.min_value) {
94
0
    min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value);
95
0
  } else if (parquet_stats.__isset.min) {
96
0
    min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min);
97
0
  } else {
98
0
    min = Value(type);
99
0
  }
100
0
  max = Value("nan").DefaultCastAs(type);
101
0
  NumericStats::SetMin(stats, min);
102
0
  NumericStats::SetMax(stats, max);
103
0
  return stats.ToUnique();
104
0
}
105
106
Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele,
107
0
                                           const std::string &stats) {
108
0
  Value result;
109
0
  string error;
110
0
  auto stats_val = ConvertValueInternal(type, schema_ele, stats);
111
0
  if (!stats_val.DefaultTryCastAs(type, result, &error)) {
112
0
    return Value(type);
113
0
  }
114
0
  return result;
115
0
}
116
Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, const ParquetColumnSchema &schema_ele,
117
0
                                                   const std::string &stats) {
118
0
  auto stats_data = const_data_ptr_cast(stats.c_str());
119
0
  switch (type.id()) {
120
0
  case LogicalTypeId::BOOLEAN: {
121
0
    if (stats.size() != sizeof(bool)) {
122
0
      throw InvalidInputException("Incorrect stats size for type BOOLEAN");
123
0
    }
124
0
    return Value::BOOLEAN(Load<bool>(stats_data));
125
0
  }
126
0
  case LogicalTypeId::UTINYINT:
127
0
  case LogicalTypeId::USMALLINT:
128
0
  case LogicalTypeId::UINTEGER:
129
0
    if (stats.size() != sizeof(uint32_t)) {
130
0
      throw InvalidInputException("Incorrect stats size for type UINTEGER");
131
0
    }
132
0
    return Value::UINTEGER(Load<uint32_t>(stats_data));
133
0
  case LogicalTypeId::UBIGINT:
134
0
    if (stats.size() != sizeof(uint64_t)) {
135
0
      throw InvalidInputException("Incorrect stats size for type UBIGINT");
136
0
    }
137
0
    return Value::UBIGINT(Load<uint64_t>(stats_data));
138
0
  case LogicalTypeId::TINYINT:
139
0
  case LogicalTypeId::SMALLINT:
140
0
  case LogicalTypeId::INTEGER:
141
0
    if (stats.size() != sizeof(int32_t)) {
142
0
      throw InvalidInputException("Incorrect stats size for type INTEGER");
143
0
    }
144
0
    return Value::INTEGER(Load<int32_t>(stats_data));
145
0
  case LogicalTypeId::BIGINT:
146
0
    if (stats.size() != sizeof(int64_t)) {
147
0
      throw InvalidInputException("Incorrect stats size for type BIGINT");
148
0
    }
149
0
    return Value::BIGINT(Load<int64_t>(stats_data));
150
0
  case LogicalTypeId::FLOAT: {
151
0
    float val;
152
0
    if (schema_ele.type_info == ParquetExtraTypeInfo::FLOAT16) {
153
0
      if (stats.size() != sizeof(uint16_t)) {
154
0
        throw InvalidInputException("Incorrect stats size for type FLOAT16");
155
0
      }
156
0
      val = Float16ToFloat32(Load<uint16_t>(stats_data));
157
0
    } else {
158
0
      if (stats.size() != sizeof(float)) {
159
0
        throw InvalidInputException("Incorrect stats size for type FLOAT");
160
0
      }
161
0
      val = Load<float>(stats_data);
162
0
    }
163
0
    if (!Value::FloatIsFinite(val)) {
164
0
      return Value();
165
0
    }
166
0
    return Value::FLOAT(val);
167
0
  }
168
0
  case LogicalTypeId::DOUBLE: {
169
0
    if (schema_ele.type_info == ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY) {
170
      // decimals cast to double
171
0
      return Value::DOUBLE(ParquetDecimalUtils::ReadDecimalValue<double>(stats_data, stats.size(), schema_ele));
172
0
    }
173
0
    if (stats.size() != sizeof(double)) {
174
0
      throw InvalidInputException("Incorrect stats size for type DOUBLE");
175
0
    }
176
0
    auto val = Load<double>(stats_data);
177
0
    if (!Value::DoubleIsFinite(val)) {
178
0
      return Value();
179
0
    }
180
0
    return Value::DOUBLE(val);
181
0
  }
182
0
  case LogicalTypeId::DECIMAL: {
183
0
    auto width = DecimalType::GetWidth(type);
184
0
    auto scale = DecimalType::GetScale(type);
185
0
    switch (schema_ele.type_info) {
186
0
    case ParquetExtraTypeInfo::DECIMAL_INT32:
187
0
      if (stats.size() != sizeof(int32_t)) {
188
0
        throw InvalidInputException("Incorrect stats size for type %s", type.ToString());
189
0
      }
190
0
      return Value::DECIMAL(Load<int32_t>(stats_data), width, scale);
191
0
    case ParquetExtraTypeInfo::DECIMAL_INT64:
192
0
      if (stats.size() != sizeof(int64_t)) {
193
0
        throw InvalidInputException("Incorrect stats size for type %s", type.ToString());
194
0
      }
195
0
      return Value::DECIMAL(Load<int64_t>(stats_data), width, scale);
196
0
    case ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY:
197
0
      switch (type.InternalType()) {
198
0
      case PhysicalType::INT16:
199
0
        return Value::DECIMAL(
200
0
            ParquetDecimalUtils::ReadDecimalValue<int16_t>(stats_data, stats.size(), schema_ele), width, scale);
201
0
      case PhysicalType::INT32:
202
0
        return Value::DECIMAL(
203
0
            ParquetDecimalUtils::ReadDecimalValue<int32_t>(stats_data, stats.size(), schema_ele), width, scale);
204
0
      case PhysicalType::INT64:
205
0
        return Value::DECIMAL(
206
0
            ParquetDecimalUtils::ReadDecimalValue<int64_t>(stats_data, stats.size(), schema_ele), width, scale);
207
0
      case PhysicalType::INT128:
208
0
        return Value::DECIMAL(
209
0
            ParquetDecimalUtils::ReadDecimalValue<hugeint_t>(stats_data, stats.size(), schema_ele), width,
210
0
            scale);
211
0
      default:
212
0
        throw InvalidInputException("Unsupported internal type for decimal");
213
0
      }
214
0
    default:
215
0
      throw NotImplementedException("Unrecognized Parquet type for Decimal");
216
0
    }
217
0
  }
218
0
  case LogicalTypeId::VARCHAR:
219
0
  case LogicalTypeId::BLOB:
220
0
    if (type.id() == LogicalTypeId::BLOB || !Value::StringIsValid(stats)) {
221
0
      return Value(Blob::ToString(string_t(stats)));
222
0
    }
223
0
    return Value(stats);
224
0
  case LogicalTypeId::DATE:
225
0
    if (stats.size() != sizeof(int32_t)) {
226
0
      throw InvalidInputException("Incorrect stats size for type DATE");
227
0
    }
228
0
    return Value::DATE(date_t(Load<int32_t>(stats_data)));
229
0
  case LogicalTypeId::TIME: {
230
0
    int64_t val;
231
0
    if (stats.size() == sizeof(int32_t)) {
232
0
      val = Load<int32_t>(stats_data);
233
0
    } else if (stats.size() == sizeof(int64_t)) {
234
0
      val = Load<int64_t>(stats_data);
235
0
    } else {
236
0
      throw InvalidInputException("Incorrect stats size for type TIME");
237
0
    }
238
0
    switch (schema_ele.type_info) {
239
0
    case ParquetExtraTypeInfo::UNIT_MS:
240
0
      return Value::TIME(Time::FromTimeMs(val));
241
0
    case ParquetExtraTypeInfo::UNIT_NS:
242
0
      return Value::TIME(Time::FromTimeNs(val));
243
0
    case ParquetExtraTypeInfo::UNIT_MICROS:
244
0
    default:
245
0
      return Value::TIME(dtime_t(val));
246
0
    }
247
0
  }
248
0
  case LogicalTypeId::TIME_NS: {
249
0
    int64_t val;
250
0
    if (stats.size() == sizeof(int32_t)) {
251
0
      val = Load<int32_t>(stats_data);
252
0
    } else if (stats.size() == sizeof(int64_t)) {
253
0
      val = Load<int64_t>(stats_data);
254
0
    } else {
255
0
      throw InvalidInputException("Incorrect stats size for type TIME_NS");
256
0
    }
257
0
    switch (schema_ele.type_info) {
258
0
    case ParquetExtraTypeInfo::UNIT_MS:
259
0
      return Value::TIME_NS(ParquetMsIntToTimeNs(NumericCast<int32_t>(val)));
260
0
    case ParquetExtraTypeInfo::UNIT_NS:
261
0
      return Value::TIME_NS(ParquetIntToTimeNs(val));
262
0
    case ParquetExtraTypeInfo::UNIT_MICROS:
263
0
    default:
264
0
      return Value::TIME_NS(dtime_ns_t(val));
265
0
    }
266
0
  }
267
0
  case LogicalTypeId::TIME_TZ: {
268
0
    int64_t val;
269
0
    if (stats.size() == sizeof(int32_t)) {
270
0
      val = Load<int32_t>(stats_data);
271
0
    } else if (stats.size() == sizeof(int64_t)) {
272
0
      val = Load<int64_t>(stats_data);
273
0
    } else {
274
0
      throw InvalidInputException("Incorrect stats size for type TIMETZ");
275
0
    }
276
0
    switch (schema_ele.type_info) {
277
0
    case ParquetExtraTypeInfo::UNIT_MS:
278
0
      return Value::TIMETZ(ParquetIntToTimeMsTZ(NumericCast<int32_t>(val)));
279
0
    case ParquetExtraTypeInfo::UNIT_NS:
280
0
      return Value::TIMETZ(ParquetIntToTimeNsTZ(val));
281
0
    case ParquetExtraTypeInfo::UNIT_MICROS:
282
0
    default:
283
0
      return Value::TIMETZ(ParquetIntToTimeTZ(val));
284
0
    }
285
0
  }
286
0
  case LogicalTypeId::TIMESTAMP:
287
0
  case LogicalTypeId::TIMESTAMP_TZ: {
288
0
    timestamp_t timestamp_value;
289
0
    if (schema_ele.type_info == ParquetExtraTypeInfo::IMPALA_TIMESTAMP) {
290
0
      if (stats.size() != sizeof(Int96)) {
291
0
        throw InvalidInputException("Incorrect stats size for type TIMESTAMP");
292
0
      }
293
0
      timestamp_value = ImpalaTimestampToTimestamp(Load<Int96>(stats_data));
294
0
    } else {
295
0
      if (stats.size() != sizeof(int64_t)) {
296
0
        throw InvalidInputException("Incorrect stats size for type TIMESTAMP");
297
0
      }
298
0
      auto val = Load<int64_t>(stats_data);
299
0
      switch (schema_ele.type_info) {
300
0
      case ParquetExtraTypeInfo::UNIT_MS:
301
0
        timestamp_value = Timestamp::FromEpochMs(val);
302
0
        break;
303
0
      case ParquetExtraTypeInfo::UNIT_NS:
304
0
        timestamp_value = Timestamp::FromEpochNanoSeconds(val);
305
0
        break;
306
0
      case ParquetExtraTypeInfo::UNIT_MICROS:
307
0
      default:
308
0
        timestamp_value = timestamp_t(val);
309
0
        break;
310
0
      }
311
0
    }
312
0
    if (type.id() == LogicalTypeId::TIMESTAMP_TZ) {
313
0
      return Value::TIMESTAMPTZ(timestamp_tz_t(timestamp_value));
314
0
    }
315
0
    return Value::TIMESTAMP(timestamp_value);
316
0
  }
317
0
  case LogicalTypeId::TIMESTAMP_TZ_NS:
318
0
  case LogicalTypeId::TIMESTAMP_NS: {
319
0
    timestamp_ns_t timestamp_value;
320
0
    if (schema_ele.type_info == ParquetExtraTypeInfo::IMPALA_TIMESTAMP) {
321
0
      if (stats.size() != sizeof(Int96)) {
322
0
        throw InvalidInputException("Incorrect stats size for type TIMESTAMP_NS");
323
0
      }
324
0
      timestamp_value = ImpalaTimestampToTimestampNS(Load<Int96>(stats_data));
325
0
    } else {
326
0
      if (stats.size() != sizeof(int64_t)) {
327
0
        throw InvalidInputException("Incorrect stats size for type TIMESTAMP_NS");
328
0
      }
329
0
      auto val = Load<int64_t>(stats_data);
330
0
      switch (schema_ele.type_info) {
331
0
      case ParquetExtraTypeInfo::UNIT_MS:
332
0
        timestamp_value = ParquetTimestampMsToTimestampNs(val);
333
0
        break;
334
0
      case ParquetExtraTypeInfo::UNIT_NS:
335
0
        timestamp_value = ParquetTimestampNsToTimestampNs(val);
336
0
        break;
337
0
      case ParquetExtraTypeInfo::UNIT_MICROS:
338
0
      default:
339
0
        timestamp_value = ParquetTimestampUsToTimestampNs(val);
340
0
        break;
341
0
      }
342
0
    }
343
0
    if (type.id() == LogicalTypeId::TIMESTAMP_TZ_NS) {
344
0
      return Value::TIMESTAMPTZNS(timestamp_tz_ns_t(timestamp_value));
345
0
    }
346
0
    return Value::TIMESTAMPNS(timestamp_value);
347
0
  }
348
0
  case LogicalTypeId::UUID: {
349
0
    if (stats.size() != 16) {
350
0
      throw InvalidInputException("Incorrect stats size for type UUID");
351
0
    }
352
0
    auto uuid_val = UUIDValueConversion::ReadParquetUUID(const_data_ptr_cast(stats.c_str()));
353
0
    return Value::UUID(uuid_val);
354
0
  }
355
0
  default:
356
0
    throw InternalException("Unsupported type for stats %s", type.ToString());
357
0
  }
358
0
}
359
360
0
bool IsVariantNull(const string &str) {
361
0
  return str.size() == 1 && str[0] == '\0';
362
0
}
363
364
// The conversion is best-effort and non-fatal: when the statistics of a particular (sub)node cannot be
365
// converted, that node is left as UNKNOWN stats (which makes it "not fully shredded", so consumers fall
366
// back to a full scan for that field) instead of discarding the statistics of the entire variant.
367
0
static void ConvertUnshreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p) {
368
0
  D_ASSERT(result.GetType().id() == LogicalTypeId::UINTEGER);
369
370
0
  if (!input_p) {
371
    //! No overlay statistics -> conservatively unknown (this node is not "fully shredded")
372
0
    result.Copy(BaseStatistics::CreateUnknown(LogicalType::UINTEGER));
373
0
    return;
374
0
  }
375
0
  auto &input = *input_p;
376
0
  D_ASSERT(input.GetType().id() == LogicalTypeId::BLOB);
377
0
  result.CopyValidity(input);
378
379
0
  if (!result.CanHaveNoNull()) {
380
    //! The overlay is entirely NULL -> no overlay values -> fully shredded
381
0
    return;
382
0
  }
383
0
  if (!StringStats::HasMinMax(input)) {
384
    //! The overlay may contain values but we can't tell what they are (e.g. the writer dropped min/max for
385
    //! a large blob value) -> conservatively unknown so this node is treated as not fully shredded
386
0
    result.Copy(BaseStatistics::CreateUnknown(LogicalType::UINTEGER));
387
0
    return;
388
0
  }
389
390
0
  auto min = StringStats::Min(input);
391
0
  auto max = StringStats::Max(input);
392
0
  if (IsVariantNull(min) && IsVariantNull(max)) {
393
    //! All non-shredded values are NULL or VARIANT_NULL, set the stats to indicate this
394
0
    NumericStats::SetMin<uint32_t>(result, 0);
395
0
    NumericStats::SetMax<uint32_t>(result, 0);
396
0
    result.SetHasNoNull();
397
0
  }
398
  //! else: there are real overlay values -> leave min/max unset, so this node is not fully shredded
399
0
}
400
401
static void ConvertShreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p);
402
403
0
static void ConvertShreddedStatsItem(BaseStatistics &result, BaseStatistics &input) {
404
0
  D_ASSERT(result.GetType().id() == LogicalTypeId::STRUCT);
405
0
  D_ASSERT(input.GetType().id() == LogicalTypeId::STRUCT);
406
407
  // result variant stats
408
0
  auto &untyped_value_index_stats = StructStats::GetChildStats(result, VariantStats::UNTYPED_VALUE_INDEX);
409
0
  auto &typed_value_result = StructStats::GetChildStats(result, VariantStats::TYPED_VALUE_INDEX);
410
411
  // input parquet stats
412
0
  auto &value_stats = StructStats::GetChildStats(input, 0);
413
0
  auto &typed_value_input = StructStats::GetChildStats(input, 1);
414
415
0
  ConvertUnshreddedStats(untyped_value_index_stats, value_stats);
416
0
  ConvertShreddedStats(typed_value_result, typed_value_input);
417
0
}
418
419
0
static void ConvertShreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p) {
420
0
  if (!input_p) {
421
    //! No statistics for this shredded subtree -> leave it unknown (conservative)
422
0
    result.Copy(BaseStatistics::CreateUnknown(result.GetType()));
423
0
    return;
424
0
  }
425
0
  auto &input = *input_p;
426
0
  result.CopyValidity(input);
427
428
0
  auto type_id = result.GetType().id();
429
0
  if (type_id == LogicalTypeId::LIST) {
430
0
    ConvertShreddedStatsItem(ListStats::GetChildStats(result), ListStats::GetChildStats(input));
431
0
    return;
432
0
  }
433
0
  if (type_id == LogicalTypeId::STRUCT) {
434
0
    auto field_count = StructType::GetChildCount(result.GetType());
435
0
    for (idx_t i = 0; i < field_count; i++) {
436
0
      ConvertShreddedStatsItem(StructStats::GetChildStats(result, i), StructStats::GetChildStats(input, i));
437
0
    }
438
0
    return;
439
0
  }
440
  //! Primitive leaf - copy the parquet stats if the types line up, otherwise leave it unknown
441
0
  if (result.GetType() == input.GetType()) {
442
0
    result.Copy(input);
443
0
  } else {
444
0
    result.Copy(BaseStatistics::CreateUnknown(result.GetType()));
445
0
  }
446
0
}
447
448
0
bool StringStatsAreValid(const string &stats, bool is_varchar, StringStatsType stats_type) {
449
0
  if (stats_type == StringStatsType::TRUNCATED_STATS) {
450
    // truncated stats can contain invalid UTF8 due to truncation - this is fine
451
0
    return true;
452
0
  }
453
  // for exact stats we need the stats to be valid because we might emit them
454
  // we could optionally convert these into truncated stats...
455
  // but if a file has corrupt exact string stats it's likely these are bogus, so just ignore them
456
0
  return StringColumnReader::IsValid(stats, is_varchar);
457
0
}
458
459
unique_ptr<BaseStatistics>
460
ParquetStatisticsUtils::TransformParquetStatistics(const LogicalType &type, const ParquetColumnSchema &schema,
461
                                                   const duckdb_parquet::Statistics &parquet_stats, bool can_have_nan,
462
0
                                                   optional_ptr<const ColumnChunk> column_chunk) {
463
0
  switch (type.id()) {
464
0
  case LogicalTypeId::BOOLEAN:
465
0
  case LogicalTypeId::UTINYINT:
466
0
  case LogicalTypeId::USMALLINT:
467
0
  case LogicalTypeId::UINTEGER:
468
0
  case LogicalTypeId::UBIGINT:
469
0
  case LogicalTypeId::TINYINT:
470
0
  case LogicalTypeId::SMALLINT:
471
0
  case LogicalTypeId::INTEGER:
472
0
  case LogicalTypeId::BIGINT:
473
0
  case LogicalTypeId::DATE:
474
0
  case LogicalTypeId::TIME:
475
0
  case LogicalTypeId::TIME_TZ:
476
0
  case LogicalTypeId::TIMESTAMP:
477
0
  case LogicalTypeId::TIMESTAMP_TZ:
478
0
  case LogicalTypeId::TIMESTAMP_TZ_NS:
479
0
  case LogicalTypeId::TIMESTAMP_SEC:
480
0
  case LogicalTypeId::TIMESTAMP_MS:
481
0
  case LogicalTypeId::TIMESTAMP_NS:
482
0
  case LogicalTypeId::DECIMAL:
483
0
  case LogicalTypeId::UUID:
484
0
    return CreateNumericStats(type, schema, parquet_stats);
485
0
  case LogicalTypeId::FLOAT:
486
0
  case LogicalTypeId::DOUBLE:
487
0
    if (can_have_nan) {
488
      // Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it
489
      // does, we create stats without an upper max value, as NaN compares larger than anything else.
490
0
      return CreateFloatingPointStats(type, schema, parquet_stats);
491
0
    } else {
492
      // Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains
493
      // NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is
494
      // probably the best we can do for now.
495
0
      return CreateNumericStats(type, schema, parquet_stats);
496
0
    }
497
0
    break;
498
0
  case LogicalTypeId::BLOB:
499
0
  case LogicalTypeId::VARCHAR: {
500
0
    auto string_stats = StringStats::CreateUnknown(type);
501
0
    const bool is_varchar = type.id() == LogicalTypeId::VARCHAR;
502
0
    auto min_stats_type = parquet_stats.__isset.is_min_value_exact && parquet_stats.is_min_value_exact
503
0
                              ? StringStatsType::EXACT_STATS
504
0
                              : StringStatsType::TRUNCATED_STATS;
505
0
    auto max_stats_type = parquet_stats.__isset.is_max_value_exact && parquet_stats.is_max_value_exact
506
0
                              ? StringStatsType::EXACT_STATS
507
0
                              : StringStatsType::TRUNCATED_STATS;
508
0
    if (parquet_stats.__isset.min_value &&
509
0
        StringStatsAreValid(parquet_stats.min_value, is_varchar, min_stats_type)) {
510
0
      StringStats::SetMin(string_stats, parquet_stats.min_value, min_stats_type);
511
0
    } else if (parquet_stats.__isset.min && StringStatsAreValid(parquet_stats.min, is_varchar, min_stats_type)) {
512
0
      StringStats::SetMin(string_stats, parquet_stats.min, min_stats_type);
513
0
    }
514
0
    if (parquet_stats.__isset.max_value &&
515
0
        StringStatsAreValid(parquet_stats.max_value, is_varchar, max_stats_type)) {
516
0
      StringStats::SetMax(string_stats, parquet_stats.max_value, max_stats_type);
517
0
    } else if (parquet_stats.__isset.max && StringStatsAreValid(parquet_stats.max, is_varchar, max_stats_type)) {
518
0
      StringStats::SetMax(string_stats, parquet_stats.max, max_stats_type);
519
0
    }
520
0
    return string_stats.ToUnique();
521
0
  }
522
0
  case LogicalTypeId::GEOMETRY: {
523
0
    if (!column_chunk) {
524
0
      break;
525
0
    }
526
0
    auto geo_stats = GeometryStats::CreateUnknown(type);
527
0
    if (column_chunk->meta_data.__isset.geospatial_statistics) {
528
0
      if (column_chunk->meta_data.geospatial_statistics.__isset.bbox) {
529
0
        auto &bbox = column_chunk->meta_data.geospatial_statistics.bbox;
530
0
        auto &stats_bbox = GeometryStats::GetExtent(geo_stats);
531
532
        // xmin > xmax is allowed if the geometry crosses the antimeridian,
533
        // but we don't handle this right now
534
0
        if (bbox.xmin <= bbox.xmax) {
535
0
          stats_bbox.x_min = bbox.xmin;
536
0
          stats_bbox.x_max = bbox.xmax;
537
0
        }
538
539
0
        if (bbox.ymin <= bbox.ymax) {
540
0
          stats_bbox.y_min = bbox.ymin;
541
0
          stats_bbox.y_max = bbox.ymax;
542
0
        }
543
544
0
        if (bbox.__isset.zmin && bbox.__isset.zmax && bbox.zmin <= bbox.zmax) {
545
0
          stats_bbox.z_min = bbox.zmin;
546
0
          stats_bbox.z_max = bbox.zmax;
547
0
        }
548
549
0
        if (bbox.__isset.mmin && bbox.__isset.mmax && bbox.mmin <= bbox.mmax) {
550
0
          stats_bbox.m_min = bbox.mmin;
551
0
          stats_bbox.m_max = bbox.mmax;
552
0
        }
553
0
      }
554
0
      if (column_chunk->meta_data.geospatial_statistics.__isset.geospatial_types) {
555
0
        auto &types = column_chunk->meta_data.geospatial_statistics.geospatial_types;
556
0
        auto &stats_types = GeometryStats::GetTypes(geo_stats);
557
558
        // if types are set but empty, that still means "any type" - so we leave stats_types as-is (unknown)
559
        // otherwise, clear and set to the actual types
560
561
0
        if (!types.empty()) {
562
0
          stats_types.Clear();
563
0
          for (auto &geom_type : types) {
564
0
            stats_types.AddWKBType(geom_type);
565
0
          }
566
0
        }
567
0
      }
568
0
    }
569
0
    return geo_stats.ToUnique();
570
0
  }
571
0
  default:
572
0
    break;
573
0
  } // end of type switch
574
575
  // no specific stats, only create unknown stats to hold validity information
576
0
  auto unknown_stats = BaseStatistics::CreateUnknown(type);
577
0
  return unknown_stats.ToUnique();
578
0
}
579
580
unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
581
                                                                             const vector<ColumnChunk> &columns,
582
0
                                                                             bool can_have_nan) {
583
  // Not supported types
584
0
  auto &type = schema.type;
585
0
  if (type.id() == LogicalTypeId::ARRAY || type.id() == LogicalTypeId::MAP) {
586
0
    return nullptr;
587
0
  }
588
589
0
  unique_ptr<BaseStatistics> row_group_stats;
590
591
0
  if (type.id() == LogicalTypeId::LIST) {
592
0
    auto list_stats = ListStats::CreateUnknown(type);
593
0
    auto &child_schema = schema.children[0];
594
0
    auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
595
0
    ListStats::SetChildStats(list_stats, std::move(child_stats));
596
0
    row_group_stats = list_stats.ToUnique();
597
0
    return row_group_stats;
598
0
  }
599
  // Structs are handled differently (they dont have stats)
600
0
  if (type.id() == LogicalTypeId::STRUCT) {
601
0
    auto struct_stats = StructStats::CreateUnknown(type);
602
    // Recurse into child readers
603
0
    for (idx_t i = 0; i < schema.children.size(); i++) {
604
0
      auto &child_schema = schema.children[i];
605
0
      auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
606
0
      StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
607
0
    }
608
0
    row_group_stats = struct_stats.ToUnique();
609
0
    return row_group_stats;
610
0
  } else if (schema.schema_type == ParquetColumnSchemaType::VARIANT) {
611
0
    auto children_count = schema.children.size();
612
0
    if (children_count != 3) {
613
0
      return nullptr;
614
0
    }
615
    //! Create the VARIANT stats
616
0
    auto &typed_value = schema.children[2];
617
0
    LogicalType logical_type;
618
0
    if (!VariantColumnReader::TypedValueLayoutToType(typed_value.type, logical_type)) {
619
      //! We couldn't convert the parquet typed_value to a structured type (likely because a nested 'typed_value'
620
      //! field is missing)
621
0
      return nullptr;
622
0
    }
623
0
    auto shredding_type = TypeVisitor::VisitReplace(logical_type, [](const LogicalType &type) {
624
0
      return LogicalType::STRUCT({{"typed_value", type}, {"untyped_value_index", LogicalType::UINTEGER}});
625
0
    });
626
0
    auto variant_stats = VariantStats::CreateShredded(shredding_type);
627
628
    //! Take the root stats
629
0
    auto &shredded_stats = VariantStats::GetShreddedStats(variant_stats);
630
0
    auto &untyped_value_index_stats = StructStats::GetChildStats(shredded_stats, VariantStats::UNTYPED_VALUE_INDEX);
631
0
    auto &typed_value_stats = StructStats::GetChildStats(shredded_stats, VariantStats::TYPED_VALUE_INDEX);
632
633
    //! Convert the root 'value' -> 'untyped_value_index'
634
0
    auto &value = schema.children[1];
635
0
    D_ASSERT(value.name == "value");
636
0
    auto value_stats = ParquetStatisticsUtils::TransformColumnStatistics(value, columns, can_have_nan);
637
    //! Best-effort: nodes whose stats can't be converted are left UNKNOWN (not fully shredded) rather
638
    //! than discarding the statistics for the entire variant column
639
0
    ConvertUnshreddedStats(untyped_value_index_stats, value_stats.get());
640
641
0
    auto parquet_typed_value_stats =
642
0
        ParquetStatisticsUtils::TransformColumnStatistics(typed_value, columns, can_have_nan);
643
0
    ConvertShreddedStats(typed_value_stats, parquet_typed_value_stats.get());
644
    //! Set validity to UNKNOWN
645
0
    variant_stats.SetHasNoNull();
646
0
    variant_stats.SetHasNull();
647
0
    return variant_stats.ToUnique();
648
0
  }
649
650
  // Otherwise, its a standard column with stats
651
0
  auto &column_chunk = columns[schema.column_index];
652
0
  if (!column_chunk.__isset.meta_data || !column_chunk.meta_data.__isset.statistics) {
653
    // no stats present for row group
654
0
    return nullptr;
655
0
  }
656
0
  auto &parquet_stats = column_chunk.meta_data.statistics;
657
0
  row_group_stats = TransformParquetStatistics(type, schema, parquet_stats, can_have_nan, &column_chunk);
658
659
  // null count is generic
660
0
  if (row_group_stats) {
661
0
    row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES);
662
0
    if (parquet_stats.__isset.null_count && parquet_stats.null_count == 0) {
663
0
      row_group_stats->Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
664
0
    }
665
0
    if (parquet_stats.__isset.null_count && parquet_stats.null_count == column_chunk.meta_data.num_values) {
666
0
      row_group_stats->Set(StatsInfo::CANNOT_HAVE_VALID_VALUES);
667
0
    }
668
0
  }
669
0
  return row_group_stats;
670
0
}
671
672
0
static bool HasFilterConstants(const Expression &expr) {
673
0
  if (BoundComparisonExpression::IsComparison(expr)) {
674
0
    auto &comp = expr.Cast<BoundFunctionExpression>();
675
0
    if (comp.GetExpressionType() != ExpressionType::COMPARE_EQUAL) {
676
0
      return false;
677
0
    }
678
0
    auto &right = BoundComparisonExpression::Right(comp);
679
0
    if (right.GetExpressionType() != ExpressionType::VALUE_CONSTANT) {
680
0
      return false;
681
0
    }
682
0
    auto &constant = right.Cast<BoundConstantExpression>();
683
0
    return !constant.GetValue().IsNull();
684
0
  }
685
0
  if (expr.GetExpressionClass() != ExpressionClass::BOUND_CONJUNCTION) {
686
0
    return false;
687
0
  }
688
0
  bool child_has_constant = false;
689
0
  ExpressionIterator::EnumerateChildren(expr, [&](const Expression &child) {
690
0
    if (!child_has_constant) {
691
0
      child_has_constant = HasFilterConstants(child);
692
0
    }
693
0
  });
694
0
  return child_has_constant;
695
0
}
696
697
0
static bool HasFilterConstants(const TableFilter &duckdb_filter) {
698
0
  auto &expr_filter = ExpressionFilter::GetExpressionFilter(duckdb_filter, "ParquetStatistics::HasFilterConstants");
699
0
  return HasFilterConstants(*expr_filter.expr);
700
0
}
701
702
template <class T>
703
0
static uint64_t ValueXH64FixedWidth(const Value &constant) {
704
0
  T val = constant.GetValue<T>();
705
0
  return duckdb_zstd::XXH64(&val, sizeof(val), 0);
706
0
}
Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<int>(duckdb::Value const&)
Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<unsigned int>(duckdb::Value const&)
Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<unsigned long>(duckdb::Value const&)
Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<long>(duckdb::Value const&)
Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<float>(duckdb::Value const&)
Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<double>(duckdb::Value const&)
707
708
// TODO we can only this if the parquet representation of the type exactly matches the duckdb rep!
709
// TODO TEST THIS!
710
// TODO perhaps we can re-use some writer infra here
711
0
static uint64_t ValueXXH64(const Value &constant) {
712
0
  switch (constant.type().InternalType()) {
713
0
  case PhysicalType::UINT8:
714
0
    return ValueXH64FixedWidth<int32_t>(constant);
715
0
  case PhysicalType::INT8:
716
0
    return ValueXH64FixedWidth<int32_t>(constant);
717
0
  case PhysicalType::UINT16:
718
0
    return ValueXH64FixedWidth<int32_t>(constant);
719
0
  case PhysicalType::INT16:
720
0
    return ValueXH64FixedWidth<int32_t>(constant);
721
0
  case PhysicalType::UINT32:
722
0
    return ValueXH64FixedWidth<uint32_t>(constant);
723
0
  case PhysicalType::INT32:
724
0
    return ValueXH64FixedWidth<int32_t>(constant);
725
0
  case PhysicalType::UINT64:
726
0
    return ValueXH64FixedWidth<uint64_t>(constant);
727
0
  case PhysicalType::INT64:
728
0
    return ValueXH64FixedWidth<int64_t>(constant);
729
0
  case PhysicalType::FLOAT:
730
0
    return ValueXH64FixedWidth<float>(constant);
731
0
  case PhysicalType::DOUBLE:
732
0
    return ValueXH64FixedWidth<double>(constant);
733
0
  case PhysicalType::VARCHAR: {
734
0
    auto val = constant.GetValue<string>();
735
0
    return duckdb_zstd::XXH64(val.c_str(), val.length(), 0);
736
0
  }
737
0
  default:
738
0
    return 0;
739
0
  }
740
0
}
741
742
0
static bool ApplyBloomFilter(const Expression &expr, ParquetBloomFilter &bloom_filter) {
743
0
  if (BoundComparisonExpression::IsComparison(expr)) {
744
0
    auto &comp = expr.Cast<BoundFunctionExpression>();
745
0
    if (comp.GetExpressionType() != ExpressionType::COMPARE_EQUAL) {
746
0
      return false;
747
0
    }
748
0
    auto &right = BoundComparisonExpression::Right(comp);
749
0
    if (right.GetExpressionType() != ExpressionType::VALUE_CONSTANT) {
750
0
      return false;
751
0
    }
752
0
    auto &constant = right.Cast<BoundConstantExpression>();
753
0
    D_ASSERT(!constant.GetValue().IsNull());
754
0
    auto hash = ValueXXH64(constant.GetValue());
755
0
    return hash > 0 && !bloom_filter.FilterCheck(hash);
756
0
  }
757
0
  if (expr.GetExpressionClass() != ExpressionClass::BOUND_CONJUNCTION) {
758
0
    return false;
759
0
  }
760
0
  switch (expr.GetExpressionType()) {
761
0
  case ExpressionType::CONJUNCTION_AND: {
762
0
    bool any_children_true = false;
763
0
    ExpressionIterator::EnumerateChildren(
764
0
        expr, [&](const Expression &child) { any_children_true |= ApplyBloomFilter(child, bloom_filter); });
765
0
    return any_children_true;
766
0
  }
767
0
  case ExpressionType::CONJUNCTION_OR: {
768
0
    bool all_children_true = true;
769
0
    ExpressionIterator::EnumerateChildren(
770
0
        expr, [&](const Expression &child) { all_children_true &= ApplyBloomFilter(child, bloom_filter); });
771
0
    return all_children_true;
772
0
  }
773
0
  default:
774
0
    return false;
775
0
  }
776
0
}
777
778
0
static bool ApplyBloomFilter(const TableFilter &duckdb_filter, ParquetBloomFilter &bloom_filter) {
779
0
  auto &expr_filter = ExpressionFilter::GetExpressionFilter(duckdb_filter, "ParquetStatistics::ApplyBloomFilter");
780
0
  return ApplyBloomFilter(*expr_filter.expr, bloom_filter);
781
0
}
782
783
0
bool ParquetStatisticsUtils::BloomFilterSupported(const LogicalTypeId &type_id) {
784
0
  switch (type_id) {
785
0
  case LogicalTypeId::TINYINT:
786
0
  case LogicalTypeId::UTINYINT:
787
0
  case LogicalTypeId::SMALLINT:
788
0
  case LogicalTypeId::USMALLINT:
789
0
  case LogicalTypeId::INTEGER:
790
0
  case LogicalTypeId::UINTEGER:
791
0
  case LogicalTypeId::BIGINT:
792
0
  case LogicalTypeId::UBIGINT:
793
0
  case LogicalTypeId::FLOAT:
794
0
  case LogicalTypeId::DOUBLE:
795
0
  case LogicalTypeId::VARCHAR:
796
0
  case LogicalTypeId::BLOB:
797
0
    return true;
798
0
  default:
799
0
    return false;
800
0
  }
801
0
}
802
803
bool ParquetStatisticsUtils::BloomFilterExcludes(const TableFilter &duckdb_filter,
804
                                                 const duckdb_parquet::ColumnMetaData &column_meta_data,
805
0
                                                 TProtocol &file_proto, Allocator &allocator) {
806
0
  if (!HasFilterConstants(duckdb_filter) || !column_meta_data.__isset.bloom_filter_offset ||
807
0
      column_meta_data.bloom_filter_offset <= 0) {
808
0
    return false;
809
0
  }
810
  // TODO check length against file length!
811
812
0
  auto &transport = reinterpret_cast<ThriftFileTransport &>(*file_proto.getTransport());
813
0
  transport.SetLocation(column_meta_data.bloom_filter_offset);
814
0
  if (column_meta_data.__isset.bloom_filter_length && column_meta_data.bloom_filter_length > 0) {
815
0
    transport.Prefetch(column_meta_data.bloom_filter_offset, column_meta_data.bloom_filter_length);
816
0
  }
817
818
0
  duckdb_parquet::BloomFilterHeader filter_header;
819
  // TODO the bloom filter could be encrypted, too, so need to double check that this is NOT the case
820
0
  filter_header.read(&file_proto);
821
0
  if (!filter_header.algorithm.__isset.BLOCK || !filter_header.compression.__isset.UNCOMPRESSED ||
822
0
      !filter_header.hash.__isset.XXHASH) {
823
0
    return false;
824
0
  }
825
826
0
  auto new_buffer = make_uniq<ResizeableBuffer>(allocator, filter_header.numBytes);
827
0
  transport.read(new_buffer->ptr, filter_header.numBytes);
828
0
  ParquetBloomFilter bloom_filter(std::move(new_buffer));
829
0
  return ApplyBloomFilter(duckdb_filter, bloom_filter);
830
0
}
831
832
0
ParquetBloomFilter::ParquetBloomFilter(idx_t num_entries, double bloom_filter_false_positive_ratio) {
833
  // aim for hit ratio of 0.01%
834
  // see http://tfk.mit.edu/pdf/bloom.pdf
835
0
  double f = bloom_filter_false_positive_ratio;
836
0
  double k = 8.0;
837
0
  double n = LossyNumericCast<double>(num_entries);
838
0
  double m = -k * n / std::log(1 - std::pow(f, 1 / k));
839
0
  auto b = MaxValue<idx_t>(NextPowerOfTwo(LossyNumericCast<idx_t>(m / k)) / 32, 1);
840
841
0
  D_ASSERT(b > 0 && IsPowerOfTwo(b));
842
843
0
  data = make_uniq<ResizeableBuffer>(Allocator::DefaultAllocator(), sizeof(ParquetBloomBlock) * b);
844
0
  data->zero();
845
0
  block_count = data->len / sizeof(ParquetBloomBlock);
846
0
  D_ASSERT(data->len % sizeof(ParquetBloomBlock) == 0);
847
0
}
848
849
0
ParquetBloomFilter::ParquetBloomFilter(unique_ptr<ResizeableBuffer> data_p) {
850
0
  D_ASSERT(data_p->len % sizeof(ParquetBloomBlock) == 0);
851
0
  data = std::move(data_p);
852
0
  block_count = data->len / sizeof(ParquetBloomBlock);
853
0
  D_ASSERT(data->len % sizeof(ParquetBloomBlock) == 0);
854
0
}
855
856
0
void ParquetBloomFilter::FilterInsert(uint64_t x) {
857
0
  auto blocks = reinterpret_cast<ParquetBloomBlock *>(data->ptr);
858
0
  uint64_t i = ((x >> 32) * block_count) >> 32;
859
0
  auto &b = blocks[i];
860
0
  ParquetBloomBlock::BlockInsert(b, x);
861
0
}
862
863
0
bool ParquetBloomFilter::FilterCheck(uint64_t x) {
864
0
  auto blocks = reinterpret_cast<ParquetBloomBlock *>(data->ptr);
865
0
  auto i = ((x >> 32) * block_count) >> 32;
866
0
  return ParquetBloomBlock::BlockCheck(blocks[i], x);
867
0
}
868
869
// compiler optimizes this into a single instruction (popcnt)
870
0
static uint8_t PopCnt64(uint64_t n) {
871
0
  uint8_t c = 0;
872
0
  for (; n; ++c) {
873
0
    n &= n - 1;
874
0
  }
875
0
  return c;
876
0
}
877
878
0
double ParquetBloomFilter::OneRatio() {
879
0
  auto bloom_ptr = reinterpret_cast<uint64_t *>(data->ptr);
880
0
  idx_t one_count = 0;
881
0
  for (idx_t b_idx = 0; b_idx < data->len / sizeof(uint64_t); ++b_idx) {
882
0
    one_count += PopCnt64(bloom_ptr[b_idx]);
883
0
  }
884
0
  return LossyNumericCast<double>(one_count) / (LossyNumericCast<double>(data->len) * 8.0);
885
0
}
886
887
0
ResizeableBuffer *ParquetBloomFilter::Get() {
888
0
  return data.get();
889
0
}
890
891
} // namespace duckdb