/src/duckdb/extension/parquet/parquet_statistics.cpp
Line | Count | Source |
1 | | #include "parquet_statistics.hpp" |
2 | | |
3 | | #include <cmath> |
4 | | #include <memory> |
5 | | #include <utility> |
6 | | #include <vector> |
7 | | |
8 | | #include "parquet_decimal_utils.hpp" |
9 | | #include "parquet_timestamp.hpp" |
10 | | #include "parquet_float16.hpp" |
11 | | #include "reader/string_column_reader.hpp" |
12 | | #include "reader/variant_column_reader.hpp" |
13 | | #include "zstd/common/xxhash.hpp" |
14 | | #include "duckdb/common/types/blob.hpp" |
15 | | #include "duckdb/common/types/time.hpp" |
16 | | #include "duckdb/common/types/value.hpp" |
17 | | #include "duckdb/common/helper.hpp" |
18 | | #include "duckdb/storage/statistics/struct_stats.hpp" |
19 | | #include "duckdb/storage/statistics/list_stats.hpp" |
20 | | #include "duckdb/planner/filter/expression_filter.hpp" |
21 | | #include "duckdb/planner/expression/bound_comparison_expression.hpp" |
22 | | #include "duckdb/planner/expression/bound_conjunction_expression.hpp" |
23 | | #include "duckdb/planner/expression/bound_constant_expression.hpp" |
24 | | #include "reader/uuid_column_reader.hpp" |
25 | | #include "duckdb/common/type_visitor.hpp" |
26 | | #include "column_reader.hpp" |
27 | | #include "duckdb/common/allocator.hpp" |
28 | | #include "duckdb/common/constants.hpp" |
29 | | #include "duckdb/common/enums/expression_type.hpp" |
30 | | #include "duckdb/common/exception.hpp" |
31 | | #include "duckdb/common/helper.hpp" |
32 | | #include "duckdb/common/hugeint.hpp" |
33 | | #include "duckdb/common/numeric_utils.hpp" |
34 | | #include "duckdb/common/optional_ptr.hpp" |
35 | | #include "duckdb/common/string.hpp" |
36 | | #include "duckdb/common/types.hpp" |
37 | | #include "duckdb/common/types/date.hpp" |
38 | | #include "duckdb/common/types/datetime.hpp" |
39 | | #include "duckdb/common/types/geometry.hpp" |
40 | | #include "duckdb/common/types/string_type.hpp" |
41 | | #include "duckdb/common/types/timestamp.hpp" |
42 | | #include "duckdb/planner/table_filter.hpp" |
43 | | #include "duckdb/storage/statistics/geometry_stats.hpp" |
44 | | #include "duckdb/storage/statistics/numeric_stats.hpp" |
45 | | #include "duckdb/storage/statistics/string_stats.hpp" |
46 | | #include "duckdb/storage/statistics/variant_stats.hpp" |
47 | | #include "parquet_column_schema.hpp" |
48 | | #include "parquet_types.h" |
49 | | #include "thrift/protocol/TProtocol.h" |
50 | | #include "thrift_tools.hpp" |
51 | | |
52 | | namespace duckdb { |
53 | | |
54 | | using duckdb_parquet::ConvertedType; |
55 | | using duckdb_parquet::Type; |
56 | | |
57 | | unique_ptr<BaseStatistics> ParquetStatisticsUtils::CreateNumericStats(const LogicalType &type, |
58 | | const ParquetColumnSchema &schema_ele, |
59 | 0 | const duckdb_parquet::Statistics &parquet_stats) { |
60 | 0 | auto stats = NumericStats::CreateUnknown(type); |
61 | | |
62 | | // for reasons unknown to science, Parquet defines *both* `min` and `min_value` as well as `max` and |
63 | | // `max_value`. All are optional. such elegance. |
64 | 0 | Value min; |
65 | 0 | Value max; |
66 | 0 | if (parquet_stats.__isset.min_value) { |
67 | 0 | min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value); |
68 | 0 | } else if (parquet_stats.__isset.min) { |
69 | 0 | min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min); |
70 | 0 | } else { |
71 | 0 | min = Value(type); |
72 | 0 | } |
73 | 0 | if (parquet_stats.__isset.max_value) { |
74 | 0 | max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max_value); |
75 | 0 | } else if (parquet_stats.__isset.max) { |
76 | 0 | max = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.max); |
77 | 0 | } else { |
78 | 0 | max = Value(type); |
79 | 0 | } |
80 | 0 | NumericStats::SetMin(stats, min); |
81 | 0 | NumericStats::SetMax(stats, max); |
82 | 0 | return stats.ToUnique(); |
83 | 0 | } |
84 | | |
85 | | static unique_ptr<BaseStatistics> CreateFloatingPointStats(const LogicalType &type, |
86 | | const ParquetColumnSchema &schema_ele, |
87 | 0 | const duckdb_parquet::Statistics &parquet_stats) { |
88 | 0 | auto stats = NumericStats::CreateUnknown(type); |
89 | | |
90 | | // floating point values can always have NaN values - hence we cannot use the max value from the file |
91 | 0 | Value min; |
92 | 0 | Value max; |
93 | 0 | if (parquet_stats.__isset.min_value) { |
94 | 0 | min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value); |
95 | 0 | } else if (parquet_stats.__isset.min) { |
96 | 0 | min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min); |
97 | 0 | } else { |
98 | 0 | min = Value(type); |
99 | 0 | } |
100 | 0 | max = Value("nan").DefaultCastAs(type); |
101 | 0 | NumericStats::SetMin(stats, min); |
102 | 0 | NumericStats::SetMax(stats, max); |
103 | 0 | return stats.ToUnique(); |
104 | 0 | } |
105 | | |
106 | | Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, |
107 | 0 | const std::string &stats) { |
108 | 0 | Value result; |
109 | 0 | string error; |
110 | 0 | auto stats_val = ConvertValueInternal(type, schema_ele, stats); |
111 | 0 | if (!stats_val.DefaultTryCastAs(type, result, &error)) { |
112 | 0 | return Value(type); |
113 | 0 | } |
114 | 0 | return result; |
115 | 0 | } |
116 | | Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, const ParquetColumnSchema &schema_ele, |
117 | 0 | const std::string &stats) { |
118 | 0 | auto stats_data = const_data_ptr_cast(stats.c_str()); |
119 | 0 | switch (type.id()) { |
120 | 0 | case LogicalTypeId::BOOLEAN: { |
121 | 0 | if (stats.size() != sizeof(bool)) { |
122 | 0 | throw InvalidInputException("Incorrect stats size for type BOOLEAN"); |
123 | 0 | } |
124 | 0 | return Value::BOOLEAN(Load<bool>(stats_data)); |
125 | 0 | } |
126 | 0 | case LogicalTypeId::UTINYINT: |
127 | 0 | case LogicalTypeId::USMALLINT: |
128 | 0 | case LogicalTypeId::UINTEGER: |
129 | 0 | if (stats.size() != sizeof(uint32_t)) { |
130 | 0 | throw InvalidInputException("Incorrect stats size for type UINTEGER"); |
131 | 0 | } |
132 | 0 | return Value::UINTEGER(Load<uint32_t>(stats_data)); |
133 | 0 | case LogicalTypeId::UBIGINT: |
134 | 0 | if (stats.size() != sizeof(uint64_t)) { |
135 | 0 | throw InvalidInputException("Incorrect stats size for type UBIGINT"); |
136 | 0 | } |
137 | 0 | return Value::UBIGINT(Load<uint64_t>(stats_data)); |
138 | 0 | case LogicalTypeId::TINYINT: |
139 | 0 | case LogicalTypeId::SMALLINT: |
140 | 0 | case LogicalTypeId::INTEGER: |
141 | 0 | if (stats.size() != sizeof(int32_t)) { |
142 | 0 | throw InvalidInputException("Incorrect stats size for type INTEGER"); |
143 | 0 | } |
144 | 0 | return Value::INTEGER(Load<int32_t>(stats_data)); |
145 | 0 | case LogicalTypeId::BIGINT: |
146 | 0 | if (stats.size() != sizeof(int64_t)) { |
147 | 0 | throw InvalidInputException("Incorrect stats size for type BIGINT"); |
148 | 0 | } |
149 | 0 | return Value::BIGINT(Load<int64_t>(stats_data)); |
150 | 0 | case LogicalTypeId::FLOAT: { |
151 | 0 | float val; |
152 | 0 | if (schema_ele.type_info == ParquetExtraTypeInfo::FLOAT16) { |
153 | 0 | if (stats.size() != sizeof(uint16_t)) { |
154 | 0 | throw InvalidInputException("Incorrect stats size for type FLOAT16"); |
155 | 0 | } |
156 | 0 | val = Float16ToFloat32(Load<uint16_t>(stats_data)); |
157 | 0 | } else { |
158 | 0 | if (stats.size() != sizeof(float)) { |
159 | 0 | throw InvalidInputException("Incorrect stats size for type FLOAT"); |
160 | 0 | } |
161 | 0 | val = Load<float>(stats_data); |
162 | 0 | } |
163 | 0 | if (!Value::FloatIsFinite(val)) { |
164 | 0 | return Value(); |
165 | 0 | } |
166 | 0 | return Value::FLOAT(val); |
167 | 0 | } |
168 | 0 | case LogicalTypeId::DOUBLE: { |
169 | 0 | if (schema_ele.type_info == ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY) { |
170 | | // decimals cast to double |
171 | 0 | return Value::DOUBLE(ParquetDecimalUtils::ReadDecimalValue<double>(stats_data, stats.size(), schema_ele)); |
172 | 0 | } |
173 | 0 | if (stats.size() != sizeof(double)) { |
174 | 0 | throw InvalidInputException("Incorrect stats size for type DOUBLE"); |
175 | 0 | } |
176 | 0 | auto val = Load<double>(stats_data); |
177 | 0 | if (!Value::DoubleIsFinite(val)) { |
178 | 0 | return Value(); |
179 | 0 | } |
180 | 0 | return Value::DOUBLE(val); |
181 | 0 | } |
182 | 0 | case LogicalTypeId::DECIMAL: { |
183 | 0 | auto width = DecimalType::GetWidth(type); |
184 | 0 | auto scale = DecimalType::GetScale(type); |
185 | 0 | switch (schema_ele.type_info) { |
186 | 0 | case ParquetExtraTypeInfo::DECIMAL_INT32: |
187 | 0 | if (stats.size() != sizeof(int32_t)) { |
188 | 0 | throw InvalidInputException("Incorrect stats size for type %s", type.ToString()); |
189 | 0 | } |
190 | 0 | return Value::DECIMAL(Load<int32_t>(stats_data), width, scale); |
191 | 0 | case ParquetExtraTypeInfo::DECIMAL_INT64: |
192 | 0 | if (stats.size() != sizeof(int64_t)) { |
193 | 0 | throw InvalidInputException("Incorrect stats size for type %s", type.ToString()); |
194 | 0 | } |
195 | 0 | return Value::DECIMAL(Load<int64_t>(stats_data), width, scale); |
196 | 0 | case ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY: |
197 | 0 | switch (type.InternalType()) { |
198 | 0 | case PhysicalType::INT16: |
199 | 0 | return Value::DECIMAL( |
200 | 0 | ParquetDecimalUtils::ReadDecimalValue<int16_t>(stats_data, stats.size(), schema_ele), width, scale); |
201 | 0 | case PhysicalType::INT32: |
202 | 0 | return Value::DECIMAL( |
203 | 0 | ParquetDecimalUtils::ReadDecimalValue<int32_t>(stats_data, stats.size(), schema_ele), width, scale); |
204 | 0 | case PhysicalType::INT64: |
205 | 0 | return Value::DECIMAL( |
206 | 0 | ParquetDecimalUtils::ReadDecimalValue<int64_t>(stats_data, stats.size(), schema_ele), width, scale); |
207 | 0 | case PhysicalType::INT128: |
208 | 0 | return Value::DECIMAL( |
209 | 0 | ParquetDecimalUtils::ReadDecimalValue<hugeint_t>(stats_data, stats.size(), schema_ele), width, |
210 | 0 | scale); |
211 | 0 | default: |
212 | 0 | throw InvalidInputException("Unsupported internal type for decimal"); |
213 | 0 | } |
214 | 0 | default: |
215 | 0 | throw NotImplementedException("Unrecognized Parquet type for Decimal"); |
216 | 0 | } |
217 | 0 | } |
218 | 0 | case LogicalTypeId::VARCHAR: |
219 | 0 | case LogicalTypeId::BLOB: |
220 | 0 | if (type.id() == LogicalTypeId::BLOB || !Value::StringIsValid(stats)) { |
221 | 0 | return Value(Blob::ToString(string_t(stats))); |
222 | 0 | } |
223 | 0 | return Value(stats); |
224 | 0 | case LogicalTypeId::DATE: |
225 | 0 | if (stats.size() != sizeof(int32_t)) { |
226 | 0 | throw InvalidInputException("Incorrect stats size for type DATE"); |
227 | 0 | } |
228 | 0 | return Value::DATE(date_t(Load<int32_t>(stats_data))); |
229 | 0 | case LogicalTypeId::TIME: { |
230 | 0 | int64_t val; |
231 | 0 | if (stats.size() == sizeof(int32_t)) { |
232 | 0 | val = Load<int32_t>(stats_data); |
233 | 0 | } else if (stats.size() == sizeof(int64_t)) { |
234 | 0 | val = Load<int64_t>(stats_data); |
235 | 0 | } else { |
236 | 0 | throw InvalidInputException("Incorrect stats size for type TIME"); |
237 | 0 | } |
238 | 0 | switch (schema_ele.type_info) { |
239 | 0 | case ParquetExtraTypeInfo::UNIT_MS: |
240 | 0 | return Value::TIME(Time::FromTimeMs(val)); |
241 | 0 | case ParquetExtraTypeInfo::UNIT_NS: |
242 | 0 | return Value::TIME(Time::FromTimeNs(val)); |
243 | 0 | case ParquetExtraTypeInfo::UNIT_MICROS: |
244 | 0 | default: |
245 | 0 | return Value::TIME(dtime_t(val)); |
246 | 0 | } |
247 | 0 | } |
248 | 0 | case LogicalTypeId::TIME_NS: { |
249 | 0 | int64_t val; |
250 | 0 | if (stats.size() == sizeof(int32_t)) { |
251 | 0 | val = Load<int32_t>(stats_data); |
252 | 0 | } else if (stats.size() == sizeof(int64_t)) { |
253 | 0 | val = Load<int64_t>(stats_data); |
254 | 0 | } else { |
255 | 0 | throw InvalidInputException("Incorrect stats size for type TIME_NS"); |
256 | 0 | } |
257 | 0 | switch (schema_ele.type_info) { |
258 | 0 | case ParquetExtraTypeInfo::UNIT_MS: |
259 | 0 | return Value::TIME_NS(ParquetMsIntToTimeNs(NumericCast<int32_t>(val))); |
260 | 0 | case ParquetExtraTypeInfo::UNIT_NS: |
261 | 0 | return Value::TIME_NS(ParquetIntToTimeNs(val)); |
262 | 0 | case ParquetExtraTypeInfo::UNIT_MICROS: |
263 | 0 | default: |
264 | 0 | return Value::TIME_NS(dtime_ns_t(val)); |
265 | 0 | } |
266 | 0 | } |
267 | 0 | case LogicalTypeId::TIME_TZ: { |
268 | 0 | int64_t val; |
269 | 0 | if (stats.size() == sizeof(int32_t)) { |
270 | 0 | val = Load<int32_t>(stats_data); |
271 | 0 | } else if (stats.size() == sizeof(int64_t)) { |
272 | 0 | val = Load<int64_t>(stats_data); |
273 | 0 | } else { |
274 | 0 | throw InvalidInputException("Incorrect stats size for type TIMETZ"); |
275 | 0 | } |
276 | 0 | switch (schema_ele.type_info) { |
277 | 0 | case ParquetExtraTypeInfo::UNIT_MS: |
278 | 0 | return Value::TIMETZ(ParquetIntToTimeMsTZ(NumericCast<int32_t>(val))); |
279 | 0 | case ParquetExtraTypeInfo::UNIT_NS: |
280 | 0 | return Value::TIMETZ(ParquetIntToTimeNsTZ(val)); |
281 | 0 | case ParquetExtraTypeInfo::UNIT_MICROS: |
282 | 0 | default: |
283 | 0 | return Value::TIMETZ(ParquetIntToTimeTZ(val)); |
284 | 0 | } |
285 | 0 | } |
286 | 0 | case LogicalTypeId::TIMESTAMP: |
287 | 0 | case LogicalTypeId::TIMESTAMP_TZ: { |
288 | 0 | timestamp_t timestamp_value; |
289 | 0 | if (schema_ele.type_info == ParquetExtraTypeInfo::IMPALA_TIMESTAMP) { |
290 | 0 | if (stats.size() != sizeof(Int96)) { |
291 | 0 | throw InvalidInputException("Incorrect stats size for type TIMESTAMP"); |
292 | 0 | } |
293 | 0 | timestamp_value = ImpalaTimestampToTimestamp(Load<Int96>(stats_data)); |
294 | 0 | } else { |
295 | 0 | if (stats.size() != sizeof(int64_t)) { |
296 | 0 | throw InvalidInputException("Incorrect stats size for type TIMESTAMP"); |
297 | 0 | } |
298 | 0 | auto val = Load<int64_t>(stats_data); |
299 | 0 | switch (schema_ele.type_info) { |
300 | 0 | case ParquetExtraTypeInfo::UNIT_MS: |
301 | 0 | timestamp_value = Timestamp::FromEpochMs(val); |
302 | 0 | break; |
303 | 0 | case ParquetExtraTypeInfo::UNIT_NS: |
304 | 0 | timestamp_value = Timestamp::FromEpochNanoSeconds(val); |
305 | 0 | break; |
306 | 0 | case ParquetExtraTypeInfo::UNIT_MICROS: |
307 | 0 | default: |
308 | 0 | timestamp_value = timestamp_t(val); |
309 | 0 | break; |
310 | 0 | } |
311 | 0 | } |
312 | 0 | if (type.id() == LogicalTypeId::TIMESTAMP_TZ) { |
313 | 0 | return Value::TIMESTAMPTZ(timestamp_tz_t(timestamp_value)); |
314 | 0 | } |
315 | 0 | return Value::TIMESTAMP(timestamp_value); |
316 | 0 | } |
317 | 0 | case LogicalTypeId::TIMESTAMP_TZ_NS: |
318 | 0 | case LogicalTypeId::TIMESTAMP_NS: { |
319 | 0 | timestamp_ns_t timestamp_value; |
320 | 0 | if (schema_ele.type_info == ParquetExtraTypeInfo::IMPALA_TIMESTAMP) { |
321 | 0 | if (stats.size() != sizeof(Int96)) { |
322 | 0 | throw InvalidInputException("Incorrect stats size for type TIMESTAMP_NS"); |
323 | 0 | } |
324 | 0 | timestamp_value = ImpalaTimestampToTimestampNS(Load<Int96>(stats_data)); |
325 | 0 | } else { |
326 | 0 | if (stats.size() != sizeof(int64_t)) { |
327 | 0 | throw InvalidInputException("Incorrect stats size for type TIMESTAMP_NS"); |
328 | 0 | } |
329 | 0 | auto val = Load<int64_t>(stats_data); |
330 | 0 | switch (schema_ele.type_info) { |
331 | 0 | case ParquetExtraTypeInfo::UNIT_MS: |
332 | 0 | timestamp_value = ParquetTimestampMsToTimestampNs(val); |
333 | 0 | break; |
334 | 0 | case ParquetExtraTypeInfo::UNIT_NS: |
335 | 0 | timestamp_value = ParquetTimestampNsToTimestampNs(val); |
336 | 0 | break; |
337 | 0 | case ParquetExtraTypeInfo::UNIT_MICROS: |
338 | 0 | default: |
339 | 0 | timestamp_value = ParquetTimestampUsToTimestampNs(val); |
340 | 0 | break; |
341 | 0 | } |
342 | 0 | } |
343 | 0 | if (type.id() == LogicalTypeId::TIMESTAMP_TZ_NS) { |
344 | 0 | return Value::TIMESTAMPTZNS(timestamp_tz_ns_t(timestamp_value)); |
345 | 0 | } |
346 | 0 | return Value::TIMESTAMPNS(timestamp_value); |
347 | 0 | } |
348 | 0 | case LogicalTypeId::UUID: { |
349 | 0 | if (stats.size() != 16) { |
350 | 0 | throw InvalidInputException("Incorrect stats size for type UUID"); |
351 | 0 | } |
352 | 0 | auto uuid_val = UUIDValueConversion::ReadParquetUUID(const_data_ptr_cast(stats.c_str())); |
353 | 0 | return Value::UUID(uuid_val); |
354 | 0 | } |
355 | 0 | default: |
356 | 0 | throw InternalException("Unsupported type for stats %s", type.ToString()); |
357 | 0 | } |
358 | 0 | } |
359 | | |
360 | 0 | bool IsVariantNull(const string &str) { |
361 | 0 | return str.size() == 1 && str[0] == '\0'; |
362 | 0 | } |
363 | | |
364 | | // The conversion is best-effort and non-fatal: when the statistics of a particular (sub)node cannot be |
365 | | // converted, that node is left as UNKNOWN stats (which makes it "not fully shredded", so consumers fall |
366 | | // back to a full scan for that field) instead of discarding the statistics of the entire variant. |
367 | 0 | static void ConvertUnshreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p) { |
368 | 0 | D_ASSERT(result.GetType().id() == LogicalTypeId::UINTEGER); |
369 | |
|
370 | 0 | if (!input_p) { |
371 | | //! No overlay statistics -> conservatively unknown (this node is not "fully shredded") |
372 | 0 | result.Copy(BaseStatistics::CreateUnknown(LogicalType::UINTEGER)); |
373 | 0 | return; |
374 | 0 | } |
375 | 0 | auto &input = *input_p; |
376 | 0 | D_ASSERT(input.GetType().id() == LogicalTypeId::BLOB); |
377 | 0 | result.CopyValidity(input); |
378 | |
|
379 | 0 | if (!result.CanHaveNoNull()) { |
380 | | //! The overlay is entirely NULL -> no overlay values -> fully shredded |
381 | 0 | return; |
382 | 0 | } |
383 | 0 | if (!StringStats::HasMinMax(input)) { |
384 | | //! The overlay may contain values but we can't tell what they are (e.g. the writer dropped min/max for |
385 | | //! a large blob value) -> conservatively unknown so this node is treated as not fully shredded |
386 | 0 | result.Copy(BaseStatistics::CreateUnknown(LogicalType::UINTEGER)); |
387 | 0 | return; |
388 | 0 | } |
389 | | |
390 | 0 | auto min = StringStats::Min(input); |
391 | 0 | auto max = StringStats::Max(input); |
392 | 0 | if (IsVariantNull(min) && IsVariantNull(max)) { |
393 | | //! All non-shredded values are NULL or VARIANT_NULL, set the stats to indicate this |
394 | 0 | NumericStats::SetMin<uint32_t>(result, 0); |
395 | 0 | NumericStats::SetMax<uint32_t>(result, 0); |
396 | 0 | result.SetHasNoNull(); |
397 | 0 | } |
398 | | //! else: there are real overlay values -> leave min/max unset, so this node is not fully shredded |
399 | 0 | } |
400 | | |
401 | | static void ConvertShreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p); |
402 | | |
403 | 0 | static void ConvertShreddedStatsItem(BaseStatistics &result, BaseStatistics &input) { |
404 | 0 | D_ASSERT(result.GetType().id() == LogicalTypeId::STRUCT); |
405 | 0 | D_ASSERT(input.GetType().id() == LogicalTypeId::STRUCT); |
406 | | |
407 | | // result variant stats |
408 | 0 | auto &untyped_value_index_stats = StructStats::GetChildStats(result, VariantStats::UNTYPED_VALUE_INDEX); |
409 | 0 | auto &typed_value_result = StructStats::GetChildStats(result, VariantStats::TYPED_VALUE_INDEX); |
410 | | |
411 | | // input parquet stats |
412 | 0 | auto &value_stats = StructStats::GetChildStats(input, 0); |
413 | 0 | auto &typed_value_input = StructStats::GetChildStats(input, 1); |
414 | |
|
415 | 0 | ConvertUnshreddedStats(untyped_value_index_stats, value_stats); |
416 | 0 | ConvertShreddedStats(typed_value_result, typed_value_input); |
417 | 0 | } |
418 | | |
419 | 0 | static void ConvertShreddedStats(BaseStatistics &result, optional_ptr<BaseStatistics> input_p) { |
420 | 0 | if (!input_p) { |
421 | | //! No statistics for this shredded subtree -> leave it unknown (conservative) |
422 | 0 | result.Copy(BaseStatistics::CreateUnknown(result.GetType())); |
423 | 0 | return; |
424 | 0 | } |
425 | 0 | auto &input = *input_p; |
426 | 0 | result.CopyValidity(input); |
427 | |
|
428 | 0 | auto type_id = result.GetType().id(); |
429 | 0 | if (type_id == LogicalTypeId::LIST) { |
430 | 0 | ConvertShreddedStatsItem(ListStats::GetChildStats(result), ListStats::GetChildStats(input)); |
431 | 0 | return; |
432 | 0 | } |
433 | 0 | if (type_id == LogicalTypeId::STRUCT) { |
434 | 0 | auto field_count = StructType::GetChildCount(result.GetType()); |
435 | 0 | for (idx_t i = 0; i < field_count; i++) { |
436 | 0 | ConvertShreddedStatsItem(StructStats::GetChildStats(result, i), StructStats::GetChildStats(input, i)); |
437 | 0 | } |
438 | 0 | return; |
439 | 0 | } |
440 | | //! Primitive leaf - copy the parquet stats if the types line up, otherwise leave it unknown |
441 | 0 | if (result.GetType() == input.GetType()) { |
442 | 0 | result.Copy(input); |
443 | 0 | } else { |
444 | 0 | result.Copy(BaseStatistics::CreateUnknown(result.GetType())); |
445 | 0 | } |
446 | 0 | } |
447 | | |
448 | 0 | bool StringStatsAreValid(const string &stats, bool is_varchar, StringStatsType stats_type) { |
449 | 0 | if (stats_type == StringStatsType::TRUNCATED_STATS) { |
450 | | // truncated stats can contain invalid UTF8 due to truncation - this is fine |
451 | 0 | return true; |
452 | 0 | } |
453 | | // for exact stats we need the stats to be valid because we might emit them |
454 | | // we could optionally convert these into truncated stats... |
455 | | // but if a file has corrupt exact string stats it's likely these are bogus, so just ignore them |
456 | 0 | return StringColumnReader::IsValid(stats, is_varchar); |
457 | 0 | } |
458 | | |
459 | | unique_ptr<BaseStatistics> |
460 | | ParquetStatisticsUtils::TransformParquetStatistics(const LogicalType &type, const ParquetColumnSchema &schema, |
461 | | const duckdb_parquet::Statistics &parquet_stats, bool can_have_nan, |
462 | 0 | optional_ptr<const ColumnChunk> column_chunk) { |
463 | 0 | switch (type.id()) { |
464 | 0 | case LogicalTypeId::BOOLEAN: |
465 | 0 | case LogicalTypeId::UTINYINT: |
466 | 0 | case LogicalTypeId::USMALLINT: |
467 | 0 | case LogicalTypeId::UINTEGER: |
468 | 0 | case LogicalTypeId::UBIGINT: |
469 | 0 | case LogicalTypeId::TINYINT: |
470 | 0 | case LogicalTypeId::SMALLINT: |
471 | 0 | case LogicalTypeId::INTEGER: |
472 | 0 | case LogicalTypeId::BIGINT: |
473 | 0 | case LogicalTypeId::DATE: |
474 | 0 | case LogicalTypeId::TIME: |
475 | 0 | case LogicalTypeId::TIME_TZ: |
476 | 0 | case LogicalTypeId::TIMESTAMP: |
477 | 0 | case LogicalTypeId::TIMESTAMP_TZ: |
478 | 0 | case LogicalTypeId::TIMESTAMP_TZ_NS: |
479 | 0 | case LogicalTypeId::TIMESTAMP_SEC: |
480 | 0 | case LogicalTypeId::TIMESTAMP_MS: |
481 | 0 | case LogicalTypeId::TIMESTAMP_NS: |
482 | 0 | case LogicalTypeId::DECIMAL: |
483 | 0 | case LogicalTypeId::UUID: |
484 | 0 | return CreateNumericStats(type, schema, parquet_stats); |
485 | 0 | case LogicalTypeId::FLOAT: |
486 | 0 | case LogicalTypeId::DOUBLE: |
487 | 0 | if (can_have_nan) { |
488 | | // Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it |
489 | | // does, we create stats without an upper max value, as NaN compares larger than anything else. |
490 | 0 | return CreateFloatingPointStats(type, schema, parquet_stats); |
491 | 0 | } else { |
492 | | // Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains |
493 | | // NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is |
494 | | // probably the best we can do for now. |
495 | 0 | return CreateNumericStats(type, schema, parquet_stats); |
496 | 0 | } |
497 | 0 | break; |
498 | 0 | case LogicalTypeId::BLOB: |
499 | 0 | case LogicalTypeId::VARCHAR: { |
500 | 0 | auto string_stats = StringStats::CreateUnknown(type); |
501 | 0 | const bool is_varchar = type.id() == LogicalTypeId::VARCHAR; |
502 | 0 | auto min_stats_type = parquet_stats.__isset.is_min_value_exact && parquet_stats.is_min_value_exact |
503 | 0 | ? StringStatsType::EXACT_STATS |
504 | 0 | : StringStatsType::TRUNCATED_STATS; |
505 | 0 | auto max_stats_type = parquet_stats.__isset.is_max_value_exact && parquet_stats.is_max_value_exact |
506 | 0 | ? StringStatsType::EXACT_STATS |
507 | 0 | : StringStatsType::TRUNCATED_STATS; |
508 | 0 | if (parquet_stats.__isset.min_value && |
509 | 0 | StringStatsAreValid(parquet_stats.min_value, is_varchar, min_stats_type)) { |
510 | 0 | StringStats::SetMin(string_stats, parquet_stats.min_value, min_stats_type); |
511 | 0 | } else if (parquet_stats.__isset.min && StringStatsAreValid(parquet_stats.min, is_varchar, min_stats_type)) { |
512 | 0 | StringStats::SetMin(string_stats, parquet_stats.min, min_stats_type); |
513 | 0 | } |
514 | 0 | if (parquet_stats.__isset.max_value && |
515 | 0 | StringStatsAreValid(parquet_stats.max_value, is_varchar, max_stats_type)) { |
516 | 0 | StringStats::SetMax(string_stats, parquet_stats.max_value, max_stats_type); |
517 | 0 | } else if (parquet_stats.__isset.max && StringStatsAreValid(parquet_stats.max, is_varchar, max_stats_type)) { |
518 | 0 | StringStats::SetMax(string_stats, parquet_stats.max, max_stats_type); |
519 | 0 | } |
520 | 0 | return string_stats.ToUnique(); |
521 | 0 | } |
522 | 0 | case LogicalTypeId::GEOMETRY: { |
523 | 0 | if (!column_chunk) { |
524 | 0 | break; |
525 | 0 | } |
526 | 0 | auto geo_stats = GeometryStats::CreateUnknown(type); |
527 | 0 | if (column_chunk->meta_data.__isset.geospatial_statistics) { |
528 | 0 | if (column_chunk->meta_data.geospatial_statistics.__isset.bbox) { |
529 | 0 | auto &bbox = column_chunk->meta_data.geospatial_statistics.bbox; |
530 | 0 | auto &stats_bbox = GeometryStats::GetExtent(geo_stats); |
531 | | |
532 | | // xmin > xmax is allowed if the geometry crosses the antimeridian, |
533 | | // but we don't handle this right now |
534 | 0 | if (bbox.xmin <= bbox.xmax) { |
535 | 0 | stats_bbox.x_min = bbox.xmin; |
536 | 0 | stats_bbox.x_max = bbox.xmax; |
537 | 0 | } |
538 | |
|
539 | 0 | if (bbox.ymin <= bbox.ymax) { |
540 | 0 | stats_bbox.y_min = bbox.ymin; |
541 | 0 | stats_bbox.y_max = bbox.ymax; |
542 | 0 | } |
543 | |
|
544 | 0 | if (bbox.__isset.zmin && bbox.__isset.zmax && bbox.zmin <= bbox.zmax) { |
545 | 0 | stats_bbox.z_min = bbox.zmin; |
546 | 0 | stats_bbox.z_max = bbox.zmax; |
547 | 0 | } |
548 | |
|
549 | 0 | if (bbox.__isset.mmin && bbox.__isset.mmax && bbox.mmin <= bbox.mmax) { |
550 | 0 | stats_bbox.m_min = bbox.mmin; |
551 | 0 | stats_bbox.m_max = bbox.mmax; |
552 | 0 | } |
553 | 0 | } |
554 | 0 | if (column_chunk->meta_data.geospatial_statistics.__isset.geospatial_types) { |
555 | 0 | auto &types = column_chunk->meta_data.geospatial_statistics.geospatial_types; |
556 | 0 | auto &stats_types = GeometryStats::GetTypes(geo_stats); |
557 | | |
558 | | // if types are set but empty, that still means "any type" - so we leave stats_types as-is (unknown) |
559 | | // otherwise, clear and set to the actual types |
560 | |
|
561 | 0 | if (!types.empty()) { |
562 | 0 | stats_types.Clear(); |
563 | 0 | for (auto &geom_type : types) { |
564 | 0 | stats_types.AddWKBType(geom_type); |
565 | 0 | } |
566 | 0 | } |
567 | 0 | } |
568 | 0 | } |
569 | 0 | return geo_stats.ToUnique(); |
570 | 0 | } |
571 | 0 | default: |
572 | 0 | break; |
573 | 0 | } // end of type switch |
574 | | |
575 | | // no specific stats, only create unknown stats to hold validity information |
576 | 0 | auto unknown_stats = BaseStatistics::CreateUnknown(type); |
577 | 0 | return unknown_stats.ToUnique(); |
578 | 0 | } |
579 | | |
580 | | unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema, |
581 | | const vector<ColumnChunk> &columns, |
582 | 0 | bool can_have_nan) { |
583 | | // Not supported types |
584 | 0 | auto &type = schema.type; |
585 | 0 | if (type.id() == LogicalTypeId::ARRAY || type.id() == LogicalTypeId::MAP) { |
586 | 0 | return nullptr; |
587 | 0 | } |
588 | | |
589 | 0 | unique_ptr<BaseStatistics> row_group_stats; |
590 | |
|
591 | 0 | if (type.id() == LogicalTypeId::LIST) { |
592 | 0 | auto list_stats = ListStats::CreateUnknown(type); |
593 | 0 | auto &child_schema = schema.children[0]; |
594 | 0 | auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan); |
595 | 0 | ListStats::SetChildStats(list_stats, std::move(child_stats)); |
596 | 0 | row_group_stats = list_stats.ToUnique(); |
597 | 0 | return row_group_stats; |
598 | 0 | } |
599 | | // Structs are handled differently (they dont have stats) |
600 | 0 | if (type.id() == LogicalTypeId::STRUCT) { |
601 | 0 | auto struct_stats = StructStats::CreateUnknown(type); |
602 | | // Recurse into child readers |
603 | 0 | for (idx_t i = 0; i < schema.children.size(); i++) { |
604 | 0 | auto &child_schema = schema.children[i]; |
605 | 0 | auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan); |
606 | 0 | StructStats::SetChildStats(struct_stats, i, std::move(child_stats)); |
607 | 0 | } |
608 | 0 | row_group_stats = struct_stats.ToUnique(); |
609 | 0 | return row_group_stats; |
610 | 0 | } else if (schema.schema_type == ParquetColumnSchemaType::VARIANT) { |
611 | 0 | auto children_count = schema.children.size(); |
612 | 0 | if (children_count != 3) { |
613 | 0 | return nullptr; |
614 | 0 | } |
615 | | //! Create the VARIANT stats |
616 | 0 | auto &typed_value = schema.children[2]; |
617 | 0 | LogicalType logical_type; |
618 | 0 | if (!VariantColumnReader::TypedValueLayoutToType(typed_value.type, logical_type)) { |
619 | | //! We couldn't convert the parquet typed_value to a structured type (likely because a nested 'typed_value' |
620 | | //! field is missing) |
621 | 0 | return nullptr; |
622 | 0 | } |
623 | 0 | auto shredding_type = TypeVisitor::VisitReplace(logical_type, [](const LogicalType &type) { |
624 | 0 | return LogicalType::STRUCT({{"typed_value", type}, {"untyped_value_index", LogicalType::UINTEGER}}); |
625 | 0 | }); |
626 | 0 | auto variant_stats = VariantStats::CreateShredded(shredding_type); |
627 | | |
628 | | //! Take the root stats |
629 | 0 | auto &shredded_stats = VariantStats::GetShreddedStats(variant_stats); |
630 | 0 | auto &untyped_value_index_stats = StructStats::GetChildStats(shredded_stats, VariantStats::UNTYPED_VALUE_INDEX); |
631 | 0 | auto &typed_value_stats = StructStats::GetChildStats(shredded_stats, VariantStats::TYPED_VALUE_INDEX); |
632 | | |
633 | | //! Convert the root 'value' -> 'untyped_value_index' |
634 | 0 | auto &value = schema.children[1]; |
635 | 0 | D_ASSERT(value.name == "value"); |
636 | 0 | auto value_stats = ParquetStatisticsUtils::TransformColumnStatistics(value, columns, can_have_nan); |
637 | | //! Best-effort: nodes whose stats can't be converted are left UNKNOWN (not fully shredded) rather |
638 | | //! than discarding the statistics for the entire variant column |
639 | 0 | ConvertUnshreddedStats(untyped_value_index_stats, value_stats.get()); |
640 | |
|
641 | 0 | auto parquet_typed_value_stats = |
642 | 0 | ParquetStatisticsUtils::TransformColumnStatistics(typed_value, columns, can_have_nan); |
643 | 0 | ConvertShreddedStats(typed_value_stats, parquet_typed_value_stats.get()); |
644 | | //! Set validity to UNKNOWN |
645 | 0 | variant_stats.SetHasNoNull(); |
646 | 0 | variant_stats.SetHasNull(); |
647 | 0 | return variant_stats.ToUnique(); |
648 | 0 | } |
649 | | |
650 | | // Otherwise, its a standard column with stats |
651 | 0 | auto &column_chunk = columns[schema.column_index]; |
652 | 0 | if (!column_chunk.__isset.meta_data || !column_chunk.meta_data.__isset.statistics) { |
653 | | // no stats present for row group |
654 | 0 | return nullptr; |
655 | 0 | } |
656 | 0 | auto &parquet_stats = column_chunk.meta_data.statistics; |
657 | 0 | row_group_stats = TransformParquetStatistics(type, schema, parquet_stats, can_have_nan, &column_chunk); |
658 | | |
659 | | // null count is generic |
660 | 0 | if (row_group_stats) { |
661 | 0 | row_group_stats->Set(StatsInfo::CAN_HAVE_NULL_AND_VALID_VALUES); |
662 | 0 | if (parquet_stats.__isset.null_count && parquet_stats.null_count == 0) { |
663 | 0 | row_group_stats->Set(StatsInfo::CANNOT_HAVE_NULL_VALUES); |
664 | 0 | } |
665 | 0 | if (parquet_stats.__isset.null_count && parquet_stats.null_count == column_chunk.meta_data.num_values) { |
666 | 0 | row_group_stats->Set(StatsInfo::CANNOT_HAVE_VALID_VALUES); |
667 | 0 | } |
668 | 0 | } |
669 | 0 | return row_group_stats; |
670 | 0 | } |
671 | | |
672 | 0 | static bool HasFilterConstants(const Expression &expr) { |
673 | 0 | if (BoundComparisonExpression::IsComparison(expr)) { |
674 | 0 | auto &comp = expr.Cast<BoundFunctionExpression>(); |
675 | 0 | if (comp.GetExpressionType() != ExpressionType::COMPARE_EQUAL) { |
676 | 0 | return false; |
677 | 0 | } |
678 | 0 | auto &right = BoundComparisonExpression::Right(comp); |
679 | 0 | if (right.GetExpressionType() != ExpressionType::VALUE_CONSTANT) { |
680 | 0 | return false; |
681 | 0 | } |
682 | 0 | auto &constant = right.Cast<BoundConstantExpression>(); |
683 | 0 | return !constant.GetValue().IsNull(); |
684 | 0 | } |
685 | 0 | if (expr.GetExpressionClass() != ExpressionClass::BOUND_CONJUNCTION) { |
686 | 0 | return false; |
687 | 0 | } |
688 | 0 | bool child_has_constant = false; |
689 | 0 | ExpressionIterator::EnumerateChildren(expr, [&](const Expression &child) { |
690 | 0 | if (!child_has_constant) { |
691 | 0 | child_has_constant = HasFilterConstants(child); |
692 | 0 | } |
693 | 0 | }); |
694 | 0 | return child_has_constant; |
695 | 0 | } |
696 | | |
697 | 0 | static bool HasFilterConstants(const TableFilter &duckdb_filter) { |
698 | 0 | auto &expr_filter = ExpressionFilter::GetExpressionFilter(duckdb_filter, "ParquetStatistics::HasFilterConstants"); |
699 | 0 | return HasFilterConstants(*expr_filter.expr); |
700 | 0 | } |
701 | | |
702 | | template <class T> |
703 | 0 | static uint64_t ValueXH64FixedWidth(const Value &constant) { |
704 | 0 | T val = constant.GetValue<T>(); |
705 | 0 | return duckdb_zstd::XXH64(&val, sizeof(val), 0); |
706 | 0 | } Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<int>(duckdb::Value const&) Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<unsigned int>(duckdb::Value const&) Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<unsigned long>(duckdb::Value const&) Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<long>(duckdb::Value const&) Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<float>(duckdb::Value const&) Unexecuted instantiation: parquet_statistics.cpp:unsigned long duckdb::ValueXH64FixedWidth<double>(duckdb::Value const&) |
707 | | |
708 | | // TODO we can only this if the parquet representation of the type exactly matches the duckdb rep! |
709 | | // TODO TEST THIS! |
710 | | // TODO perhaps we can re-use some writer infra here |
711 | 0 | static uint64_t ValueXXH64(const Value &constant) { |
712 | 0 | switch (constant.type().InternalType()) { |
713 | 0 | case PhysicalType::UINT8: |
714 | 0 | return ValueXH64FixedWidth<int32_t>(constant); |
715 | 0 | case PhysicalType::INT8: |
716 | 0 | return ValueXH64FixedWidth<int32_t>(constant); |
717 | 0 | case PhysicalType::UINT16: |
718 | 0 | return ValueXH64FixedWidth<int32_t>(constant); |
719 | 0 | case PhysicalType::INT16: |
720 | 0 | return ValueXH64FixedWidth<int32_t>(constant); |
721 | 0 | case PhysicalType::UINT32: |
722 | 0 | return ValueXH64FixedWidth<uint32_t>(constant); |
723 | 0 | case PhysicalType::INT32: |
724 | 0 | return ValueXH64FixedWidth<int32_t>(constant); |
725 | 0 | case PhysicalType::UINT64: |
726 | 0 | return ValueXH64FixedWidth<uint64_t>(constant); |
727 | 0 | case PhysicalType::INT64: |
728 | 0 | return ValueXH64FixedWidth<int64_t>(constant); |
729 | 0 | case PhysicalType::FLOAT: |
730 | 0 | return ValueXH64FixedWidth<float>(constant); |
731 | 0 | case PhysicalType::DOUBLE: |
732 | 0 | return ValueXH64FixedWidth<double>(constant); |
733 | 0 | case PhysicalType::VARCHAR: { |
734 | 0 | auto val = constant.GetValue<string>(); |
735 | 0 | return duckdb_zstd::XXH64(val.c_str(), val.length(), 0); |
736 | 0 | } |
737 | 0 | default: |
738 | 0 | return 0; |
739 | 0 | } |
740 | 0 | } |
741 | | |
742 | 0 | static bool ApplyBloomFilter(const Expression &expr, ParquetBloomFilter &bloom_filter) { |
743 | 0 | if (BoundComparisonExpression::IsComparison(expr)) { |
744 | 0 | auto &comp = expr.Cast<BoundFunctionExpression>(); |
745 | 0 | if (comp.GetExpressionType() != ExpressionType::COMPARE_EQUAL) { |
746 | 0 | return false; |
747 | 0 | } |
748 | 0 | auto &right = BoundComparisonExpression::Right(comp); |
749 | 0 | if (right.GetExpressionType() != ExpressionType::VALUE_CONSTANT) { |
750 | 0 | return false; |
751 | 0 | } |
752 | 0 | auto &constant = right.Cast<BoundConstantExpression>(); |
753 | 0 | D_ASSERT(!constant.GetValue().IsNull()); |
754 | 0 | auto hash = ValueXXH64(constant.GetValue()); |
755 | 0 | return hash > 0 && !bloom_filter.FilterCheck(hash); |
756 | 0 | } |
757 | 0 | if (expr.GetExpressionClass() != ExpressionClass::BOUND_CONJUNCTION) { |
758 | 0 | return false; |
759 | 0 | } |
760 | 0 | switch (expr.GetExpressionType()) { |
761 | 0 | case ExpressionType::CONJUNCTION_AND: { |
762 | 0 | bool any_children_true = false; |
763 | 0 | ExpressionIterator::EnumerateChildren( |
764 | 0 | expr, [&](const Expression &child) { any_children_true |= ApplyBloomFilter(child, bloom_filter); }); |
765 | 0 | return any_children_true; |
766 | 0 | } |
767 | 0 | case ExpressionType::CONJUNCTION_OR: { |
768 | 0 | bool all_children_true = true; |
769 | 0 | ExpressionIterator::EnumerateChildren( |
770 | 0 | expr, [&](const Expression &child) { all_children_true &= ApplyBloomFilter(child, bloom_filter); }); |
771 | 0 | return all_children_true; |
772 | 0 | } |
773 | 0 | default: |
774 | 0 | return false; |
775 | 0 | } |
776 | 0 | } |
777 | | |
778 | 0 | static bool ApplyBloomFilter(const TableFilter &duckdb_filter, ParquetBloomFilter &bloom_filter) { |
779 | 0 | auto &expr_filter = ExpressionFilter::GetExpressionFilter(duckdb_filter, "ParquetStatistics::ApplyBloomFilter"); |
780 | 0 | return ApplyBloomFilter(*expr_filter.expr, bloom_filter); |
781 | 0 | } |
782 | | |
783 | 0 | bool ParquetStatisticsUtils::BloomFilterSupported(const LogicalTypeId &type_id) { |
784 | 0 | switch (type_id) { |
785 | 0 | case LogicalTypeId::TINYINT: |
786 | 0 | case LogicalTypeId::UTINYINT: |
787 | 0 | case LogicalTypeId::SMALLINT: |
788 | 0 | case LogicalTypeId::USMALLINT: |
789 | 0 | case LogicalTypeId::INTEGER: |
790 | 0 | case LogicalTypeId::UINTEGER: |
791 | 0 | case LogicalTypeId::BIGINT: |
792 | 0 | case LogicalTypeId::UBIGINT: |
793 | 0 | case LogicalTypeId::FLOAT: |
794 | 0 | case LogicalTypeId::DOUBLE: |
795 | 0 | case LogicalTypeId::VARCHAR: |
796 | 0 | case LogicalTypeId::BLOB: |
797 | 0 | return true; |
798 | 0 | default: |
799 | 0 | return false; |
800 | 0 | } |
801 | 0 | } |
802 | | |
803 | | bool ParquetStatisticsUtils::BloomFilterExcludes(const TableFilter &duckdb_filter, |
804 | | const duckdb_parquet::ColumnMetaData &column_meta_data, |
805 | 0 | TProtocol &file_proto, Allocator &allocator) { |
806 | 0 | if (!HasFilterConstants(duckdb_filter) || !column_meta_data.__isset.bloom_filter_offset || |
807 | 0 | column_meta_data.bloom_filter_offset <= 0) { |
808 | 0 | return false; |
809 | 0 | } |
810 | | // TODO check length against file length! |
811 | | |
812 | 0 | auto &transport = reinterpret_cast<ThriftFileTransport &>(*file_proto.getTransport()); |
813 | 0 | transport.SetLocation(column_meta_data.bloom_filter_offset); |
814 | 0 | if (column_meta_data.__isset.bloom_filter_length && column_meta_data.bloom_filter_length > 0) { |
815 | 0 | transport.Prefetch(column_meta_data.bloom_filter_offset, column_meta_data.bloom_filter_length); |
816 | 0 | } |
817 | |
|
818 | 0 | duckdb_parquet::BloomFilterHeader filter_header; |
819 | | // TODO the bloom filter could be encrypted, too, so need to double check that this is NOT the case |
820 | 0 | filter_header.read(&file_proto); |
821 | 0 | if (!filter_header.algorithm.__isset.BLOCK || !filter_header.compression.__isset.UNCOMPRESSED || |
822 | 0 | !filter_header.hash.__isset.XXHASH) { |
823 | 0 | return false; |
824 | 0 | } |
825 | | |
826 | 0 | auto new_buffer = make_uniq<ResizeableBuffer>(allocator, filter_header.numBytes); |
827 | 0 | transport.read(new_buffer->ptr, filter_header.numBytes); |
828 | 0 | ParquetBloomFilter bloom_filter(std::move(new_buffer)); |
829 | 0 | return ApplyBloomFilter(duckdb_filter, bloom_filter); |
830 | 0 | } |
831 | | |
832 | 0 | ParquetBloomFilter::ParquetBloomFilter(idx_t num_entries, double bloom_filter_false_positive_ratio) { |
833 | | // aim for hit ratio of 0.01% |
834 | | // see http://tfk.mit.edu/pdf/bloom.pdf |
835 | 0 | double f = bloom_filter_false_positive_ratio; |
836 | 0 | double k = 8.0; |
837 | 0 | double n = LossyNumericCast<double>(num_entries); |
838 | 0 | double m = -k * n / std::log(1 - std::pow(f, 1 / k)); |
839 | 0 | auto b = MaxValue<idx_t>(NextPowerOfTwo(LossyNumericCast<idx_t>(m / k)) / 32, 1); |
840 | |
|
841 | 0 | D_ASSERT(b > 0 && IsPowerOfTwo(b)); |
842 | |
|
843 | 0 | data = make_uniq<ResizeableBuffer>(Allocator::DefaultAllocator(), sizeof(ParquetBloomBlock) * b); |
844 | 0 | data->zero(); |
845 | 0 | block_count = data->len / sizeof(ParquetBloomBlock); |
846 | 0 | D_ASSERT(data->len % sizeof(ParquetBloomBlock) == 0); |
847 | 0 | } |
848 | | |
849 | 0 | ParquetBloomFilter::ParquetBloomFilter(unique_ptr<ResizeableBuffer> data_p) { |
850 | 0 | D_ASSERT(data_p->len % sizeof(ParquetBloomBlock) == 0); |
851 | 0 | data = std::move(data_p); |
852 | 0 | block_count = data->len / sizeof(ParquetBloomBlock); |
853 | 0 | D_ASSERT(data->len % sizeof(ParquetBloomBlock) == 0); |
854 | 0 | } |
855 | | |
856 | 0 | void ParquetBloomFilter::FilterInsert(uint64_t x) { |
857 | 0 | auto blocks = reinterpret_cast<ParquetBloomBlock *>(data->ptr); |
858 | 0 | uint64_t i = ((x >> 32) * block_count) >> 32; |
859 | 0 | auto &b = blocks[i]; |
860 | 0 | ParquetBloomBlock::BlockInsert(b, x); |
861 | 0 | } |
862 | | |
863 | 0 | bool ParquetBloomFilter::FilterCheck(uint64_t x) { |
864 | 0 | auto blocks = reinterpret_cast<ParquetBloomBlock *>(data->ptr); |
865 | 0 | auto i = ((x >> 32) * block_count) >> 32; |
866 | 0 | return ParquetBloomBlock::BlockCheck(blocks[i], x); |
867 | 0 | } |
868 | | |
869 | | // compiler optimizes this into a single instruction (popcnt) |
870 | 0 | static uint8_t PopCnt64(uint64_t n) { |
871 | 0 | uint8_t c = 0; |
872 | 0 | for (; n; ++c) { |
873 | 0 | n &= n - 1; |
874 | 0 | } |
875 | 0 | return c; |
876 | 0 | } |
877 | | |
878 | 0 | double ParquetBloomFilter::OneRatio() { |
879 | 0 | auto bloom_ptr = reinterpret_cast<uint64_t *>(data->ptr); |
880 | 0 | idx_t one_count = 0; |
881 | 0 | for (idx_t b_idx = 0; b_idx < data->len / sizeof(uint64_t); ++b_idx) { |
882 | 0 | one_count += PopCnt64(bloom_ptr[b_idx]); |
883 | 0 | } |
884 | 0 | return LossyNumericCast<double>(one_count) / (LossyNumericCast<double>(data->len) * 8.0); |
885 | 0 | } |
886 | | |
887 | 0 | ResizeableBuffer *ParquetBloomFilter::Get() { |
888 | 0 | return data.get(); |
889 | 0 | } |
890 | | |
891 | | } // namespace duckdb |