/src/duckdb/src/storage/statistics/distinct_statistics.cpp
Line | Count | Source |
1 | | #include "duckdb/storage/statistics/distinct_statistics.hpp" |
2 | | |
3 | | #include "duckdb/common/string_util.hpp" |
4 | | #include "duckdb/common/vector_operations/vector_operations.hpp" |
5 | | |
6 | | namespace duckdb { |
7 | | |
8 | 3.25k | DistinctStatistics::DistinctStatistics() : log(make_uniq<HyperLogLog>()), sample_count(0), total_count(0) { |
9 | 3.25k | } |
10 | | |
11 | | DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count) |
12 | 0 | : log(std::move(log)), sample_count(sample_count), total_count(total_count) { |
13 | 0 | } |
14 | | |
15 | 0 | unique_ptr<DistinctStatistics> DistinctStatistics::Copy() const { |
16 | 0 | return make_uniq<DistinctStatistics>(log->Copy(), sample_count, total_count); |
17 | 0 | } |
18 | | |
19 | 2.10k | void DistinctStatistics::Merge(const DistinctStatistics &other) { |
20 | 2.10k | log->Merge(*other.log); |
21 | 2.10k | sample_count += other.sample_count; |
22 | 2.10k | total_count += other.total_count; |
23 | 2.10k | } |
24 | | |
25 | 1.05k | void DistinctStatistics::UpdateSample(Vector &new_data, idx_t count, Vector &hashes) { |
26 | 1.05k | total_count += count; |
27 | 1.05k | const auto original_count = count; |
28 | 1.05k | const auto sample_rate = new_data.GetType().IsIntegral() ? INTEGRAL_SAMPLE_RATE : BASE_SAMPLE_RATE; |
29 | | // Sample up to 'sample_rate' of STANDARD_VECTOR_SIZE of this vector (at least 1) |
30 | 1.05k | count = MaxValue<idx_t>(LossyNumericCast<idx_t>(sample_rate * static_cast<double>(STANDARD_VECTOR_SIZE)), 1); |
31 | | // But never more than the original count |
32 | 1.05k | count = MinValue<idx_t>(count, original_count); |
33 | | |
34 | 1.05k | UpdateInternal(new_data, count, hashes); |
35 | 1.05k | } |
36 | | |
37 | 0 | void DistinctStatistics::Update(Vector &new_data, idx_t count, Vector &hashes) { |
38 | 0 | total_count += count; |
39 | 0 | UpdateInternal(new_data, count, hashes); |
40 | 0 | } |
41 | | |
42 | 1.05k | void DistinctStatistics::UpdateInternal(Vector &new_data, idx_t count, Vector &hashes) { |
43 | 1.05k | sample_count += count; |
44 | 1.05k | VectorOperations::Hash(new_data, hashes, count); |
45 | | |
46 | 1.05k | log->Update(new_data, hashes, count); |
47 | 1.05k | } |
48 | | |
49 | 0 | string DistinctStatistics::ToString() const { |
50 | 0 | return StringUtil::Format("[Approx Unique: %llu]", GetCount()); |
51 | 0 | } |
52 | | |
53 | 0 | idx_t DistinctStatistics::GetCount() const { |
54 | 0 | if (sample_count == 0 || total_count == 0) { |
55 | 0 | return 0; |
56 | 0 | } |
57 | | |
58 | 0 | double u = static_cast<double>(MinValue<idx_t>(log->Count(), sample_count)); |
59 | 0 | double s = static_cast<double>(sample_count.load()); |
60 | 0 | double n = static_cast<double>(total_count.load()); |
61 | | |
62 | | // Assume this proportion of the the sampled values occurred only once |
63 | 0 | double u1 = pow(u / s, 2) * u; |
64 | | |
65 | | // Estimate total uniques using Good Turing Estimation |
66 | 0 | idx_t estimate = LossyNumericCast<idx_t>(u + u1 / s * (n - s)); |
67 | 0 | return MinValue<idx_t>(estimate, total_count); |
68 | 0 | } |
69 | | |
70 | 3.64k | bool DistinctStatistics::TypeIsSupported(const LogicalType &type) { |
71 | 3.64k | switch (type.InternalType()) { |
72 | 0 | case PhysicalType::LIST: |
73 | 0 | case PhysicalType::STRUCT: |
74 | 0 | case PhysicalType::ARRAY: |
75 | 0 | return false; // We don't support nested types |
76 | 0 | case PhysicalType::BIT: |
77 | 386 | case PhysicalType::BOOL: |
78 | 386 | return false; // Doesn't make much sense |
79 | 3.25k | default: |
80 | 3.25k | return true; |
81 | 3.64k | } |
82 | 3.64k | } |
83 | | |
84 | | } // namespace duckdb |