Coverage Report

Created: 2026-01-31 08:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/src/storage/statistics/distinct_statistics.cpp
Line
Count
Source
1
#include "duckdb/storage/statistics/distinct_statistics.hpp"
2
3
#include "duckdb/common/string_util.hpp"
4
#include "duckdb/common/vector_operations/vector_operations.hpp"
5
6
namespace duckdb {
7
8
3.25k
DistinctStatistics::DistinctStatistics() : log(make_uniq<HyperLogLog>()), sample_count(0), total_count(0) {
9
3.25k
}
10
11
DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count)
12
0
    : log(std::move(log)), sample_count(sample_count), total_count(total_count) {
13
0
}
14
15
0
unique_ptr<DistinctStatistics> DistinctStatistics::Copy() const {
16
0
  return make_uniq<DistinctStatistics>(log->Copy(), sample_count, total_count);
17
0
}
18
19
2.10k
void DistinctStatistics::Merge(const DistinctStatistics &other) {
20
2.10k
  log->Merge(*other.log);
21
2.10k
  sample_count += other.sample_count;
22
2.10k
  total_count += other.total_count;
23
2.10k
}
24
25
1.05k
void DistinctStatistics::UpdateSample(Vector &new_data, idx_t count, Vector &hashes) {
26
1.05k
  total_count += count;
27
1.05k
  const auto original_count = count;
28
1.05k
  const auto sample_rate = new_data.GetType().IsIntegral() ? INTEGRAL_SAMPLE_RATE : BASE_SAMPLE_RATE;
29
  // Sample up to 'sample_rate' of STANDARD_VECTOR_SIZE of this vector (at least 1)
30
1.05k
  count = MaxValue<idx_t>(LossyNumericCast<idx_t>(sample_rate * static_cast<double>(STANDARD_VECTOR_SIZE)), 1);
31
  // But never more than the original count
32
1.05k
  count = MinValue<idx_t>(count, original_count);
33
34
1.05k
  UpdateInternal(new_data, count, hashes);
35
1.05k
}
36
37
0
void DistinctStatistics::Update(Vector &new_data, idx_t count, Vector &hashes) {
38
0
  total_count += count;
39
0
  UpdateInternal(new_data, count, hashes);
40
0
}
41
42
1.05k
void DistinctStatistics::UpdateInternal(Vector &new_data, idx_t count, Vector &hashes) {
43
1.05k
  sample_count += count;
44
1.05k
  VectorOperations::Hash(new_data, hashes, count);
45
46
1.05k
  log->Update(new_data, hashes, count);
47
1.05k
}
48
49
0
string DistinctStatistics::ToString() const {
50
0
  return StringUtil::Format("[Approx Unique: %llu]", GetCount());
51
0
}
52
53
0
idx_t DistinctStatistics::GetCount() const {
54
0
  if (sample_count == 0 || total_count == 0) {
55
0
    return 0;
56
0
  }
57
58
0
  double u = static_cast<double>(MinValue<idx_t>(log->Count(), sample_count));
59
0
  double s = static_cast<double>(sample_count.load());
60
0
  double n = static_cast<double>(total_count.load());
61
62
  // Assume this proportion of the the sampled values occurred only once
63
0
  double u1 = pow(u / s, 2) * u;
64
65
  // Estimate total uniques using Good Turing Estimation
66
0
  idx_t estimate = LossyNumericCast<idx_t>(u + u1 / s * (n - s));
67
0
  return MinValue<idx_t>(estimate, total_count);
68
0
}
69
70
3.64k
bool DistinctStatistics::TypeIsSupported(const LogicalType &type) {
71
3.64k
  switch (type.InternalType()) {
72
0
  case PhysicalType::LIST:
73
0
  case PhysicalType::STRUCT:
74
0
  case PhysicalType::ARRAY:
75
0
    return false; // We don't support nested types
76
0
  case PhysicalType::BIT:
77
386
  case PhysicalType::BOOL:
78
386
    return false; // Doesn't make much sense
79
3.25k
  default:
80
3.25k
    return true;
81
3.64k
  }
82
3.64k
}
83
84
} // namespace duckdb