HistogramCalculator.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.spi.statistics;
import com.facebook.presto.common.predicate.Range;
import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
import static java.lang.Double.NEGATIVE_INFINITY;
import static java.lang.Double.POSITIVE_INFINITY;
import static java.lang.Double.isFinite;
import static java.lang.Double.isNaN;
import static java.lang.Math.min;
public class HistogramCalculator
{
private HistogramCalculator()
{}
/**
* Calculates the "filter factor" corresponding to the overlap between the statistic range
* and the histogram distribution.
* <br>
* The filter factor is a fractional value in [0.0, 1.0] that represents the proportion of
* tuples in the source column that would be included in the result of a filter where the valid
* values in the filter are represented by the {@code range} parameter of this function.
*
* @param range the intersecting range with the histogram
* @param histogram the source histogram
* @param totalDistinctValues the total number of distinct values in the domain of the histogram
* @param useHeuristics whether to return heuristic values based on constants and/or distinct
* value counts. If false, {@link Estimate#unknown()} will be returned in any case a
* heuristic would have been used
* @return an estimate, x, where 0.0 <= x <= 1.0.
*/
public static Estimate calculateFilterFactor(Range range, double rangeDistinctValues, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics)
{
boolean openHigh = !range.isHighInclusive();
boolean openLow = !range.isLowInclusive();
Estimate min = histogram.inverseCumulativeProbability(0.0);
Estimate max = histogram.inverseCumulativeProbability(1.0);
double rangeLow = range.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY);
double rangeHigh = range.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY);
double rangeLength = rangeHigh - rangeLow;
// range is either above or below histogram
if ((!max.isUnknown() && (openHigh ? max.getValue() <= rangeLow : max.getValue() < rangeLow))
|| (!min.isUnknown() && (openLow ? min.getValue() >= rangeHigh : min.getValue() > rangeHigh))) {
return Estimate.of(0.0);
}
// one of the max/min bounds can't be determined
if ((max.isUnknown() && !min.isUnknown()) || (!max.isUnknown() && min.isUnknown())) {
// when the range length is 0, the filter factor should be 1/distinct value count
if (!useHeuristics) {
return Estimate.unknown();
}
if (rangeLength == 0.0) {
return totalDistinctValues.map(distinct -> 1.0 / distinct);
}
if (isFinite(rangeLength)) {
return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR);
}
return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR);
}
// we know the bounds are both known, so calculate the percentile for each bound
// The inclusivity arguments can be derived from the open-ness of the interval we're
// calculating the filter factor for
// e.g. given a variable with values in [0, 10] to calculate the filter of
// [1, 9) (openness: false, true) we need the percentile from
// [0.0 to 1.0) (openness: false, true) and from [0.0, 9.0) (openness: false, true)
// thus for the "lowPercentile" calculation we should pass "false" to be non-inclusive
// (same as openness) however, on the high-end we want the inclusivity to be the opposite
// of the openness since if it's open, we _don't_ want to include the bound.
Estimate lowPercentile = histogram.cumulativeProbability(rangeLow, openLow);
Estimate highPercentile = histogram.cumulativeProbability(rangeHigh, !openHigh);
// both bounds are probably infinity, use the infinite-infinite heuristic
if (lowPercentile.isUnknown() || highPercentile.isUnknown()) {
if (!useHeuristics) {
return Estimate.unknown();
}
// in the case the histogram has no values
if (totalDistinctValues.equals(Estimate.zero()) || rangeDistinctValues == 0.0) {
return Estimate.of(0.0);
}
// in the case only one is unknown
if (((lowPercentile.isUnknown() && !highPercentile.isUnknown()) ||
(!lowPercentile.isUnknown() && highPercentile.isUnknown())) &&
isFinite(rangeLength)) {
return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR);
}
if (rangeLength == 0.0) {
return totalDistinctValues.map(distinct -> 1.0 / distinct);
}
if (!isNaN(rangeDistinctValues)) {
return totalDistinctValues.map(distinct -> min(1.0, rangeDistinctValues / distinct));
}
return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR);
}
// in the case the range is a single value, this can occur if the input
// filter range is a single value (low == high) OR in the case that the
// bounds of the filter or this histogram are infinite.
// in the case of infinite bounds, we should return an estimate that
// correlates to the overlapping distinct values.
if (lowPercentile.equals(highPercentile)) {
if (!useHeuristics) {
return Estimate.zero();
}
return totalDistinctValues.map(distinct -> 1.0 / distinct);
}
// in the case that we return the entire range, the returned factor percent should be
// proportional to the number of distinct values in the range
if (lowPercentile.equals(Estimate.zero()) && highPercentile.equals(Estimate.of(1.0)) && min.isUnknown() && max.isUnknown()) {
if (!useHeuristics) {
return Estimate.unknown();
}
return totalDistinctValues.flatMap(totalDistinct -> {
if (fuzzyEquals(totalDistinct, 0.0, 1E-6)) {
return Estimate.of(1.0);
}
return Estimate.of(min(1.0, rangeDistinctValues / totalDistinct));
})
// in the case totalDistinct is NaN or 0
.or(() -> Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR));
}
return lowPercentile.flatMap(lowPercent -> highPercentile.map(highPercent -> highPercent - lowPercent));
}
private static boolean fuzzyEquals(double a, double b, double tolerance)
{
return Math.copySign(a - b, 1.0) <= tolerance
// copySign(x, 1.0) is a branch-free version of abs(x), but with different NaN semantics
|| (a == b) // needed to ensure that infinities equal themselves
|| (Double.isNaN(a) && Double.isNaN(b));
}
}