ColumnStatisticsRecorder.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.tpch.statistics;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import io.airlift.tpch.TpchColumnType;
import net.agkn.hll.HLL;
import net.agkn.hll.HLLType;

import java.nio.charset.StandardCharsets;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;

class ColumnStatisticsRecorder
{
    final MinMaxSet<Object> minMaxValues = new MinMaxSet<>();

    /**
     * We use a HLL that provides an exact count till 2^18 values.
     * Choice for {@code log2m} and {@code regwidth} parameters was determined empirically to keep total memory usage in check
     */
    final HLL hll = new HLL(25, 8, 18, true, HLLType.EMPTY);
    private final TpchColumnType type;
    private final HashFunction hashFunction = Hashing.murmur3_128();
    long varCharSize;

    public ColumnStatisticsRecorder(TpchColumnType type)
    {
        this.type = requireNonNull(type, "type is null");
    }

    void record(Comparable<?> value)
    {
        if (value != null) {
            final Hasher hasher = hashFunction.newHasher();

            switch (type.getBase()) {
                case IDENTIFIER:
                    hasher.putLong((Long) value);
                    break;
                case INTEGER:
                case DATE:
                    hasher.putInt((Integer) value);
                    break;
                case DOUBLE:
                    hasher.putDouble((Double) value);
                    break;
                case VARCHAR:
                    hasher.putString((String) value, StandardCharsets.UTF_8);
                    break;
            }
            hll.addRaw(hasher.hash().asLong());

            minMaxValues.add(value);

            if (type.getBase() == TpchColumnType.Base.VARCHAR) {
                varCharSize += ((String) value).length();
            }
        }
    }

    /**
     * Merge statistics from another {@link ColumnStatisticsRecorder} into the current object
     *
     * @param other
     * @return
     */
    public ColumnStatisticsRecorder mergeWith(ColumnStatisticsRecorder other)
    {
        checkArgument(type.equals(other.type), "Merging incompatible column statistics");
        varCharSize += other.varCharSize;
        other.minMaxValues.getMin().ifPresent(minMaxValues::add);
        other.minMaxValues.getMax().ifPresent(minMaxValues::add);
        hll.union(other.hll);
        return this;
    }

    ColumnStatisticsData getRecording()
    {
        return new ColumnStatisticsData(
                Optional.of(getUniqueValuesCount()),
                getLowestValue(),
                getHighestValue(),
                getDataSize());
    }

    private long getUniqueValuesCount()
    {
        return hll.cardinality();
    }

    private Optional<Object> getLowestValue()
    {
        return minMaxValues.getMin();
    }

    private Optional<Object> getHighestValue()
    {
        return minMaxValues.getMax();
    }

    public Optional<Long> getDataSize()
    {
        if (type.getBase() == TpchColumnType.Base.VARCHAR) {
            return Optional.of(varCharSize);
        }
        return Optional.empty();
    }
}