ColumnStatistics.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc.metadata.statistics;
import com.facebook.presto.orc.metadata.statistics.StatisticsHasher.Hashable;
import com.facebook.presto.orc.proto.DwrfProto;
import com.google.common.base.MoreObjects.ToStringHelper;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import org.openjdk.jol.info.ClassLayout;
import java.util.List;
import java.util.Objects;
import static com.facebook.presto.orc.metadata.statistics.BinaryStatisticsBuilder.mergeBinaryStatistics;
import static com.facebook.presto.orc.metadata.statistics.BooleanStatisticsBuilder.mergeBooleanStatistics;
import static com.facebook.presto.orc.metadata.statistics.DateStatisticsBuilder.mergeDateStatistics;
import static com.facebook.presto.orc.metadata.statistics.DoubleStatisticsBuilder.mergeDoubleStatistics;
import static com.facebook.presto.orc.metadata.statistics.IntegerStatisticsBuilder.mergeIntegerStatistics;
import static com.facebook.presto.orc.metadata.statistics.LongDecimalStatisticsBuilder.mergeDecimalStatistics;
import static com.facebook.presto.orc.metadata.statistics.MapColumnStatisticsBuilder.mergeMapStatistics;
import static com.facebook.presto.orc.metadata.statistics.StringStatisticsBuilder.mergeStringStatistics;
import static com.google.common.base.MoreObjects.toStringHelper;
/**
 * Type-agnostic statistics for a single column: an optional value count, an optional
 * raw size, an optional storage size and an optional bloom filter.
 * <p>
 * Every typed accessor ({@link #getBooleanStatistics()}, {@link #getIntegerStatistics()}, ...)
 * returns {@code null} in this class. {@link #createColumnStatistics} instantiates a
 * type-specific class whenever typed statistics are supplied; presumably those classes
 * override the matching accessor (they are not visible from this file).
 */
public class ColumnStatistics
        implements Hashable
{
    private static final int INSTANCE_SIZE = ClassLayout.parseClass(ColumnStatistics.class).instanceSize();

    // Each optional long is kept as a presence flag plus a primitive value.
    // The primitive value is 0 whenever the matching flag is false.
    private final boolean hasNumberOfValues;
    private final long numberOfValues;
    private final boolean hasRawSize;
    private final long rawSize;
    private final boolean hasStorageSize;
    private final long storageSize;
    private final HiveBloomFilter bloomFilter;

    /**
     * Creates statistics without any type-specific part.
     *
     * @param numberOfValues number of values, or {@code null} when unknown
     * @param bloomFilter bloom filter for the column, or {@code null} when there is none
     * @param rawSize raw size, or {@code null} when unknown
     * @param storageSize storage size, or {@code null} when unknown
     */
    public ColumnStatistics(
            Long numberOfValues,
            HiveBloomFilter bloomFilter,
            Long rawSize,
            Long storageSize)
    {
        this.hasNumberOfValues = (numberOfValues != null);
        this.numberOfValues = (numberOfValues == null) ? 0 : numberOfValues;
        this.hasRawSize = (rawSize != null);
        this.rawSize = (rawSize == null) ? 0 : rawSize;
        this.hasStorageSize = (storageSize != null);
        this.storageSize = (storageSize == null) ? 0 : storageSize;
        this.bloomFilter = bloomFilter;
    }

    /**
     * @return {@code true} when a value count was supplied at construction
     */
    public boolean hasNumberOfValues()
    {
        return hasNumberOfValues;
    }

    /**
     * @return the value count, or 0 when no count was supplied
     */
    public long getNumberOfValues()
    {
        if (hasNumberOfValues) {
            return numberOfValues;
        }
        return 0;
    }

    /**
     * @return {@code true} when a raw size was supplied at construction
     */
    public boolean hasRawSize()
    {
        return hasRawSize;
    }

    /**
     * @return the raw size, or 0 when {@link #hasRawSize()} is {@code false}
     */
    public long getRawSize()
    {
        if (hasRawSize()) {
            return rawSize;
        }
        return 0;
    }

    /**
     * @return {@code true} when a storage size was supplied at construction
     */
    public boolean hasStorageSize()
    {
        return hasStorageSize;
    }

    /**
     * @return the storage size, or 0 when {@link #hasStorageSize()} is {@code false}
     */
    public long getStorageSize()
    {
        if (hasStorageSize()) {
            return storageSize;
        }
        return 0;
    }

    /**
     * @return {@code true} when a value count is present and strictly positive
     */
    public boolean hasMinAverageValueSizeInBytes()
    {
        if (!hasNumberOfValues()) {
            return false;
        }
        return numberOfValues > 0;
    }

    /**
     * Total size in bytes of the values covered by these statistics.
     * This base implementation always returns 0.
     * <p>
     * NOTE(review): the earlier wording of this comment described a minimum average
     * value size used as a lower bound on the amount of data to be loaded, which does
     * not match the method name. Presumably the total is divided by the value count
     * by callers that first check {@link #hasMinAverageValueSizeInBytes()} - confirm.
     */
    public long getTotalValueSizeInBytes()
    {
        return 0;
    }

    /**
     * @return always {@code null} in this class
     */
    public BooleanStatistics getBooleanStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public DateStatistics getDateStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public DoubleStatistics getDoubleStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public IntegerStatistics getIntegerStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public StringStatistics getStringStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public DecimalStatistics getDecimalStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public BinaryStatistics getBinaryStatistics()
    {
        return null;
    }

    /**
     * @return always {@code null} in this class
     */
    public MapStatistics getMapStatistics()
    {
        return null;
    }

    /**
     * @return the bloom filter supplied at construction, possibly {@code null}
     */
    public HiveBloomFilter getBloomFilter()
    {
        return bloomFilter;
    }

    /**
     * @return the retained size reported by the bloom filter, or 0 when there is no bloom filter
     */
    protected final long getMembersSizeInBytes()
    {
        if (bloomFilter == null) {
            return 0;
        }
        return bloomFilter.getRetainedSizeInBytes();
    }

    /**
     * @return the instance size of this class plus {@link #getMembersSizeInBytes()}
     */
    public long getRetainedSizeInBytes()
    {
        return INSTANCE_SIZE + getMembersSizeInBytes();
    }

    /**
     * Compares the fields declared in this class, without any class check.
     *
     * @param that statistics to compare against; must not be {@code null}
     * @return {@code true} when all presence flags, values and bloom filters are equal
     */
    protected final boolean equalsInternal(ColumnStatistics that)
    {
        if (hasNumberOfValues != that.hasNumberOfValues || numberOfValues != that.numberOfValues) {
            return false;
        }
        if (hasRawSize != that.hasRawSize || rawSize != that.rawSize) {
            return false;
        }
        if (hasStorageSize != that.hasStorageSize || storageSize != that.storageSize) {
            return false;
        }
        return Objects.equals(getBloomFilter(), that.getBloomFilter());
    }

    @Override
    public boolean equals(Object o)
    {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        // Exact class match: an instance of another class is never equal, even a subclass.
        if (o.getClass() != getClass()) {
            return false;
        }
        return equalsInternal((ColumnStatistics) o);
    }

    @Override
    public int hashCode()
    {
        return Objects.hash(
                hasNumberOfValues,
                getNumberOfValues(),
                hasRawSize(),
                getRawSize(),
                hasStorageSize(),
                getStorageSize(),
                getBloomFilter());
    }

    /**
     * Builds the helper behind {@link #toString()}, pre-populated with the fields of this class.
     *
     * @return a helper that omits null values and already carries the count, the sizes and the bloom filter
     */
    protected ToStringHelper getToStringHelper()
    {
        ToStringHelper helper = toStringHelper(this).omitNullValues();
        helper.add("numberOfValues", getNumberOfValues());
        helper.add("rawSize", getRawSize());
        helper.add("storageSize", getStorageSize());
        helper.add("bloomFilter", getBloomFilter());
        return helper;
    }

    @Override
    public String toString()
    {
        return getToStringHelper().toString();
    }

    @Override
    public void addHash(StatisticsHasher hasher)
    {
        // Used for ORC writer file validation. rawSize and storageSize are deliberately
        // left out of the hash: the reader cannot calculate them during file validation.
        hasher.putOptionalLong(hasNumberOfValues, numberOfValues)
                .putOptionalHashable(getBloomFilter());
    }

    /**
     * Merges the given statistics without an extra storage size and without map key sizes.
     *
     * @param stats statistics to merge
     * @return the merged statistics
     */
    public static ColumnStatistics mergeColumnStatistics(List<ColumnStatistics> stats)
    {
        return mergeColumnStatistics(stats, null, null);
    }

    /**
     * Merges the given statistics into one.
     * <p>
     * Value counts are summed. Raw and storage sizes are summed over the inputs that carry
     * them, and each is present in the result when at least one input carries it. A non-null
     * {@code extraStorageSize} is the starting value of the storage size sum and makes the
     * storage size present. The merged statistics never carry a bloom filter.
     * <p>
     * For an empty list the result has a value count of 0 and no sizes; in that case
     * {@code extraStorageSize} is not used.
     *
     * @param stats statistics to merge
     * @param extraStorageSize additional storage size to include, or {@code null}
     * @param mapKeySizes passed through to the map statistics merge
     * @return the merged statistics
     */
    public static ColumnStatistics mergeColumnStatistics(List<ColumnStatistics> stats, Long extraStorageSize, Object2LongMap<DwrfProto.KeyInfo> mapKeySizes)
    {
        if (stats.isEmpty()) {
            return new ColumnStatistics(0L, null, null, null);
        }

        boolean storageSizeKnown = (extraStorageSize != null);
        long totalStorageSize = storageSizeKnown ? extraStorageSize : 0;
        boolean rawSizeKnown = false;
        long totalRawSize = 0;
        long totalValues = 0;

        for (ColumnStatistics current : stats) {
            totalValues += current.getNumberOfValues();
            if (current.hasRawSize()) {
                totalRawSize += current.getRawSize();
                rawSizeKnown = true;
            }
            if (current.hasStorageSize()) {
                totalStorageSize += current.getStorageSize();
                storageSizeKnown = true;
            }
        }

        Long mergedRawSize = rawSizeKnown ? totalRawSize : null;
        Long mergedStorageSize = storageSizeKnown ? totalStorageSize : null;

        BooleanStatistics mergedBoolean = mergeBooleanStatistics(stats).orElse(null);
        IntegerStatistics mergedInteger = mergeIntegerStatistics(stats).orElse(null);
        DoubleStatistics mergedDouble = mergeDoubleStatistics(stats).orElse(null);
        StringStatistics mergedString = mergeStringStatistics(stats).orElse(null);
        DateStatistics mergedDate = mergeDateStatistics(stats).orElse(null);
        DecimalStatistics mergedDecimal = mergeDecimalStatistics(stats).orElse(null);
        BinaryStatistics mergedBinary = mergeBinaryStatistics(stats).orElse(null);
        MapStatistics mergedMap = mergeMapStatistics(stats, mapKeySizes).orElse(null);

        return createColumnStatistics(
                totalValues,
                mergedRawSize,
                mergedStorageSize,
                mergedBoolean,
                mergedInteger,
                mergedDouble,
                mergedString,
                mergedDate,
                mergedDecimal,
                mergedBinary,
                mergedMap,
                null);
    }

    /**
     * Creates the statistics object matching the supplied typed statistics.
     * <p>
     * The typed arguments are checked in the order boolean, integer, double, string, date,
     * decimal, binary, map; the first non-null one selects the class that is instantiated
     * and any later ones are ignored. When all of them are {@code null} a plain
     * {@link ColumnStatistics} is created.
     *
     * @param numberOfValues number of values, or {@code null} when unknown
     * @param rawSize raw size, or {@code null} when unknown
     * @param storageSize storage size, or {@code null} when unknown
     * @param booleanStatistics boolean statistics, or {@code null}
     * @param integerStatistics integer statistics, or {@code null}
     * @param doubleStatistics double statistics, or {@code null}
     * @param stringStatistics string statistics, or {@code null}
     * @param dateStatistics date statistics, or {@code null}
     * @param decimalStatistics decimal statistics, or {@code null}
     * @param binaryStatistics binary statistics, or {@code null}
     * @param mapStatistics map statistics, or {@code null}
     * @param bloomFilter bloom filter, or {@code null}
     * @return the newly created statistics
     */
    public static ColumnStatistics createColumnStatistics(
            Long numberOfValues,
            Long rawSize,
            Long storageSize,
            BooleanStatistics booleanStatistics,
            IntegerStatistics integerStatistics,
            DoubleStatistics doubleStatistics,
            StringStatistics stringStatistics,
            DateStatistics dateStatistics,
            DecimalStatistics decimalStatistics,
            BinaryStatistics binaryStatistics,
            MapStatistics mapStatistics,
            HiveBloomFilter bloomFilter)
    {
        final ColumnStatistics created;
        if (booleanStatistics != null) {
            created = new BooleanColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, booleanStatistics);
        }
        else if (integerStatistics != null) {
            created = new IntegerColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, integerStatistics);
        }
        else if (doubleStatistics != null) {
            created = new DoubleColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, doubleStatistics);
        }
        else if (stringStatistics != null) {
            created = new StringColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, stringStatistics);
        }
        else if (dateStatistics != null) {
            created = new DateColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, dateStatistics);
        }
        else if (decimalStatistics != null) {
            created = new DecimalColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, decimalStatistics);
        }
        else if (binaryStatistics != null) {
            created = new BinaryColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, binaryStatistics);
        }
        else if (mapStatistics != null) {
            created = new MapColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize, mapStatistics);
        }
        else {
            created = new ColumnStatistics(numberOfValues, bloomFilter, rawSize, storageSize);
        }
        return created;
    }
}