KHyperLogLogFunctions.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.type.khyperloglog;
import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.block.BlockBuilder;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlNullable;
import com.facebook.presto.spi.function.SqlType;
import com.facebook.presto.spi.function.TypeParameter;
import io.airlift.slice.Slice;
import java.util.Map;
import static com.facebook.presto.common.type.BigintType.BIGINT;
import static com.facebook.presto.common.type.DoubleType.DOUBLE;
public final class KHyperLogLogFunctions
{
private KHyperLogLogFunctions()
{
}
@ScalarFunction
@SqlType(StandardTypes.BIGINT)
public static long cardinality(@SqlType(KHyperLogLogType.NAME) Slice khll)
{
return KHyperLogLog.newInstance(khll).cardinality();
}
@ScalarFunction
@SqlType(StandardTypes.BIGINT)
public static long intersectionCardinality(@SqlType(KHyperLogLogType.NAME) Slice slice1, @SqlType(KHyperLogLogType.NAME) Slice slice2)
{
KHyperLogLog khll1 = KHyperLogLog.newInstance(slice1);
KHyperLogLog khll2 = KHyperLogLog.newInstance(slice2);
if (khll1.isExact() && khll2.isExact()) {
return KHyperLogLog.exactIntersectionCardinality(khll1, khll2);
}
long lowestCardinality = Math.min(khll1.cardinality(), khll2.cardinality());
double jaccard = KHyperLogLog.jaccardIndex(khll1, khll2);
KHyperLogLog setUnion = KHyperLogLog.merge(khll1, khll2);
long result = Math.round(jaccard * setUnion.cardinality());
// When one of the sets is much smaller than the other and approaches being a true
// subset of the other, the computed cardinality may exceed the cardinality estimate
// of the smaller set. When this happens the cardinality of the smaller set is obviously
// a better estimate of the one computed with the Jaccard Index.
return Math.min(result, lowestCardinality);
}
@ScalarFunction
@SqlType(StandardTypes.DOUBLE)
public static double jaccardIndex(@SqlType(KHyperLogLogType.NAME) Slice slice1, @SqlType(KHyperLogLogType.NAME) Slice slice2)
{
KHyperLogLog khll1 = KHyperLogLog.newInstance(slice1);
KHyperLogLog khll2 = KHyperLogLog.newInstance(slice2);
return KHyperLogLog.jaccardIndex(khll1, khll2);
}
@ScalarFunction
@SqlType("map(bigint,double)")
public static Block uniquenessDistribution(@TypeParameter("map<bigint,double>") Type mapType, @SqlType(KHyperLogLogType.NAME) Slice slice)
{
KHyperLogLog khll = KHyperLogLog.newInstance(slice);
return uniquenessDistribution(mapType, slice, khll.getMinhashSize());
}
@ScalarFunction
@SqlType("map(bigint,double)")
public static Block uniquenessDistribution(@TypeParameter("map<bigint,double>") Type mapType, @SqlType(KHyperLogLogType.NAME) Slice slice, @SqlType(StandardTypes.BIGINT) long histogramSize)
{
KHyperLogLog khll = KHyperLogLog.newInstance(slice);
BlockBuilder blockBuilder = mapType.createBlockBuilder(null, 1);
BlockBuilder singleMapBlockBuilder = blockBuilder.beginBlockEntry();
for (Map.Entry<Long, Double> entry : khll.uniquenessDistribution(histogramSize).entrySet()) {
BIGINT.writeLong(singleMapBlockBuilder, entry.getKey());
DOUBLE.writeDouble(singleMapBlockBuilder, entry.getValue());
}
blockBuilder.closeEntry();
return (Block) mapType.getObject(blockBuilder, 0);
}
@ScalarFunction
@SqlType(StandardTypes.DOUBLE)
public static double reidentificationPotential(@SqlType(KHyperLogLogType.NAME) Slice khll, @SqlType(StandardTypes.BIGINT) long threshold)
{
return KHyperLogLog.newInstance(khll).reidentificationPotential(threshold);
}
@ScalarFunction
@SqlType(KHyperLogLogType.NAME)
@SqlNullable
public static Slice mergeKhll(@SqlType("array(KHyperLogLog)") Block block)
{
if (block.getPositionCount() == 0) {
return null;
}
KHyperLogLog merged = null;
int firstNonNullIndex = 0;
while (firstNonNullIndex < block.getPositionCount() && block.isNull(firstNonNullIndex)) {
firstNonNullIndex++;
}
if (firstNonNullIndex == block.getPositionCount()) {
return null;
}
Slice initialSlice = block.getSlice(firstNonNullIndex, 0, block.getSliceLength(firstNonNullIndex));
merged = KHyperLogLog.newInstance(initialSlice);
for (int i = firstNonNullIndex; i < block.getPositionCount(); i++) {
Slice currentSlice = block.getSlice(i, 0, block.getSliceLength(i));
if (!block.isNull(i)) {
merged = KHyperLogLog.merge(merged, KHyperLogLog.newInstance(currentSlice));
}
}
return merged.serialize();
}
}