SetDigestFunctions.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.facebook.presto.type.setdigest;

import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.block.BlockBuilder;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import com.facebook.presto.spi.function.TypeParameter;
import io.airlift.slice.Slice;

import java.util.Map;

import static com.facebook.presto.common.type.BigintType.BIGINT;
import static com.facebook.presto.common.type.SmallintType.SMALLINT;
import static com.facebook.presto.type.setdigest.SetDigest.exactIntersectionCardinality;

public final class SetDigestFunctions
{
    private SetDigestFunctions()
    {
    }

    @ScalarFunction
    @SqlType(StandardTypes.BIGINT)
    public static long cardinality(@SqlType(SetDigestType.NAME) Slice digest)
    {
        return SetDigest.newInstance(digest).cardinality();
    }

    @ScalarFunction
    @SqlType(StandardTypes.BIGINT)
    public static long intersectionCardinality(@SqlType(SetDigestType.NAME) Slice slice1, @SqlType(SetDigestType.NAME) Slice slice2)
    {
        SetDigest digest1 = SetDigest.newInstance(slice1);
        SetDigest digest2 = SetDigest.newInstance(slice2);

        if (digest1.isExact() && digest2.isExact()) {
            return exactIntersectionCardinality(digest1, digest2);
        }

        long cardinality1 = digest1.cardinality();
        long cardinality2 = digest2.cardinality();
        double jaccard = SetDigest.jaccardIndex(digest1, digest2);
        digest1.mergeWith(digest2);
        long result = Math.round(jaccard * digest1.cardinality());

        // When one of the sets is much smaller than the other and approaches being a true
        // subset of the other, the computed cardinality may exceed the cardinality estimate
        // of the smaller set. When this happens the cardinality of the smaller set is obviously
        // a better estimate of the one computed with the Jaccard Index.
        return Math.min(result, Math.min(cardinality1, cardinality2));
    }

    @ScalarFunction
    @SqlType(StandardTypes.DOUBLE)
    public static double jaccardIndex(@SqlType(SetDigestType.NAME) Slice slice1, @SqlType(SetDigestType.NAME) Slice slice2)
    {
        SetDigest digest1 = SetDigest.newInstance(slice1);
        SetDigest digest2 = SetDigest.newInstance(slice2);

        return SetDigest.jaccardIndex(digest1, digest2);
    }

    @ScalarFunction
    @SqlType("map(bigint,smallint)")
    public static Block hashCounts(@TypeParameter("map<bigint,smallint>") Type mapType, @SqlType(SetDigestType.NAME) Slice slice)
    {
        SetDigest digest = SetDigest.newInstance(slice);

        // Maybe use static BlockBuilderStatus in order avoid `new`?
        BlockBuilder blockBuilder = mapType.createBlockBuilder(null, 1);
        BlockBuilder singleMapBlockBuilder = blockBuilder.beginBlockEntry();
        for (Map.Entry<Long, Short> entry : digest.getHashCounts().entrySet()) {
            BIGINT.writeLong(singleMapBlockBuilder, entry.getKey());
            SMALLINT.writeLong(singleMapBlockBuilder, entry.getValue());
        }
        blockBuilder.closeEntry();

        return (Block) mapType.getObject(blockBuilder, 0);
    }
}