TestStringStatisticsBuilder.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc.metadata.statistics;

import com.facebook.presto.common.block.VariableWidthBlockBuilder;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.List;

import static com.facebook.presto.common.type.VarcharType.VARCHAR;
import static com.facebook.presto.orc.metadata.statistics.AbstractStatisticsBuilderTest.StatisticsType.STRING;
import static com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics;
import static com.facebook.presto.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD;
import static com.google.common.base.Preconditions.checkArgument;
import static io.airlift.slice.Slices.EMPTY_SLICE;
import static io.airlift.slice.Slices.utf8Slice;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertNotNull;
import static org.testng.Assert.assertNull;

public class TestStringStatisticsBuilder
        extends AbstractStatisticsBuilderTest<StringStatisticsBuilder, Slice>
{
    // U+0000 to U+D7FF
    private static final Slice LOW_BOTTOM_VALUE = utf8Slice("foo \u0000");
    private static final Slice LOW_TOP_VALUE = utf8Slice("foo \uD7FF");
    // U+E000 to U+FFFF
    private static final Slice MEDIUM_BOTTOM_VALUE = utf8Slice("foo \uE000");
    private static final Slice MEDIUM_TOP_VALUE = utf8Slice("foo \uFFFF");
    // U+10000 to U+10FFFF
    private static final Slice HIGH_BOTTOM_VALUE = utf8Slice("foo \uD800\uDC00");
    private static final Slice HIGH_TOP_VALUE = utf8Slice("foo \uDBFF\uDFFF");

    private static final Slice LONG_BOTTOM_VALUE = utf8Slice("aaaaaaaaaaaaaaaaaaaa");

    public TestStringStatisticsBuilder()
    {
        super(STRING, () -> new StringStatisticsBuilder(Integer.MAX_VALUE), TestStringStatisticsBuilder::addValue);
    }

    @Test
    public void testMinMaxValues()
    {
        assertMinMaxValues(EMPTY_SLICE, EMPTY_SLICE);
        assertMinMaxValues(LOW_BOTTOM_VALUE, LOW_BOTTOM_VALUE);
        assertMinMaxValues(LOW_TOP_VALUE, LOW_TOP_VALUE);
        assertMinMaxValues(MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE);
        assertMinMaxValues(MEDIUM_TOP_VALUE, MEDIUM_TOP_VALUE);
        assertMinMaxValues(HIGH_BOTTOM_VALUE, HIGH_BOTTOM_VALUE);
        assertMinMaxValues(HIGH_TOP_VALUE, HIGH_TOP_VALUE);

        assertMinMaxValues(EMPTY_SLICE, LOW_BOTTOM_VALUE);
        assertMinMaxValues(EMPTY_SLICE, LOW_TOP_VALUE);
        assertMinMaxValues(EMPTY_SLICE, MEDIUM_BOTTOM_VALUE);
        assertMinMaxValues(EMPTY_SLICE, MEDIUM_TOP_VALUE);
        assertMinMaxValues(EMPTY_SLICE, HIGH_BOTTOM_VALUE);
        assertMinMaxValues(EMPTY_SLICE, HIGH_TOP_VALUE);

        assertMinMaxValues(LOW_BOTTOM_VALUE, LOW_TOP_VALUE);
        assertMinMaxValues(LOW_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE);
        assertMinMaxValues(LOW_BOTTOM_VALUE, MEDIUM_TOP_VALUE);
        assertMinMaxValues(LOW_BOTTOM_VALUE, HIGH_BOTTOM_VALUE);
        assertMinMaxValues(LOW_BOTTOM_VALUE, HIGH_TOP_VALUE);

        assertMinMaxValues(LOW_TOP_VALUE, MEDIUM_BOTTOM_VALUE);
        assertMinMaxValues(LOW_TOP_VALUE, MEDIUM_TOP_VALUE);
        assertMinMaxValues(LOW_TOP_VALUE, HIGH_BOTTOM_VALUE);
        assertMinMaxValues(LOW_TOP_VALUE, HIGH_TOP_VALUE);

        assertMinMaxValues(MEDIUM_BOTTOM_VALUE, MEDIUM_TOP_VALUE);
        assertMinMaxValues(MEDIUM_BOTTOM_VALUE, HIGH_BOTTOM_VALUE);
        assertMinMaxValues(MEDIUM_BOTTOM_VALUE, HIGH_TOP_VALUE);

        assertMinMaxValues(MEDIUM_TOP_VALUE, HIGH_BOTTOM_VALUE);
        assertMinMaxValues(MEDIUM_TOP_VALUE, HIGH_TOP_VALUE);

        assertMinMaxValues(HIGH_BOTTOM_VALUE, HIGH_TOP_VALUE);
    }

    @Test
    public void testSum()
    {
        StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE);
        for (Slice value : ImmutableList.of(EMPTY_SLICE, LOW_BOTTOM_VALUE, LOW_TOP_VALUE)) {
            addValue(stringStatisticsBuilder, value);
        }
        assertStringStatistics(stringStatisticsBuilder.buildColumnStatistics(), 3, EMPTY_SLICE.length() + LOW_BOTTOM_VALUE.length() + LOW_TOP_VALUE.length());
    }

    @Test
    public void testMerge()
    {
        List<ColumnStatistics> statisticsList = new ArrayList<>();

        StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 0, 0);

        addValue(statisticsBuilder, EMPTY_SLICE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 1, 0);

        addValue(statisticsBuilder, LOW_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 3, LOW_BOTTOM_VALUE.length());

        addValue(statisticsBuilder, LOW_TOP_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 6, LOW_BOTTOM_VALUE.length() * 2 + LOW_TOP_VALUE.length());
    }

    @Test
    public void testMinMaxValuesWithLimit()
    {
        assertMinMaxValuesWithLimit(MEDIUM_TOP_VALUE, null, ImmutableList.of(MEDIUM_TOP_VALUE, HIGH_BOTTOM_VALUE), 7);
        assertMinMaxValuesWithLimit(null, MEDIUM_TOP_VALUE, ImmutableList.of(LONG_BOTTOM_VALUE, MEDIUM_TOP_VALUE), 7);
        assertMinMaxValuesWithLimit(null, null, ImmutableList.of(LONG_BOTTOM_VALUE), 6, 20);
    }

    @Test
    public void testMergeWithLimit()
    {
        List<ColumnStatistics> statisticsList = new ArrayList<>();
        statisticsList.add(stringColumnStatistics(MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE);
        statisticsList.add(stringColumnStatistics(null, MEDIUM_BOTTOM_VALUE));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, MEDIUM_BOTTOM_VALUE);
        statisticsList.add(stringColumnStatistics(null, MEDIUM_TOP_VALUE));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, MEDIUM_TOP_VALUE);
        statisticsList.add(stringColumnStatistics(MEDIUM_TOP_VALUE, null));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, null, 400);
        statisticsList.add(stringColumnStatistics(MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, null, 500);
    }

    @Test
    public void testMergeWithNullMinMaxValues()
    {
        List<ColumnStatistics> statisticsList = new ArrayList<>();
        statisticsList.add(stringColumnStatistics(MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE);
        statisticsList.add(stringColumnStatistics(null, null));
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, null);
    }

    @Test
    public void testMixingAddValueAndMergeWithLimit()
    {
        // max merged to null
        List<ColumnStatistics> statisticsList = new ArrayList<>();

        StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(7);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 0, 0);

        addValue(statisticsBuilder, LOW_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 1, LOW_BOTTOM_VALUE.length());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), LOW_BOTTOM_VALUE, LOW_BOTTOM_VALUE);

        addValue(statisticsBuilder, LOW_TOP_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 3, LOW_BOTTOM_VALUE.length() * 2 + LOW_TOP_VALUE.length());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), LOW_BOTTOM_VALUE, LOW_TOP_VALUE);

        addValue(statisticsBuilder, HIGH_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 6, LOW_BOTTOM_VALUE.length() * 3 + LOW_TOP_VALUE.length() * 2 + HIGH_BOTTOM_VALUE.length());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), LOW_BOTTOM_VALUE, null);

        addValue(statisticsBuilder, HIGH_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 10, LOW_BOTTOM_VALUE.length() * 4 + LOW_TOP_VALUE.length() * 3 + HIGH_BOTTOM_VALUE.length() * 3);
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), LOW_BOTTOM_VALUE, null);

        // min merged to null
        statisticsList = new ArrayList<>();

        statisticsBuilder = new StringStatisticsBuilder(7);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 0, 0);

        addValue(statisticsBuilder, MEDIUM_TOP_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 1, MEDIUM_TOP_VALUE.length());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_TOP_VALUE, MEDIUM_TOP_VALUE);

        addValue(statisticsBuilder, MEDIUM_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 3, MEDIUM_TOP_VALUE.length() * 2 + MEDIUM_BOTTOM_VALUE.length());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_BOTTOM_VALUE, MEDIUM_TOP_VALUE);

        addValue(statisticsBuilder, LONG_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 6, MEDIUM_TOP_VALUE.length() * 3 + MEDIUM_BOTTOM_VALUE.length() * 2 + LONG_BOTTOM_VALUE.length());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, MEDIUM_TOP_VALUE);

        addValue(statisticsBuilder, LONG_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMergedStringStatistics(statisticsList, 10, MEDIUM_TOP_VALUE.length() * 4 + MEDIUM_BOTTOM_VALUE.length() * 3 + LONG_BOTTOM_VALUE.length() * 3);
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, MEDIUM_TOP_VALUE);

        // min and max both merged to null
        statisticsList = new ArrayList<>();

        statisticsBuilder = new StringStatisticsBuilder(7);
        addValue(statisticsBuilder, MEDIUM_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE);

        addValue(statisticsBuilder, LONG_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, MEDIUM_BOTTOM_VALUE);

        addValue(statisticsBuilder, HIGH_TOP_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, null);

        addValue(statisticsBuilder, HIGH_BOTTOM_VALUE);
        statisticsList.add(statisticsBuilder.buildColumnStatistics());
        assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), null, null);
    }

    @Test
    public void testCopyStatsToSaveMemory()
    {
        StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE);
        Slice shortSlice = Slices.wrappedBuffer(LONG_BOTTOM_VALUE.getBytes(), 0, 1);
        addValue(statisticsBuilder, shortSlice);
        Slice stats = statisticsBuilder.buildColumnStatistics().getStringStatistics().getMax();

        // assert we only spend 1 byte for stats
        assertNotNull(stats);
        assertEquals(stats.getRetainedSize(), Slices.wrappedBuffer(new byte[1]).getRetainedSize());
    }

    @Test
    public void testTotalValueBytes()
    {
        assertTotalValueBytes(0L, ImmutableList.of());
        assertTotalValueBytes(STRING_VALUE_BYTES_OVERHEAD, ImmutableList.of(EMPTY_SLICE));
        assertTotalValueBytes(LOW_BOTTOM_VALUE.length() + STRING_VALUE_BYTES_OVERHEAD, ImmutableList.of(LOW_BOTTOM_VALUE));
        assertTotalValueBytes((LOW_BOTTOM_VALUE.length() + LOW_TOP_VALUE.length()) + 2 * STRING_VALUE_BYTES_OVERHEAD, ImmutableList.of(LOW_BOTTOM_VALUE, LOW_TOP_VALUE));
    }

    @Test
    public void testBlockStringStatistics()
    {
        String alphabets = "abcdefghijklmnopqrstuvwxyz";
        VariableWidthBlockBuilder blockBuilder = new VariableWidthBlockBuilder(null, alphabets.length(), alphabets.length());
        Slice slice = utf8Slice(alphabets);
        for (int i = 0; i < slice.length(); i++) {
            VARCHAR.writeSlice(blockBuilder, slice, i, 1);
        }
        blockBuilder.appendNull();

        StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(1000);
        stringStatisticsBuilder.addBlock(VARCHAR, blockBuilder);

        StringStatistics stringStatistics = stringStatisticsBuilder.buildColumnStatistics().getStringStatistics();
        assertEquals(stringStatistics.getMin(), slice.slice(0, 1));
        assertEquals(stringStatistics.getMax(), slice.slice(slice.length() - 1, 1));
        assertEquals(stringStatistics.getSum(), slice.length());
    }

    @Test
    public void testAddValueByPosition()
    {
        String alphabet = "abcdefghijklmnopqrstuvwxyz";
        VariableWidthBlockBuilder blockBuilder = new VariableWidthBlockBuilder(null, alphabet.length(), alphabet.length());
        Slice slice = utf8Slice(alphabet);
        for (int i = 0; i < slice.length(); i++) {
            VARCHAR.writeSlice(blockBuilder, slice, i, 1);
        }
        blockBuilder.appendNull();

        StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(1000);
        int positionCount = blockBuilder.getPositionCount();
        for (int position = 0; position < positionCount; position++) {
            stringStatisticsBuilder.addValue(VARCHAR, blockBuilder, position);
        }

        ColumnStatistics columnStatistics = stringStatisticsBuilder.buildColumnStatistics();
        assertEquals(columnStatistics.getNumberOfValues(), positionCount - 1);

        StringStatistics stringStatistics = columnStatistics.getStringStatistics();
        assertEquals(stringStatistics.getMin(), slice.slice(0, 1));
        assertEquals(stringStatistics.getMax(), slice.slice(slice.length() - 1, 1));
        assertEquals(stringStatistics.getSum(), slice.length());
    }

    private void assertMergedStringStatistics(List<ColumnStatistics> statisticsList, int expectedNumberOfValues, int expectedSum)
    {
        assertStringStatistics(mergeColumnStatistics(statisticsList), expectedNumberOfValues, expectedSum);

        assertNoColumnStatistics(mergeColumnStatistics(insertEmptyColumnStatisticsAt(statisticsList, 0, 10)), expectedNumberOfValues + 10);
        assertNoColumnStatistics(mergeColumnStatistics(insertEmptyColumnStatisticsAt(statisticsList, statisticsList.size(), 10)), expectedNumberOfValues + 10);
        assertNoColumnStatistics(mergeColumnStatistics(insertEmptyColumnStatisticsAt(statisticsList, statisticsList.size() / 2, 10)), expectedNumberOfValues + 10);
    }

    private static void assertMinMaxValuesWithLimit(Slice expectedMin, Slice expectedMax, List<Slice> values, int limit)
    {
        checkArgument(values != null && values.size() > 0);
        StringStatisticsBuilder builder = new StringStatisticsBuilder(limit);
        for (Slice value : values) {
            addValue(builder, value);
        }
        assertMinMax(builder.buildColumnStatistics().getStringStatistics(), expectedMin, expectedMax);
    }

    private static void assertMinMaxValuesWithLimit(Slice expectedMin, Slice expectedMax, List<Slice> values, int limit, long expectedSum)
    {
        checkArgument(values != null && values.size() > 0);
        StringStatisticsBuilder builder = new StringStatisticsBuilder(limit);
        for (Slice value : values) {
            addValue(builder, value);
        }
        assertMinMax(builder.buildColumnStatistics().getStringStatistics(), expectedMin, expectedMax, expectedSum);
    }

    private static void assertMinMax(StringStatistics actualStringStatistics, Slice expectedMin, Slice expectedMax)
    {
        if (expectedMax == null && expectedMin == null) {
            assertNull(actualStringStatistics);
            return;
        }

        assertNotNull(actualStringStatistics);
        assertEquals(actualStringStatistics.getMin(), expectedMin);
        assertEquals(actualStringStatistics.getMax(), expectedMax);
    }

    private static void assertMinMax(StringStatistics actualStringStatistics, Slice expectedMin, Slice expectedMax, long expectedSum)
    {
        assertNotNull(actualStringStatistics);
        assertEquals(actualStringStatistics.getMin(), expectedMin);
        assertEquals(actualStringStatistics.getMax(), expectedMax);
        assertEquals(actualStringStatistics.getSum(), expectedSum);
    }

    private static ColumnStatistics stringColumnStatistics(Slice minimum, Slice maximum)
    {
        if (minimum == null && maximum == null) {
            return new ColumnStatistics(100L, null, null, null);
        }
        return new StringColumnStatistics(
                100L,
                null,
                null,
                null,
                new StringStatistics(minimum, maximum, 100));
    }

    private void assertStringStatistics(ColumnStatistics columnStatistics, int expectedNumberOfValues, long expectedSum)
    {
        if (expectedNumberOfValues > 0) {
            assertEquals(columnStatistics.getNumberOfValues(), expectedNumberOfValues);
            assertEquals(columnStatistics.getStringStatistics().getSum(), expectedSum);
        }
        else {
            assertNull(columnStatistics.getStringStatistics());
            assertEquals(columnStatistics.getNumberOfValues(), 0);
        }
    }

    public static void addValue(StringStatisticsBuilder stringStatisticsBuilder, Slice slice)
    {
        VariableWidthBlockBuilder blockBuilder = new VariableWidthBlockBuilder(null, 1, slice.length());
        blockBuilder.writeBytes(slice, 0, slice.length()).closeEntry();

        stringStatisticsBuilder.addBlock(VARCHAR, blockBuilder);
    }
}