AbstractTestAggregationsNative.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.nativetests;

import com.facebook.presto.tests.AbstractTestAggregations;
import org.testng.annotations.Test;

import static java.lang.String.format;

public abstract class AbstractTestAggregationsNative
        extends AbstractTestAggregations
{
    private static final String QDIGEST_TYPE = "qdigest";

    private String storageFormat;
    private String approxDistinctUnsupportedSignatureError;
    private String charTypeUnsupportedError;
    private String timeTypeUnsupportedError;

    public void init(String storageFormat, boolean sidecarEnabled)
    {
        this.storageFormat = storageFormat;
        if (sidecarEnabled) {
            charTypeUnsupportedError = ".*Unknown type: char.*";
            timeTypeUnsupportedError = ".*Unknown type: time.*";
            approxDistinctUnsupportedSignatureError = ".*Unexpected parameters \\(timestamp with time zone.*\\) for function.*";
        }
        else {
            charTypeUnsupportedError = "Failed to parse type.*char";
            timeTypeUnsupportedError = "Failed to parse type.*time";
            approxDistinctUnsupportedSignatureError = ".*Aggregate function signature is not supported.*";
        }
    }

    /// `approx_distinct` aggregate function returns a different value for certain datatypes in Presto C++, see this
    /// issue for more details: https://github.com/facebookincubator/velox/issues/9761.
    /// `approx_distinct` does not support arguments of type `TIMESTAMP WITH TIME ZONE` in Presto C++, see this issue
    /// for more details: https://github.com/prestodb/presto/issues/24815.
    /// Presto C++ does not support datatypes `CHAR` and `TIME`, see:
    /// https://github.com/prestodb/presto/blob/master/presto-docs/src/main/sphinx/presto_cpp/limitations.rst.
    @Override
    @Test
    public void testApproximateCountDistinct()
    {
        // test NULL
        assertQuery("SELECT approx_distinct(NULL)", "SELECT 0");
        assertQuery("SELECT approx_distinct(NULL, 0.023)", "SELECT 0");

        // test date
        String orderdate = storageFormat.equals("DWRF") ? "cast(orderdate as DATE)" : "orderdate";
        assertQuery(format("SELECT approx_distinct(%s) FROM orders", orderdate), "SELECT 2372");
        assertQuery(format("SELECT approx_distinct(%s, 0.023) FROM orders", orderdate), "SELECT 2372");

        // test timestamp
        assertQuery("SELECT approx_distinct(CAST(orderdate AS TIMESTAMP)) FROM orders", "SELECT 2347");
        assertQuery("SELECT approx_distinct(CAST(orderdate AS TIMESTAMP), 0.023) FROM orders", "SELECT 2347");

        // test timestamp with time zone
        assertQueryFails("SELECT approx_distinct(CAST(orderdate AS TIMESTAMP WITH TIME ZONE)) FROM orders",
                approxDistinctUnsupportedSignatureError, true);
        assertQueryFails("SELECT approx_distinct(CAST(orderdate AS TIMESTAMP WITH TIME ZONE), 0.023) FROM orders",
                approxDistinctUnsupportedSignatureError, true);

        // test time
        assertQueryFails("SELECT approx_distinct(CAST(from_unixtime(custkey) AS TIME)) FROM orders", timeTypeUnsupportedError, true);
        assertQueryFails("SELECT approx_distinct(CAST(from_unixtime(custkey) AS TIME), 0.023) FROM orders", timeTypeUnsupportedError, true);

        // test time with time zone
        assertQueryFails("SELECT approx_distinct(CAST(from_unixtime(custkey) AS TIME WITH TIME ZONE)) FROM orders", timeTypeUnsupportedError, true);
        assertQueryFails("SELECT approx_distinct(CAST(from_unixtime(custkey) AS TIME WITH TIME ZONE), 0.023) FROM orders", timeTypeUnsupportedError, true);

        // test short decimal
        assertQuery("SELECT approx_distinct(CAST(custkey AS DECIMAL(18, 0))) FROM orders", "SELECT 990");
        assertQuery("SELECT approx_distinct(CAST(custkey AS DECIMAL(18, 0)), 0.023) FROM orders", "SELECT 990");

        // test long decimal
        assertQuery("SELECT approx_distinct(CAST(custkey AS DECIMAL(25, 20))) FROM orders", "SELECT 1013");
        assertQuery("SELECT approx_distinct(CAST(custkey AS DECIMAL(25, 20)), 0.023) FROM orders", "SELECT 1013");

        // test real
        assertQuery("SELECT approx_distinct(CAST(custkey AS REAL)) FROM orders", "SELECT 982");
        assertQuery("SELECT approx_distinct(CAST(custkey AS REAL), 0.023) FROM orders", "SELECT 982");

        // test bigint
        assertQuery("SELECT approx_distinct(custkey) FROM orders", "SELECT 990");
        assertQuery("SELECT approx_distinct(custkey, 0.023) FROM orders", "SELECT 990");

        // test integer
        assertQuery("SELECT approx_distinct(CAST(custkey AS INTEGER)) FROM orders", "SELECT 1028");
        assertQuery("SELECT approx_distinct(CAST(custkey AS INTEGER), 0.023) FROM orders", "SELECT 1028");

        // test smallint
        assertQuery("SELECT approx_distinct(CAST(custkey AS SMALLINT)) FROM orders", "SELECT 1023");
        assertQuery("SELECT approx_distinct(CAST(custkey AS SMALLINT), 0.023) FROM orders", "SELECT 1023");

        // test tinyint
        assertQuery("SELECT approx_distinct(CAST((custkey % 128) AS TINYINT)) FROM orders", "SELECT 128");
        assertQuery("SELECT approx_distinct(CAST((custkey % 128) AS TINYINT), 0.023) FROM orders", "SELECT 128");

        // test double
        assertQuery("SELECT approx_distinct(CAST(custkey AS DOUBLE)) FROM orders", "SELECT 1014");
        assertQuery("SELECT approx_distinct(CAST(custkey AS DOUBLE), 0.023) FROM orders", "SELECT 1014");

        // test varchar
        assertQuery("SELECT approx_distinct(CAST(custkey AS VARCHAR)) FROM orders", "SELECT 1036");
        assertQuery("SELECT approx_distinct(CAST(custkey AS VARCHAR), 0.023) FROM orders", "SELECT 1036");

        // test char
        assertQueryFails("SELECT approx_distinct(CAST(CAST(custkey AS VARCHAR) AS CHAR(20))) FROM orders", charTypeUnsupportedError, true);
        assertQueryFails("SELECT approx_distinct(CAST(CAST(custkey AS VARCHAR) AS CHAR(20)), 0.023) FROM orders", charTypeUnsupportedError, true);

        // test varbinary
        assertQuery("SELECT approx_distinct(to_utf8(CAST(custkey AS VARCHAR))) FROM orders", "SELECT 1036");
        assertQuery("SELECT approx_distinct(to_utf8(CAST(custkey AS VARCHAR)), 0.023) FROM orders", "SELECT 1036");
    }

    /// `sum_data_size_for_stats` returns a different value for `Varchar` and `Varbinary` datatypes in Presto C++, see:
    /// https://github.com/prestodb/presto/issues/20909. `CHAR` datatype is not supported in Presto C++, see issue:
    /// https://github.com/prestodb/presto/issues/21332.
    @Override
    @Test
    public void testSumDataSizeForStats()
    {
        // varchar
        assertQuery("SELECT \"sum_data_size_for_stats\"(comment) FROM orders", "SELECT 787364");

        // char
        // Presto removes trailing whitespaces when casting to CHAR.
        // Hard code the expected data size since there is no easy to way to compute it in H2.
        assertQueryFails("SELECT \"sum_data_size_for_stats\"(CAST(comment AS CHAR(1000))) FROM orders",
                charTypeUnsupportedError, true);

        // varbinary
        assertQuery("SELECT \"sum_data_size_for_stats\"(CAST(comment AS VARBINARY)) FROM orders", "SELECT 787364");

        // array
        assertQuery("SELECT \"sum_data_size_for_stats\"(ARRAY[comment]) FROM orders", "SELECT 847364");
        assertQuery("SELECT \"sum_data_size_for_stats\"(ARRAY[comment, comment]) FROM orders", "SELECT 1634728");

        // map
        assertQuery("SELECT \"sum_data_size_for_stats\"(map(ARRAY[1], ARRAY[comment])) FROM orders", "SELECT 907364");
        assertQuery("SELECT \"sum_data_size_for_stats\"(map(ARRAY[1, 2], ARRAY[comment, comment])) FROM orders", "SELECT 1754728");

        // row
        assertQuery("SELECT \"sum_data_size_for_stats\"(ROW(comment)) FROM orders", "SELECT 847364");
        assertQuery("SELECT \"sum_data_size_for_stats\"(ROW(comment, comment)) FROM orders", "SELECT 1634728");
    }

    /// `max_data_size_for_stats` returns a different value for `Varchar` and `Varbinary` datatypes in Presto C++, see:
    /// https://github.com/prestodb/presto/issues/20909. `CHAR` datatype is not supported in Presto C++, see issue:
    /// https://github.com/prestodb/presto/issues/21332.
    @Override
    @Test
    public void testMaxDataSizeForStats()
    {
        // varchar
        assertQuery("SELECT \"max_data_size_for_stats\"(comment) FROM orders", "select 82");

        // char
        assertQueryFails("SELECT \"max_data_size_for_stats\"(CAST(comment AS CHAR(1000))) FROM orders",
                charTypeUnsupportedError, true);

        // varbinary
        assertQuery("SELECT \"max_data_size_for_stats\"(CAST(comment AS VARBINARY)) FROM orders", "select 82");

        // max_data_size_for_stats is not needed for array, map and row
    }

    @Override
    @Test(dataProvider = "getType")
    public void testStatisticalDigest(String type)
    {
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(orderkey AS DOUBLE)), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(quantity AS DOUBLE)), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(quantity AS DOUBLE)), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(orderkey AS DOUBLE), 2), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 3), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 4), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(orderkey AS DOUBLE), 2, 0.0001E0), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 3, 0.0001E0), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
        assertQuery(format("SELECT value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 4, 0.0001E0), 0.5E0) > 0 FROM lineitem", type), "SELECT true");
    }

    /// Function `tdigest_agg` is not supported in Presto C++, see: https://github.com/prestodb/presto/issues/24811.
    @Override
    @Test(dataProvider = "getType")
    public void testStatisticalDigestGroupBy(String type)
    {
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(orderkey AS DOUBLE)), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(quantity AS DOUBLE)), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(quantity AS DOUBLE)), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(orderkey AS DOUBLE), 2), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 3), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 4), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(orderkey AS DOUBLE), 2, 0.0001E0), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 3, 0.0001E0), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
        assertQuery(format("SELECT partkey, value_at_quantile(%s_agg(CAST(quantity AS DOUBLE), 4, 0.0001E0), 0.5E0) > 0 FROM lineitem GROUP BY partkey", type),
                    "SELECT partkey, true FROM lineitem GROUP BY partkey");
    }

    @Override
    @Test(dataProvider = "getType", enabled = false)
    public void testStatisticalDigestMerge(String type)
    {
        assertQuery(format("SELECT value_at_quantile(merge(%s), 0.5E0) > 0 FROM (SELECT partkey, %s_agg(CAST(orderkey AS DOUBLE)) as %s FROM lineitem GROUP BY partkey)",
                        type,
                        type,
                        type),
                "SELECT true");
    }

    @Override
    @Test(dataProvider = "getType", enabled = false)
    public void testStatisticalDigestMergeGroupBy(String type)
    {
        assertQuery(format("SELECT partkey, value_at_quantile(merge(%s), 0.5E0) > 0 " +
                                "FROM (SELECT partkey, suppkey, %s_agg(CAST(orderkey AS DOUBLE)) as %s FROM lineitem GROUP BY partkey, suppkey)" +
                                "GROUP BY partkey",
                        type,
                        type,
                        type),
                "SELECT partkey, true FROM lineitem GROUP BY partkey");
    }
}