TestHiveAnalyze.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive;

import com.facebook.presto.Session;
import com.facebook.presto.common.type.TimeZoneKey;
import com.facebook.presto.testing.QueryRunner;
import com.facebook.presto.tests.AbstractTestQueryFramework;
import com.facebook.presto.tests.DistributedQueryRunner;
import com.google.common.collect.ImmutableMap;
import org.testng.annotations.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Map;

import static com.facebook.presto.testing.TestingSession.testSessionBuilder;
import static java.lang.String.format;

@Test
public class TestHiveAnalyze
        extends AbstractTestQueryFramework
{
    private static final String CATALOG = "hive";
    private static final String SCHEMA = "test_analyze_schema";

    @Override
    protected QueryRunner createQueryRunner()
            throws Exception
    {
        Session session = testSessionBuilder().setCatalog(CATALOG).setSchema(SCHEMA).setTimeZoneKey(TimeZoneKey.UTC_KEY).build();
        DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(session).setExtraProperties(ImmutableMap.<String, String>builder().build()).build();

        queryRunner.installPlugin(new HivePlugin(CATALOG));
        Path catalogDirectory = queryRunner.getCoordinator().getDataDirectory().resolve("hive_data").getParent().resolve("catalog");
        Map<String, String> properties = ImmutableMap.<String, String>builder()
                .put("hive.metastore", "file")
                .put("hive.metastore.catalog.dir", catalogDirectory.toFile().toURI().toString())
                .put("hive.allow-drop-table", "true")
                .put("hive.non-managed-table-writes-enabled", "true")
                .put("hive.parquet.use-column-names", "true")
                .build();

        queryRunner.createCatalog(CATALOG, CATALOG, properties);
        queryRunner.execute(format("CREATE SCHEMA %s.%s", CATALOG, SCHEMA));

        return queryRunner;
    }

    @Test
    public void testAnalyzePartitionedTableWithNonCanonicalValues()
            throws IOException
    {
        String tableName = "test_analyze_table_canonicalization";
        assertUpdate(format("CREATE TABLE %s (a_varchar varchar, month varchar) WITH (partitioned_by = ARRAY['month'], external_location='%s')", tableName, com.google.common.io.Files.createTempDir().getPath()));

        assertUpdate(format("INSERT INTO %s VALUES ('A', '01'), ('B', '01'), ('C', '02'), ('D', '03')", tableName), 4);

        String tableLocation = (String) computeActual(format("SELECT DISTINCT regexp_replace(\"$path\", '/[^/]*/[^/]*$', '') FROM %s", tableName)).getOnlyValue();

        String externalTableName = "external_" + tableName;
        assertUpdate(format(
                "CREATE TABLE %s (a_varchar varchar, month integer) WITH (partitioned_by = ARRAY['month'], external_location='%s')", externalTableName, tableLocation));

        assertUpdate(format("CALL system.sync_partition_metadata('%s', '%s', 'ADD')", SCHEMA, externalTableName));
        assertQuery(format("SELECT * FROM \"%s$partitions\"", externalTableName), "SELECT * FROM VALUES 1, 2, 3");
        assertUpdate(format("ANALYZE %s", externalTableName), 4);
        assertQuery(format("SHOW STATS FOR %s", externalTableName),
                "SELECT * FROM VALUES " +
                        "('a_varchar', 4.0, 2.0, 0.0, null, null, null, null), " +
                        "('month', null, 3.0, 0.0, null, 1, 3, null), " +
                        "(null, null, null, null, 4.0, null, null, null)");

        assertUpdate(format("INSERT INTO %s VALUES ('E', '04')", tableName), 1);
        assertUpdate(format("CALL system.sync_partition_metadata('%s', '%s', 'ADD')", SCHEMA, externalTableName));
        assertQuery(format("SELECT * FROM \"%s$partitions\"", externalTableName), "SELECT * FROM VALUES 1, 2, 3, 4");
        assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['04']])", externalTableName), 1);
        assertQuery(format("SHOW STATS FOR %s", externalTableName),
                "SELECT * FROM VALUES " +
                        "('a_varchar', 5.0, 2.0, 0.0, null, null, null, null), " +
                        "('month', null, 4.0, 0.0, null, 1, 4, null), " +
                        "(null, null, null, null, 5.0, null, null, null)");
        // TODO fix selective ANALYZE for table with non-canonical partition values
        assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['4']])", externalTableName),
                format("Partition no longer exists: %s.%s/month=4", SCHEMA, externalTableName));

        assertUpdate(format("DROP TABLE %s", tableName));
        assertUpdate(format("DROP TABLE %s", externalTableName));
    }

    @Test
    public void testAnalyzePartitionedTableWithNonCanonicalValuesUnsupportedScenario()
            throws IOException
    {
        String tableName = "test_analyze_table_canonicalization_unsupported";
        assertUpdate(format("CREATE TABLE %s (a_varchar varchar, month varchar) WITH (partitioned_by = ARRAY['month'], external_location='%s')", tableName, com.google.common.io.Files.createTempDir().getPath()));

        assertUpdate(format("INSERT INTO %s VALUES ('A', '1'), ('B', '01'), ('C', '001'), ('D', '02')", tableName), 4);

        String tableLocation = (String) computeActual(format("SELECT DISTINCT regexp_replace(\"$path\", '/[^/]*/[^/]*$', '') FROM %s", tableName)).getOnlyValue();

        String externalTableName = "external_" + tableName;
        assertUpdate(format(
                "CREATE TABLE %s (a_varchar varchar, month integer) WITH (partitioned_by = ARRAY['month'], external_location='%s')", externalTableName, tableLocation));

        assertUpdate(format("CALL system.sync_partition_metadata('%s', '%s', 'ADD')", SCHEMA, externalTableName));
        assertQuery(format("SELECT * FROM \"%s$partitions\"", externalTableName), "SELECT * FROM VALUES 1, 1, 1, 2");

        assertQueryFails(format("ANALYZE %s", externalTableName),
                "There are multiple variants of the same partition, e.g. p=1, p=01, p=001. All partitions must follow the same key=value representation");

        assertUpdate(format("DROP TABLE %s", tableName));
        assertUpdate(format("DROP TABLE %s", externalTableName));
    }
}