TestIcebergFileWriter.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.facebook.presto.iceberg;

import com.facebook.presto.Session;
import com.facebook.presto.common.Page;
import com.facebook.presto.common.type.BooleanType;
import com.facebook.presto.common.type.DoubleType;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.common.type.TypeManager;
import com.facebook.presto.common.type.TypeSignature;
import com.facebook.presto.common.type.TypeSignatureParameter;
import com.facebook.presto.hive.FileFormatDataSourceStats;
import com.facebook.presto.hive.HdfsContext;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveClientConfig;
import com.facebook.presto.hive.HiveCompressionCodec;
import com.facebook.presto.hive.HiveDwrfEncryptionProvider;
import com.facebook.presto.hive.MetastoreClientConfig;
import com.facebook.presto.hive.NodeVersion;
import com.facebook.presto.hive.OrcFileWriterConfig;
import com.facebook.presto.hive.s3.HiveS3Config;
import com.facebook.presto.metadata.SessionPropertyManager;
import com.facebook.presto.parquet.FileParquetDataSource;
import com.facebook.presto.parquet.cache.MetadataReader;
import com.facebook.presto.spi.ColumnMetadata;
import com.facebook.presto.spi.ConnectorId;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.session.PropertyMetadata;
import com.google.common.collect.ImmutableList;
import io.airlift.units.DataSize;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.Schema;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import java.io.File;
import java.util.List;
import java.util.Optional;

import static com.facebook.presto.RowPagesBuilder.rowPagesBuilder;
import static com.facebook.presto.common.type.BigintType.BIGINT;
import static com.facebook.presto.common.type.DateType.DATE;
import static com.facebook.presto.common.type.HyperLogLogType.HYPER_LOG_LOG;
import static com.facebook.presto.common.type.IntegerType.INTEGER;
import static com.facebook.presto.common.type.TimestampType.TIMESTAMP;
import static com.facebook.presto.common.type.VarbinaryType.VARBINARY;
import static com.facebook.presto.common.type.VarcharType.VARCHAR;
import static com.facebook.presto.iceberg.IcebergAbstractMetadata.toIcebergSchema;
import static com.facebook.presto.iceberg.IcebergDistributedTestBase.getHdfsEnvironment;
import static com.facebook.presto.iceberg.IcebergQueryRunner.ICEBERG_CATALOG;
import static com.facebook.presto.iceberg.IcebergSessionProperties.dataSizeSessionProperty;
import static com.facebook.presto.metadata.SessionPropertyManager.createTestingSessionPropertyManager;
import static com.facebook.presto.testing.TestingSession.testSessionBuilder;
import static com.google.common.io.Files.createTempDir;
import static org.apache.iceberg.parquet.ParquetSchemaUtil.convert;
import static org.testng.Assert.assertEquals;
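
/**
 * Tests that {@link IcebergFileWriterFactory} writes Parquet files whose schema,
 * including logical types such as TIMESTAMP and DATE, matches the Iceberg table schema.
 */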
public class TestIcebergFileWriter
{
private IcebergFileWriterFactory icebergFileWriterFactory;
private HdfsContext hdfsContext;
private ConnectorSession connectorSession;

@BeforeClass
public void setup()
throws Exception
{
ConnectorId connectorId = new ConnectorId("iceberg");
SessionPropertyManager sessionPropertyManager = createTestingSessionPropertyManager();
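// Register the connector session properties the Parquet writer reads: page size, block size, writer version, and compression codec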
sessionPropertyManager.addConnectorSessionProperties(
connectorId,
ImmutableList.of(
dataSizeSessionProperty("parquet_writer_page_size", "Parquet: Writer page size", new DataSize(10, DataSize.Unit.KILOBYTE), false),
dataSizeSessionProperty("parquet_writer_block_size", "Parquet: Writer block size", new DataSize(10, DataSize.Unit.KILOBYTE), false),
new PropertyMetadata<>(
"parquet_writer_version",
"Parquet: Writer version",
VARCHAR,
ParquetProperties.WriterVersion.class,
ParquetProperties.WriterVersion.PARQUET_2_0,
false,
value -> ParquetProperties.WriterVersion.valueOf(((String) value).toUpperCase()),
ParquetProperties.WriterVersion::name),
new PropertyMetadata<>(
"compression_codec",
"The compression codec to use when writing files",
VARCHAR,
HiveCompressionCodec.class,
HiveCompressionCodec.NONE,
false,
value -> HiveCompressionCodec.valueOf(((String) value).toUpperCase()),
HiveCompressionCodec::name)));
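// Build a test session against the Iceberg catalog and derive the ConnectorSession used by the writer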
Session session = testSessionBuilder(sessionPropertyManager)
.setCatalog(ICEBERG_CATALOG)
.setSchema("tpch")
.build();
this.connectorSession = session.toConnectorSession(connectorId);
TypeManager typeManager = new TestingTypeManager();
this.hdfsContext = new HdfsContext(connectorSession);
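// Create the writer factory backed by a local HDFS environment and a minimal type manager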
HdfsEnvironment hdfsEnvironment = getHdfsEnvironment(new HiveClientConfig(), new MetastoreClientConfig(), new HiveS3Config());
this.icebergFileWriterFactory = new IcebergFileWriterFactory(hdfsEnvironment, typeManager,
new FileFormatDataSourceStats(), new NodeVersion("test"), new OrcFileWriterConfig(), HiveDwrfEncryptionProvider.NO_ENCRYPTION);
}

@Test
public void testWriteParquetFileWithLogicalTypes()
throws Exception
{
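// Write the Parquet file into a temporary directory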
Path path = new Path(createTempDir().getAbsolutePath() + "/test.parquet");
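// Table schema includes TIMESTAMP and DATE columns, which map to Parquet logical type annotations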
Schema icebergSchema = toIcebergSchema(ImmutableList.of(
ColumnMetadata.builder().setName("a").setType(VARCHAR).build(),
ColumnMetadata.builder().setName("b").setType(INTEGER).build(),
ColumnMetadata.builder().setName("c").setType(TIMESTAMP).build(),
ColumnMetadata.builder().setName("d").setType(DATE).build()));
IcebergFileWriter icebergFileWriter = this.icebergFileWriterFactory.createFileWriter(path, icebergSchema, new JobConf(), connectorSession,
hdfsContext, FileFormat.PARQUET, MetricsConfig.getDefault());
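// Append three pages of generated rows, then commit to finalize the file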
List<Page> input = rowPagesBuilder(VARCHAR, BIGINT, TIMESTAMP, DATE)
.addSequencePage(100, 0, 0, 123, 100)
.addSequencePage(100, 100, 100, 223, 100)
.addSequencePage(100, 200, 200, 323, 100)
.build();
for (Page page : input) {
icebergFileWriter.appendRows(page);
}
icebergFileWriter.commit();
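// Read back the Parquet footer and verify the written schema matches the one converted from the Iceberg schema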
File parquetFile = new File(path.toString());
FileParquetDataSource dataSource = new FileParquetDataSource(parquetFile);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(
dataSource,
parquetFile.length(),
Optional.empty(),
false).getParquetMetadata();
MessageType writtenSchema = parquetMetadata.getFileMetaData().getSchema();
MessageType originalSchema = convert(icebergSchema, "table");
assertEquals(writtenSchema, originalSchema);
}
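
// Minimal TypeManager stub that resolves a fixed set of types by their signature base name; coercion is unsupported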
private static class TestingTypeManager
implements TypeManager
{
@Override
public Type getType(TypeSignature signature)
{
for (Type type : getTypes()) {
if (signature.getBase().equals(type.getTypeSignature().getBase())) {
return type;
}
}
return null;
}

@Override
public Type getParameterizedType(String baseTypeName, List<TypeSignatureParameter> typeParameters)
{
return getType(new TypeSignature(baseTypeName, typeParameters));
}

@Override
public boolean canCoerce(Type actualType, Type expectedType)
{
throw new UnsupportedOperationException();
}

@Override
public List<Type> getTypes()
{
return ImmutableList.of(BooleanType.BOOLEAN, INTEGER, BIGINT, DoubleType.DOUBLE, VARCHAR, VARBINARY, TIMESTAMP, DATE, HYPER_LOG_LOG);
}
}
}