ShortDecimalFlatBatchReader.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.parquet.batchreader;

import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.block.LongArrayBlock;
import com.facebook.presto.common.block.RunLengthEncodedBlock;
import com.facebook.presto.parquet.ColumnReader;
import com.facebook.presto.parquet.DataPage;
import com.facebook.presto.parquet.DictionaryPage;
import com.facebook.presto.parquet.Field;
import com.facebook.presto.parquet.RichColumnDescriptor;
import com.facebook.presto.parquet.batchreader.decoders.Decoders.FlatDecoders;
import com.facebook.presto.parquet.batchreader.decoders.FlatDefinitionLevelDecoder;
import com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.ShortDecimalValuesDecoder;
import com.facebook.presto.parquet.batchreader.dictionary.Dictionaries;
import com.facebook.presto.parquet.dictionary.Dictionary;
import com.facebook.presto.parquet.reader.ColumnChunk;
import com.facebook.presto.parquet.reader.PageReader;
import com.facebook.presto.spi.PrestoException;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;
import org.apache.parquet.io.ParquetDecodingException;
import org.openjdk.jol.info.ClassLayout;

import java.io.IOException;
import java.util.Optional;

import static com.facebook.presto.parquet.ParquetErrorCode.PARQUET_IO_READ_ERROR;
import static com.facebook.presto.parquet.batchreader.decoders.Decoders.readFlatPage;
import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;

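/**
 * Batch reader for flat (non-nested) Parquet columns holding short decimals,
 * i.e. decimal values that fit in a Java long. Definition levels separate null
 * from non-null rows, and decoded batches are returned as {@link LongArrayBlock}s
 * (or a run-length-encoded null block when a batch is entirely null).
 * <p>
 * Callers drive the reader through the {@link ColumnReader} sequence:
 * {@code init(...)}, then repeated {@code prepareNextRead(batchSize)} /
 * {@code readNext()} cycles.
 */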
public class ShortDecimalFlatBatchReader
        implements ColumnReader
{
    private static final int INSTANCE_SIZE = ClassLayout.parseClass(ShortDecimalFlatBatchReader.class).instanceSize();

    private final RichColumnDescriptor columnDescriptor;

    protected Field field;
    protected int nextBatchSize;
    protected FlatDefinitionLevelDecoder definitionLevelDecoder;
    protected ShortDecimalValuesDecoder valuesDecoder;
    protected int remainingCountInPage;

    private Dictionary dictionary;
    private int readOffset;
    private PageReader pageReader;

    public ShortDecimalFlatBatchReader(RichColumnDescriptor columnDescriptor)
    {
        this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor is null");
    }

    @Override
    public boolean isInitialized()
    {
        return pageReader != null && field != null;
    }

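    /**
     * Binds the reader to a page reader and field, and eagerly decodes the
     * dictionary page (if present) so dictionary-encoded data pages can be
     * resolved later.
     */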
    @Override
    public void init(PageReader pageReader, Field field, RowRanges rowRanges)
    {
        checkArgument(!isInitialized(), "Parquet batch reader already initialized");
        this.pageReader = requireNonNull(pageReader, "pageReader is null");
        checkArgument(pageReader.getValueCountInColumnChunk() > 0, "column chunk is empty");
        this.field = requireNonNull(field, "field is null");

        DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
        if (dictionaryPage != null) {
            dictionary = Dictionaries.createDictionary(columnDescriptor, dictionaryPage);
        }
    }

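    /**
     * Records the size of the next batch. Any previously prepared batch that
     * was never read is accumulated into {@code readOffset} and skipped lazily
     * by {@link #seek()} on the next {@link #readNext()} call.
     */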
    @Override
    public void prepareNextRead(int batchSize)
    {
        readOffset = readOffset + nextBatchSize;
        nextBatchSize = batchSize;
    }

    @Override
    public ColumnChunk readNext()
    {
        ColumnChunk columnChunk = null;
        try {
            seek();
            if (field.isRequired()) {
                columnChunk = readWithoutNull();
            }
            else {
                columnChunk = readWithNull();
            }
        }
        catch (IOException exception) {
            throw new PrestoException(PARQUET_IO_READ_ERROR, "Error reading Parquet column " + columnDescriptor, exception);
        }

        readOffset = 0;
        nextBatchSize = 0;
        return columnChunk;
    }

    @Override
    public long getRetainedSizeInBytes()
    {
        return INSTANCE_SIZE +
                (definitionLevelDecoder == null ? 0 : definitionLevelDecoder.getRetainedSizeInBytes()) +
                (valuesDecoder == null ? 0 : valuesDecoder.getRetainedSizeInBytes()) +
                (dictionary == null ? 0 : dictionary.getRetainedSizeInBytes()) +
                (pageReader == null ? 0 : pageReader.getRetainedSizeInBytes());
    }

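    /**
     * Advances to the next data page in the column chunk and swaps in fresh
     * definition-level and value decoders. Returns false once the chunk is
     * exhausted.
     */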
    protected boolean readNextPage()
    {
        definitionLevelDecoder = null;
        valuesDecoder = null;
        remainingCountInPage = 0;

        DataPage page = pageReader.readPage();
        if (page == null) {
            return false;
        }

        FlatDecoders flatDecoders = readFlatPage(page, columnDescriptor, dictionary);
        definitionLevelDecoder = flatDecoders.getDefinitionLevelDecoder();
        valuesDecoder = (ShortDecimalValuesDecoder) flatDecoders.getValuesDecoder();

        remainingCountInPage = page.getValueCount();
        return true;
    }

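    /**
     * Reads a batch from a nullable column. Definition levels are decoded into
     * {@code isNull}; the non-null values of each chunk are decoded densely at
     * the chunk's start and then expanded backwards into their final row
     * positions.
     */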
    private ColumnChunk readWithNull()
            throws IOException
    {
        long[] values = new long[nextBatchSize];
        boolean[] isNull = new boolean[nextBatchSize];

        int totalNonNullCount = 0;
        int remainingInBatch = nextBatchSize;
        int startOffset = 0;
        while (remainingInBatch > 0) {
            if (remainingCountInPage == 0) {
                if (!readNextPage()) {
                    break;
                }
            }

            int chunkSize = Math.min(remainingCountInPage, remainingInBatch);
            int nonNullCount = definitionLevelDecoder.readNext(isNull, startOffset, chunkSize);
            totalNonNullCount += nonNullCount;

            if (nonNullCount > 0) {
                valuesDecoder.readNext(values, startOffset, nonNullCount);

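                // The values decoder wrote nonNullCount values densely at the front of
                // this chunk; walk the chunk backwards, moving each value to its final
                // row position. Iterating from the end is safe because the destination
                // index never falls behind the source index.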
                int valueDestinationIndex = startOffset + chunkSize - 1;
                int valueSourceIndex = startOffset + nonNullCount - 1;

                while (valueDestinationIndex >= startOffset) {
                    if (!isNull[valueDestinationIndex]) {
                        values[valueDestinationIndex] = values[valueSourceIndex];
                        valueSourceIndex--;
                    }
                    valueDestinationIndex--;
                }
            }

            startOffset += chunkSize;
            remainingInBatch -= chunkSize;
            remainingCountInPage -= chunkSize;
        }

        if (remainingInBatch != 0) {
            throw new ParquetDecodingException("Still remaining to be read in current batch.");
        }

        if (totalNonNullCount == 0) {
            Block block = RunLengthEncodedBlock.create(field.getType(), null, nextBatchSize);
            return new ColumnChunk(block, new int[0], new int[0]);
        }

        boolean hasNoNull = totalNonNullCount == nextBatchSize;
        Block block = new LongArrayBlock(nextBatchSize, hasNoNull ? Optional.empty() : Optional.of(isNull), values);
        return new ColumnChunk(block, new int[0], new int[0]);
    }

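    /**
     * Reads a batch from a required (non-null) column: values are decoded
     * straight into the output array, one chunk per page.
     */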
    private ColumnChunk readWithoutNull()
            throws IOException
    {
        long[] values = new long[nextBatchSize];
        int remainingInBatch = nextBatchSize;
        int startOffset = 0;
        while (remainingInBatch > 0) {
            if (remainingCountInPage == 0) {
                if (!readNextPage()) {
                    break;
                }
            }

            int chunkSize = Math.min(remainingCountInPage, remainingInBatch);

            valuesDecoder.readNext(values, startOffset, chunkSize);
            startOffset += chunkSize;
            remainingInBatch -= chunkSize;
            remainingCountInPage -= chunkSize;
        }

        if (remainingInBatch != 0) {
            throw new ParquetDecodingException(format("Corrupted Parquet file: extra %d values to be consumed when scanning current batch", remainingInBatch));
        }

        Block block = new LongArrayBlock(nextBatchSize, Optional.empty(), values);
        return new ColumnChunk(block, new int[0], new int[0]);
    }

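    /**
     * Skips over the {@code readOffset} values accumulated by
     * {@link #prepareNextRead(int)} without materializing a block. For nullable
     * columns the definition levels must still be decoded to learn how many
     * physical (non-null) values to skip in the values decoder.
     */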
    private void seek()
            throws IOException
    {
        if (readOffset == 0) {
            return;
        }

        int remainingInBatch = readOffset;
        int startOffset = 0;
        while (remainingInBatch > 0) {
            if (remainingCountInPage == 0) {
                if (!readNextPage()) {
                    break;
                }
            }

            int chunkSize = Math.min(remainingCountInPage, remainingInBatch);
            int skipSize = chunkSize;
            if (!columnDescriptor.isRequired()) {
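                // Throwaway buffer sized for the whole seek so the accumulated
                // startOffset stays in bounds across pages; only the returned
                // non-null count is used.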
                boolean[] isNull = new boolean[readOffset];
                int nonNullCount = definitionLevelDecoder.readNext(isNull, startOffset, chunkSize);
                skipSize = nonNullCount;
                startOffset += chunkSize;
            }
            valuesDecoder.skip(skipSize);
            remainingInBatch -= chunkSize;
            remainingCountInPage -= chunkSize;
        }
    }
}