RcFileWriteValidation.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.rcfile;

import com.facebook.presto.common.Page;
import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.common.type.Type;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import io.airlift.slice.XxHash64;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;

public class RcFileWriteValidation
{
    private final byte version;
    private final Map<String, String> metadata;
    private final Optional<String> codecClassName;
    private final long syncFirst;
    private final long syncSecond;
    private final WriteChecksum checksum;

    private RcFileWriteValidation(byte version, Map<String, String> metadata, Optional<String> codecClassName, long syncFirst, long syncSecond, WriteChecksum checksum)
    {
        this.version = version;
        this.metadata = ImmutableMap.copyOf(requireNonNull(metadata, "metadata is null"));
        this.codecClassName = requireNonNull(codecClassName, "codecClassName is null");
        this.syncFirst = syncFirst;
        this.syncSecond = syncSecond;
        this.checksum = requireNonNull(checksum, "checksum is null");
    }

    public byte getVersion()
    {
        return version;
    }

    public Map<String, String> getMetadata()
    {
        return metadata;
    }

    public Optional<String> getCodecClassName()
    {
        return codecClassName;
    }

    public long getSyncFirst()
    {
        return syncFirst;
    }

    public long getSyncSecond()
    {
        return syncSecond;
    }

    public WriteChecksum getChecksum()
    {
        return checksum;
    }

    public static class WriteChecksum
    {
        private final long totalRowCount;
        private final long rowGroupHash;
        private final List<Long> columnHashes;

        public WriteChecksum(long totalRowCount, long rowGroupHash, List<Long> columnHashes)
        {
            this.totalRowCount = totalRowCount;
            this.rowGroupHash = rowGroupHash;
            this.columnHashes = columnHashes;
        }

        public long getTotalRowCount()
        {
            return totalRowCount;
        }

        public long getRowGroupHash()
        {
            return rowGroupHash;
        }

        public List<Long> getColumnHashes()
        {
            return columnHashes;
        }
    }

    public static class WriteChecksumBuilder
    {
        // This value is a large arbitrary prime
        private static final long NULL_HASH_CODE = 0x6e3efbd56c16a0cbL;

        private final List<Type> types;
        private long totalRowCount;
        private final List<XxHash64> columnHashes;
        private final XxHash64 rowGroupHash = new XxHash64();

        private final byte[] longBuffer = new byte[Long.BYTES];
        private final Slice longSlice = Slices.wrappedBuffer(longBuffer);

        private WriteChecksumBuilder(List<Type> types)
        {
            this.types = ImmutableList.copyOf(requireNonNull(types, "types is null"));

            ImmutableList.Builder<XxHash64> columnHashes = ImmutableList.builder();
            for (Type ignored : types) {
                columnHashes.add(new XxHash64());
            }
            this.columnHashes = columnHashes.build();
        }

        public static WriteChecksumBuilder createWriteChecksumBuilder(Map<Integer, Type> readColumns)
        {
            requireNonNull(readColumns, "readColumns is null");
            checkArgument(!readColumns.isEmpty(), "readColumns is empty");
            int columnCount = readColumns.keySet().stream()
                    .mapToInt(Integer::intValue)
                    .max().getAsInt() + 1;
            checkArgument(readColumns.size() == columnCount, "checksum requires all columns to be read");

            ImmutableList.Builder<Type> types = ImmutableList.builder();
            for (int column = 0; column < columnCount; column++) {
                Type type = readColumns.get(column);
                checkArgument(type != null, "checksum requires all columns to be read");
                types.add(type);
            }
            return new WriteChecksumBuilder(types.build());
        }

        public void addRowGroup(int rowCount)
        {
            longSlice.setInt(0, rowCount);
            rowGroupHash.update(longBuffer, 0, Integer.BYTES);
        }

        public void addPage(Page page)
        {
            requireNonNull(page, "page is null");
            checkArgument(page.getChannelCount() == columnHashes.size(), "invalid page");

            totalRowCount += page.getPositionCount();
            for (int channel = 0; channel < columnHashes.size(); channel++) {
                Type type = types.get(channel);
                Block block = page.getBlock(channel);
                XxHash64 xxHash64 = columnHashes.get(channel);
                for (int position = 0; position < block.getPositionCount(); position++) {
                    long hash = hashPositionSkipNullMapKeys(type, block, position);
                    longSlice.setLong(0, hash);
                    xxHash64.update(longBuffer);
                }
            }
        }

        private static long hashPositionSkipNullMapKeys(Type type, Block block, int position)
        {
            if (block.isNull(position)) {
                return NULL_HASH_CODE;
            }

            if (type.getTypeSignature().getBase().equals(StandardTypes.MAP)) {
                Type keyType = type.getTypeParameters().get(0);
                Type valueType = type.getTypeParameters().get(1);
                Block mapBlock = (Block) type.getObject(block, position);
                long hash = 0;
                for (int i = 0; i < mapBlock.getPositionCount(); i += 2) {
                    if (!mapBlock.isNull(i)) {
                        hash += hashPositionSkipNullMapKeys(keyType, mapBlock, i);
                        hash += hashPositionSkipNullMapKeys(valueType, mapBlock, i + 1);
                    }
                }
                return hash;
            }

            if (type.getTypeSignature().getBase().equals(StandardTypes.ARRAY)) {
                Type elementType = type.getTypeParameters().get(0);
                Block array = (Block) type.getObject(block, position);
                long hash = 0;
                for (int i = 0; i < array.getPositionCount(); i++) {
                    hash = 31 * hash + hashPositionSkipNullMapKeys(elementType, array, i);
                }
                return hash;
            }

            if (type.getTypeSignature().getBase().equals(StandardTypes.ROW)) {
                Block row = (Block) type.getObject(block, position);
                long hash = 0;
                for (int i = 0; i < row.getPositionCount(); i++) {
                    Type elementType = type.getTypeParameters().get(i);
                    hash = 31 * hash + hashPositionSkipNullMapKeys(elementType, row, i);
                }
                return hash;
            }

            return type.hash(block, position);
        }

        public WriteChecksum build()
        {
            return new WriteChecksum(
                    totalRowCount,
                    rowGroupHash.hash(),
                    columnHashes.stream()
                            .map(XxHash64::hash)
                            .collect(toList()));
        }
    }

    public static class RcFileWriteValidationBuilder
    {
        private byte version;
        private final Map<String, String> metadata = new HashMap<>();
        private Optional<String> codecClassName;
        private long syncFirst;
        private long syncSecond;
        private final WriteChecksumBuilder checksum;

        public RcFileWriteValidationBuilder(List<Type> types)
        {
            this.checksum = new WriteChecksumBuilder(types);
        }

        public RcFileWriteValidationBuilder setVersion(byte version)
        {
            this.version = version;
            return this;
        }

        public RcFileWriteValidationBuilder addMetadataProperty(String key, String value)
        {
            metadata.put(key, value);
            return this;
        }

        public RcFileWriteValidationBuilder setCodecClassName(Optional<String> codecClassName)
        {
            this.codecClassName = codecClassName;
            return this;
        }

        public RcFileWriteValidationBuilder setSyncFirst(long syncFirst)
        {
            this.syncFirst = syncFirst;
            return this;
        }

        public RcFileWriteValidationBuilder setSyncSecond(long syncSecond)
        {
            this.syncSecond = syncSecond;
            return this;
        }

        public RcFileWriteValidationBuilder addRowGroup(int rowCount)
        {
            checksum.addRowGroup(rowCount);
            return this;
        }

        public RcFileWriteValidationBuilder addPage(Page page)
        {
            checksum.addPage(page);
            return this;
        }

        public RcFileWriteValidation build()
        {
            return new RcFileWriteValidation(version, metadata, codecClassName, syncFirst, syncSecond, checksum.build());
        }
    }
}