RDCAnalysisChunking.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.chunking;

import static org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.Unsigned.ubyte;
import static org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.Unsigned.uint;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.LeafNodeObject;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.SignatureObject;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.BinaryItem;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.UInteger;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.util.ByteUtil;

/**
 * This class is used to process RDC analysis chunking
 */
public class RDCAnalysisChunking extends AbstractChunking {
    private static final Logger LOGGER = LoggerFactory.getLogger(RDCAnalysisChunking.class);

    /**
     * The max chunk size in RDC analysis chunking.
     */
    private final int maxChunkSize = 65535;

    /**
     * Initializes a new instance of the <see cref="RDCAnalysisChunking"/> class
     *
     * @param fileContent The content of the file.
     */
    public RDCAnalysisChunking(byte[] fileContent) {
        super(fileContent);
    }

    /**
     * This method is used to chunk the file data.
     *
     * @return A list of LeafNodeObjectData.
     */
    @Override
    public List<LeafNodeObject> chunking() throws IOException {
        int horizon = 16384;
        List<LeafNodeObject> list = new ArrayList<>();
        int inputLength = fileContent.length;

        if (inputLength <= 0) {
            throw new IOException("Cannot support the length less than 0");
        } else if (inputLength <= horizon) {
            list.add(this.getChunk(0, inputLength));
            return list;
        }

        long chunkStart = 0;
        UInteger[] hashValues = this.getHashValues();

        while (chunkStart + 1 < inputLength) {
            long chunkEndMax = Math.min(chunkStart + this.maxChunkSize, inputLength);
            long chunkEnd = (int) chunkEndMax;

            for (long n = chunkStart; n < chunkEndMax; n++) {
                boolean isBoundary = true;

                if (n == chunkStart) {
                    n = chunkStart + horizon;
                }

                if (n - chunkStart == this.maxChunkSize) {
                    break;
                }

                long end = Math.min(n + horizon, chunkEndMax);
                for (long i = n - horizon; i < end; i++) {
                    if (i != n &&
                            hashValues[(int) n].intValue() <= hashValues[(int) i].intValue()) {
                        isBoundary = false;
                        break;
                    }
                }

                if (!isBoundary) {
                    continue;
                }

                if (n + horizon > inputLength) {
                    n = chunkEndMax;
                    continue;
                }

                if ((n - (n % horizon) + (2 * horizon)) > inputLength) {
                    continue;
                }

                if (inputLength % horizon == 0 &&
                        ((int) chunkStart - ((int) chunkStart % horizon) + (2 * horizon)) ==
                                inputLength) {
                    continue;
                }

                chunkEnd = n;
                break;
            }

            list.add(this.getChunk(chunkStart, chunkEnd));
            chunkStart = chunkEnd;
        }

        return list;
    }

    /**
     * Get a chunk with the input bytes.
     *
     * @param chunkStart The start index of the chunk.
     * @param chunkEnd   The end index of the chunk.
     * @return An LeafNodeObjectData which contains a chunk.
     */
    private LeafNodeObject getChunk(long chunkStart, long chunkEnd) throws IOException {
        if (chunkEnd <= chunkStart || (chunkEnd - chunkStart > this.maxChunkSize) ||
                chunkStart > Integer.MAX_VALUE) {
            throw new IOException("ChunkStart out of range");
        }

        byte[] temp = Arrays.copyOfRange(this.fileContent, (int) chunkStart,
                (int) (chunkEnd - chunkStart));


        SignatureObject signature = new SignatureObject();
        signature.signatureData = new BinaryItem(ByteUtil.toListOfByte(temp));

//            RDCSignatureGenerator generator = new RDCSignatureGenerator();
//            signatureBytes = generator.ComputeHash(temp);
//
//            SignatureObject signature = new SignatureObject();
//            signature.SignatureData = new BinaryItem(signatureBytes);

        return new LeafNodeObject.IntermediateNodeObjectBuilder().Build(temp, signature);
    }

    /**
     * Compute the hash value with the file content.
     *
     * @return The array of hash value.
     */
    private UInteger[] getHashValues() {
        int hashWindowSize = 48;
        UInteger[] hashValues = new UInteger[this.fileContent.length];
        int shiftAmount = this.getShiftAmount(hashWindowSize);
        int i = 0;

        int[] lookupTable =
                {0x5e3f7c48, 0x796a0d2b, 0xbecd4e32, 0x6f16159c, 0x687312bc, 0x12a6f30a, 0x8fca2662,
                        0x79b83d14, 0x3fab3f30, 0x984d6ca2, 0x4df5fe6c, 0x4acd3196, 0x6245ad21,
                        0x3a15e5ba, 0x90db6499, 0x05aacb6b, 0x791cf724, 0x504cd910, 0x98093570,
                        0x090392df, 0xf193e5b8, 0x42023c5b, 0x80a95c6a, 0x11e676be, 0xc70f2117,
                        0xeed4587f, 0x6479e9bd, 0x1b0c427c, 0x410486ba, 0x30f5b837, 0xf957d307,
                        0x1535f121, 0xabe45e90, 0x7a1ab8f0, 0x1c6887e4, 0x4170b7ba, 0x8b491bed,
                        0x5c920e73, 0x1b1ed791, 0x7a0ed482, 0xcce86619, 0x45dc7290, 0x57e71362,
                        0x2e24f01c, 0x0a0637f3, 0x0e8c5565, 0x15944012, 0x34f7eeea, 0xbc628141,
                        0x1e200874, 0xe9244379, 0x3e63aeca, 0x7a3b3cce, 0x73f8a245, 0xd734e215,
                        0x834fa434, 0xf96a0904, 0xfb39a424, 0x0bfa963a, 0x9b236ee2, 0xa2131005,
                        0x3eb70acf, 0x2907bcd8, 0x3f685f3a, 0x3765fd37, 0x1c1c34d2, 0x03a95179,
                        0x024be6c3, 0x06128960, 0x844e7490, 0xe2b371a3, 0x3382909c, 0x3d519a77,
                        0x90971ec9, 0x6ea745e5, 0x490b3a5c, 0x7f3916f7, 0xbc150351, 0x241a7ba0,
                        0xec93c2bb, 0x6c7083aa, 0xf3937751, 0xe6aa1df1, 0x129fc001, 0xb90709b9,
                        0x7e59a4fc, 0x4509e58a, 0x8a93ed43, 0x6934ce62, 0x8ec6af1a, 0xf36581a9,
                        0x53d01d93, 0xb34eef69, 0x08494a84, 0x0f6dff34, 0x74729aa3, 0x48b5475f,
                        0xb986dc84, 0xd0424c8d, 0xb72ad089, 0x0adbbdb8, 0x824fdbe8, 0x99ad1058,
                        0x98faec38, 0xe746242b, 0x2b7ee7fc, 0x2e151fa7, 0x6413270f, 0x68ed7239,
                        0x7729e2d3, 0x5697b3a5, 0x0b90a6c3, 0xdf7cefcf, 0xded46a48, 0x46956888,
                        0xb3bb6dc4, 0xe987578f, 0xf82e74b7, 0xc8eeeba4, 0xdd960ff9, 0x482ed28d,
                        0x4f343078, 0x563ab8a4, 0x3ec7aa0d, 0x2481d448, 0x5fe98704, 0x5aafc580,
                        0x841d81ec, 0xae7fe8fd, 0x6b31ccb6, 0x911ebdd4, 0x75f4703d, 0xe6855a0f,
                        0x6184b42e, 0x147a4a95, 0x39528e48, 0xe975b416, 0x3cba13d3, 0x1e23e544,
                        0xf7955286, 0xa5f96b7f, 0xaaa697aa, 0x29e794e3, 0x87628c09, 0xfeebf5f1,
                        0xf8b070cd, 0xe361b627, 0x8c7a8682, 0x69cab331, 0xca867ad1, 0xd0151a96,
                        0xfc19a6b9, 0x6d7439e7, 0x64cd62ac, 0x4a650747, 0x9ddbfa28, 0x337c8bed,
                        0xf12a6860, 0x3767ffd3, 0x13559ced, 0x71ac2011, 0xc11dc687, 0x260b7105,
                        0xc13bca0c, 0xcd0af893, 0x793b54e6, 0x89d27fc3, 0xc6bd1c88, 0xe3337313,
                        0x387bc671, 0x61280de4, 0x76941a36, 0xaa52a2b9, 0x6d7cb52c, 0x18ff4d70,
                        0x8987cf38, 0x306e47ed, 0xf7df8135, 0x18a8e024, 0xc9eb085f, 0xc1a7c769,
                        0xd5667a12, 0x9c8be93a, 0x028781b1, 0x6213dada, 0x07fef4f5, 0x5e6bf91d,
                        0x469ea798, 0xb9654a37, 0x1cb5e74e, 0x525d502d, 0xe805ec68, 0xdd8c4320,
                        0x7890848f, 0x61e59c8e, 0x1d99f9ef, 0x25b60b20, 0x2f198088, 0xe01b6926,
                        0xffa4917f, 0xb2fa0f22, 0xee8ac924, 0x18a1c5a7, 0xb76d8d7f, 0x88ad5e0d,
                        0x7b3fb12b, 0xc8a91add, 0x762a6f4e, 0x056fad31, 0xebecfab8, 0xea54cd17,
                        0x71f5af9f, 0xfaececa1, 0x08a52f4d, 0xbb5efebe, 0x5bcb04c2, 0xcb2530b0,
                        0x01bb862b, 0xbb5d54f0, 0x404deb4b, 0x038658bd, 0x09399005, 0xddd862c8,
                        0x8985776f, 0xcfcfd717, 0xbec756cb, 0x52aecc5a, 0x09ac3f62, 0x62c1c6fb,
                        0x76cc3221, 0xcde6d028, 0x844d9291, 0xc143eeac, 0x0ea5e772, 0x8855456e,
                        0xeb03a426, 0x3398475d, 0x73dc8107, 0x681605d0, 0xd18b6264, 0x934e43eb,
                        0x59e76d21, 0xd3ce2b77, 0x4ccfee1c, 0x2f4af76d, 0x8b12a309, 0x849bb415,
                        0xf45ad809, 0xc7bccae7, 0xac891c35, 0x59db2274, 0xbcd71393, 0x2c9b1705,
                        0xcb536a69, 0xb2800f00, 0x111313fc};

        while (i < this.fileContent.length) {
            UInteger hashValue = i == 0 ? uint(0) : hashValues[i - 1];
            int trailingEdgeData = i < hashWindowSize ? ubyte(0).intValue() :
                    ubyte(this.fileContent[i - hashWindowSize]).intValue();
            int leadingEdgeData = ubyte(this.fileContent[i]).intValue();
            UInteger val = hashValue.xor(uint(lookupTable[trailingEdgeData]))
                    .xor(uint(lookupTable[leadingEdgeData]));
            hashValues[i] =
                    val.leftShift(2).inclusiveOr(val.rightShift(Integer.SIZE - shiftAmount));
            i++;
        }

        return hashValues;
    }

    /**
     * Get the shift amount value.
     *
     * @param hashWindowSize The value of hash window size.
     * @return The value of shift amount.
     */
    private int getShiftAmount(int hashWindowSize) {
        int shiftAmount = 1;
        int i = 32;

        while (i > 0 && hashWindowSize % i != 0) {
            shiftAmount *= 2;
            i /= 2;
        }

        shiftAmount = shiftAmount % 32;

        return shiftAmount;
    }
}