ChmExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.chm;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.microsoft.chm.ChmCommons.EntryType;

/**
 * Extracts text from chm file. Enumerates chm entries.
 * <p>
 * The constructor reads the whole input stream into memory and parses the ITSF and ITSP
 * headers, the directory listing, the LZXC control data and the LZXC reset table from it.
 * Individual entries are then extracted on demand with
 * {@link #extractChmEntry(DirectoryListingEntry)}.
 * <p>
 * An instance keeps mutable state between calls (a list of already created LZX blocks and
 * the most recent {@link ChmBlockInfo}); this class performs no synchronization on it.
 */
public class ChmExtractor {

    private static final Logger LOG = LoggerFactory.getLogger(ChmExtractor.class);

    /* LZX blocks created by previous extractChmEntry calls, reused as starting points */
    private List<ChmLzxBlock> lzxBlocksCache = null;
    private ChmDirectoryListingSet chmDirList = null;
    private ChmItsfHeader chmItsfHeader = null;
    private ChmItspHeader chmItspHeader = null;
    private ChmLzxcResetTable chmLzxcResetTable = null;
    private ChmLzxcControlData chmLzxcControlData = null;
    /* Complete content of the chm file */
    private byte[] data = null;
    private int indexOfContent;
    private long lzxBlockOffset;
    private long lzxBlockLength;
    /*
     * Assigned the first time extractChmEntry handles a compressed entry; the previous
     * instance is handed back to ChmBlockInfo.getChmBlockInfoInstance on later calls.
     */
    private ChmBlockInfo chmBlockInfo = null;

    /**
     * Reads the given stream completely and parses the chm structures needed for extraction.
     * <p>
     * NOTE(review): an {@link IOException} raised inside the parsing section is caught and
     * only logged at WARN level, so the constructor returns normally with whatever fields
     * had been set up to that point; later calls on such an instance will likely fail.
     * Confirm whether callers rely on this before changing it.
     *
     * @param is stream holding the chm file; checked with
     *           {@link ChmAssert#assertInputStreamNotNull(InputStream)}
     * @throws TikaException propagated from the assertion and parsing helpers called here
     * @throws IOException   declared for callers; see the note above about the internal catch
     */
    public ChmExtractor(InputStream is) throws TikaException, IOException {
        ChmAssert.assertInputStreamNotNull(is);
        try {
            setData(IOUtils.toByteArray(is));

            /* Creates and parses chm itsf header */
            setChmItsfHeader(new ChmItsfHeader());
            getChmItsfHeader()
                    .parse(ChmCommons.copyOfRange(getData(), 0,
                            ChmConstants.CHM_ITSF_V3_LEN - 1),
                            getChmItsfHeader());

            /* Creates and parses chm itsp header, located at the itsf directory offset */
            setChmItspHeader(new ChmItspHeader());
            getChmItspHeader().parse(ChmCommons
                            .copyOfRange(getData(), (int) getChmItsfHeader().getDirOffset(),
                                    (int) getChmItsfHeader().getDirOffset() +
                                            ChmConstants.CHM_ITSP_V1_LEN),
                    getChmItspHeader());

            /* Creates instance of ChmDirListingContainer */
            setChmDirList(
                    new ChmDirectoryListingSet(getData(), getChmItsfHeader(), getChmItspHeader()));

            int indexOfControlData = getChmDirList().getControlDataIndex();
            int indexOfResetData =
                    ChmCommons.indexOfResetTableBlock(getData(), ChmConstants.LZXC.getBytes(UTF_8));
            /* Stays null when no LZXC marker was found at a positive index */
            byte[] dirChunk = null;
            if (indexOfResetData > 0) {
                dirChunk = ChmCommons.copyOfRange(getData(), indexOfResetData, indexOfResetData +
                        getChmDirList().getDirectoryListingEntryList().get(indexOfControlData)
                                .getLength());
            }

            /* Creates and parses chm control data */
            setChmLzxcControlData(new ChmLzxcControlData());
            getChmLzxcControlData().parse(dirChunk, getChmLzxcControlData());

            int indexOfResetTable = getChmDirList().getResetTableIndex();
            setChmLzxcResetTable(new ChmLzxcResetTable());

            int startIndex = (int) getChmDirList().getDataOffset() +
                    getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable)
                            .getOffset();

            // assert startIndex < data.length
            ChmAssert.assertCopyingDataIndex(startIndex, getData().length);

            dirChunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex +
                    getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable)
                            .getLength());

            getChmLzxcResetTable().parse(dirChunk, getChmLzxcResetTable());

            /* Locates the content entry and remembers where the LZX data starts and its size */
            setIndexOfContent(ChmCommons
                    .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(),
                            ChmConstants.CONTENT));
            setLzxBlockOffset(
                    (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent())
                            .getOffset() + getChmItsfHeader().getDataOffset()));
            setLzxBlockLength(
                    getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent())
                            .getLength());

            setLzxBlocksCache(new ArrayList<>());

        } catch (IOException e) {
            LOG.warn("IOException parsing chm file", e);
        }
    }

    /**
     * Returns lzxc control data.
     *
     * @return ChmLzxcControlData
     */
    private ChmLzxcControlData getChmLzxcControlData() {
        return chmLzxcControlData;
    }

    /**
     * Sets lzxc control data
     *
     * @param chmLzxcControlData control data instance to keep
     */
    private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
        this.chmLzxcControlData = chmLzxcControlData;
    }

    /**
     * Returns the itsp header.
     *
     * @return ChmItspHeader
     */
    private ChmItspHeader getChmItspHeader() {
        return chmItspHeader;
    }

    /**
     * Sets the itsp header
     *
     * @param chmItspHeader header instance to keep
     */
    private void setChmItspHeader(ChmItspHeader chmItspHeader) {
        this.chmItspHeader = chmItspHeader;
    }

    /**
     * Returns lzxc reset table
     *
     * @return ChmLzxcResetTable
     */
    private ChmLzxcResetTable getChmLzxcResetTable() {
        return chmLzxcResetTable;
    }

    /**
     * Sets lzxc reset table
     *
     * @param chmLzxcResetTable reset table instance to keep
     */
    private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
        this.chmLzxcResetTable = chmLzxcResetTable;
    }

    /**
     * Returns the length of the lzx content, as read from the content directory entry
     *
     * @return lzxBlockLength
     */
    private long getLzxBlockLength() {
        return lzxBlockLength;
    }

    /**
     * Sets the length of the lzx content
     *
     * @param lzxBlockLength length to keep
     */
    private void setLzxBlockLength(long lzxBlockLength) {
        this.lzxBlockLength = lzxBlockLength;
    }

    /**
     * Returns the offset of the lzx content (content entry offset plus itsf data offset)
     *
     * @return lzxBlockOffset
     */
    private long getLzxBlockOffset() {
        return lzxBlockOffset;
    }

    /**
     * Sets the offset of the lzx content
     *
     * @param lzxBlockOffset offset to keep
     */
    private void setLzxBlockOffset(long lzxBlockOffset) {
        this.lzxBlockOffset = lzxBlockOffset;
    }

    private int getIndexOfContent() {
        return indexOfContent;
    }

    private void setIndexOfContent(int indexOfContent) {
        this.indexOfContent = indexOfContent;
    }

    private byte[] getData() {
        return data;
    }

    private void setData(byte[] data) {
        this.data = data;
    }

    /**
     * Enumerates chm entities
     *
     * @return list with the name of every entry in the directory listing, in listing order
     */
    public List<String> enumerateChm() {
        List<String> listOfEntries = new ArrayList<>();
        for (DirectoryListingEntry directoryListingEntry : getChmDirList()
                .getDirectoryListingEntryList()) {
            listOfEntries.add(directoryListingEntry.getName());
        }
        return listOfEntries;
    }

    /**
     * Decompresses a chm entry
     * <p>
     * Uncompressed entries with a positive length are copied straight out of the file data;
     * compressed entries are decoded block by block. Entries for which
     * {@link ChmCommons#hasSkip(DirectoryListingEntry)} is true, and entries of any other
     * kind, yield an empty array.
     *
     * @param directoryListingEntry entry to extract
     * @return decompressed data, possibly empty
     * @throws TikaException if a compressed entry yields a different number of bytes than
     *                       the entry declares, or if any other failure occurs during
     *                       extraction (the original exception is attached as the cause)
     */
    public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry)
            throws TikaException {
        UnsynchronizedByteArrayOutputStream buffer =
                UnsynchronizedByteArrayOutputStream.builder().get();
        try {
            if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED &&
                    directoryListingEntry.getLength() > 0 &&
                    !ChmCommons.hasSkip(directoryListingEntry)) {
                /* UNCOMPRESSED type is easiest one */
                int dataOffset = (int) (getChmItsfHeader().getDataOffset() +
                        directoryListingEntry.getOffset());
                buffer.write(ChmCommons.copyOfRange(getData(), dataOffset,
                        dataOffset + directoryListingEntry.getLength()));
            } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED &&
                    !ChmCommons.hasSkip(directoryListingEntry)) {
                extractCompressedEntry(directoryListingEntry, buffer);
            }
        } catch (TikaException e) {
            // Already the declared type: rethrow as is instead of wrapping it in itself.
            throw e;
        } catch (Exception e) {
            // Keep the cause so the original stack trace is not lost.
            throw new TikaException(e.getMessage(), e);
        }

        return buffer.toByteArray();
    }

    /**
     * Decodes a compressed entry into the given buffer.
     * <p>
     * Decoding starts from the closest usable block found in the block cache, or from the
     * entry's initial block when the cache holds none, and walks forward one block at a time
     * up to the entry's end block. Every newly created block is appended to the cache.
     *
     * @param directoryListingEntry compressed entry to decode
     * @param buffer                receives the decoded bytes
     * @throws IOException   if writing to the buffer fails
     * @throws TikaException if block creation fails or the decoded length does not match
     *                       the length declared by the entry
     */
    private void extractCompressedEntry(DirectoryListingEntry directoryListingEntry,
                                        UnsynchronizedByteArrayOutputStream buffer)
            throws IOException, TikaException {
        /* Gets a chm block info */
        chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(directoryListingEntry,
                (int) getChmLzxcResetTable().getBlockLen(), getChmLzxcControlData(),
                chmBlockInfo);

        // Offsets and lengths are narrowed to int below, so larger values are not decoded.
        if ((getLzxBlockLength() < Integer.MAX_VALUE) &&
                (getLzxBlockOffset() < Integer.MAX_VALUE)) {
            ChmLzxBlock lzxBlock;
            int start;

            int cachedIndex = findNearestCachedBlock();
            if (cachedIndex < 0) {
                // Nothing usable in the cache: begin at the entry's initial block.
                start = chmBlockInfo.getIniBlock();

                byte[] dataSegment = ChmCommons
                        .getChmBlockSegment(getData(), getChmLzxcResetTable(), start,
                                (int) getLzxBlockOffset(), (int) getLzxBlockLength());

                lzxBlock = new ChmLzxBlock(start, dataSegment,
                        getChmLzxcResetTable().getBlockLen(), null);

                getLzxBlocksCache().add(lzxBlock);
            } else {
                lzxBlock = getLzxBlocksCache().get(cachedIndex);
                start = lzxBlock.getBlockNumber();
            }

            for (int i = start; i <= chmBlockInfo.getEndBlock(); ) {
                if (i == chmBlockInfo.getStartBlock() && i == chmBlockInfo.getEndBlock()) {
                    // Entry lies entirely inside one block.
                    buffer.write(lzxBlock.getContent(chmBlockInfo.getStartOffset(),
                            chmBlockInfo.getEndOffset()));
                    break;
                }

                if (i == chmBlockInfo.getStartBlock()) {
                    buffer.write(lzxBlock.getContent(chmBlockInfo.getStartOffset()));
                }

                if (i > chmBlockInfo.getStartBlock() && i < chmBlockInfo.getEndBlock()) {
                    buffer.write(lzxBlock.getContent());
                }

                if (i == chmBlockInfo.getEndBlock()) {
                    buffer.write(lzxBlock.getContent(0, chmBlockInfo.getEndOffset()));
                    break;
                }

                i++;

                // A block whose number is a multiple of the reset interval is created
                // without a predecessor; any other block is given the previous block.
                ChmLzxBlock previousBlock =
                        (i % getChmLzxcControlData().getResetInterval() == 0) ? null : lzxBlock;
                lzxBlock = new ChmLzxBlock(i, ChmCommons
                        .getChmBlockSegment(getData(), getChmLzxcResetTable(), i,
                                (int) getLzxBlockOffset(), (int) getLzxBlockLength()),
                        getChmLzxcResetTable().getBlockLen(), previousBlock);

                getLzxBlocksCache().add(lzxBlock);
            }

            // Drop the cache once it holds more blocks than the reset table's block count.
            if (getLzxBlocksCache().size() > getChmLzxcResetTable().getBlockCount()) {
                getLzxBlocksCache().clear();
            }
        }

        if (buffer.size() != directoryListingEntry.getLength()) {
            throw new TikaException("CHM file extract error: extracted Length is wrong.");
        }
    }

    /**
     * Searches the block cache for the block to resume decoding from for the current
     * {@link #chmBlockInfo}: among cached blocks whose number lies between the initial block
     * and the start block (both inclusive), the one with the highest non-negative number
     * wins. The scan stops early once the start block itself has been found.
     *
     * @return index into the block cache of the chosen block, or -1 when none qualifies
     */
    private int findNearestCachedBlock() {
        List<ChmLzxBlock> cache = getLzxBlocksCache();
        int bestBlockNumber = -1;
        int bestIndex = -1;
        for (int idx = 0; idx < cache.size(); idx++) {
            int blockNumber = cache.get(idx).getBlockNumber();
            if (blockNumber >= chmBlockInfo.getIniBlock() &&
                    blockNumber <= chmBlockInfo.getStartBlock() &&
                    blockNumber > bestBlockNumber) {
                bestBlockNumber = blockNumber;
                bestIndex = idx;
            }
            if (bestBlockNumber == chmBlockInfo.getStartBlock()) {
                break;
            }
        }
        return bestIndex;
    }

    private List<ChmLzxBlock> getLzxBlocksCache() {
        return lzxBlocksCache;
    }

    private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
        this.lzxBlocksCache = lzxBlocksCache;
    }

    /**
     * Returns the directory listing parsed from the chm file.
     *
     * @return ChmDirectoryListingSet
     */
    public ChmDirectoryListingSet getChmDirList() {
        return chmDirList;
    }

    private void setChmDirList(ChmDirectoryListingSet chmDirList) {
        this.chmDirList = chmDirList;
    }

    private ChmItsfHeader getChmItsfHeader() {
        return chmItsfHeader;
    }

    private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
        this.chmItsfHeader = chmItsfHeader;
    }
}