TestChmExtraction.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.chm;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Tests that exercise {@link ChmParser} and {@link ChmExtractor} against the
 * CHM test documents: plain text extraction, per-entry extraction sanity
 * checks, and concurrent parsing.
 */
public class TestChmExtraction extends MultiThreadedTikaTest {

    /**
     * Finds an opening {@code <html} followed, anywhere later in the input
     * (including across line breaks), by a closing {@code </html>};
     * case-insensitive. Compiled once because {@link Pattern} is immutable
     * and safe to share.
     */
    private static final Pattern HTML_PAIR_PATTERN =
            Pattern.compile("\\Q<html\\E.+\\Q</html>\\E",
                    Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    private final Parser parser = new ChmParser();

    private final List<String> files =
            Arrays.asList("/test-documents/testChm.chm", "/test-documents/testChm2.chm",
                    "/test-documents/testChm3.chm");

    /**
     * Parses {@code TestParameters.chmData} and checks that a known sentence
     * shows up in the extracted text.
     */
    @Test
    public void testGetText() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        try (TikaInputStream tis = TikaInputStream.get(TestParameters.chmData)) {
            parser.parse(tis, handler, new Metadata(), new ParseContext());
        }
        assertTrue(handler.toString().contains("The TCard method accepts only numeric arguments"));
    }

    /**
     * Parses every file in {@link #files} and requires non-empty text for each.
     */
    @Test
    public void testChmParser() throws Exception {
        for (String fileName : files) {
            //testingChm takes ownership of the stream and closes it
            testingChm(getResourceAsStream(fileName));
        }
    }

    /**
     * Parses the given stream with the shared parser, asserts that some text
     * was extracted, and closes the stream.
     *
     * @param tis stream to parse; closed by this method
     */
    private void testingChm(TikaInputStream tis) throws IOException, SAXException, TikaException {
        try (tis) {
            //-1 disables the handler's write limit
            BodyContentHandler handler = new BodyContentHandler(-1);
            parser.parse(tis, handler, new Metadata(), new ParseContext());
            assertFalse(handler.toString().isEmpty());
        }
    }

    /**
     * Runs {@link #testExtractChmEntry(InputStream)} over every file in
     * {@link #files}.
     */
    @Test
    public void testExtractChmEntries() throws TikaException, IOException {
        for (String fileName : files) {
            try (TikaInputStream tis = getResourceAsStream(fileName)) {
                testExtractChmEntry(tis);
            }
        }
    }

    /**
     * @param textData bytes to scan
     * @return {@code true} if any byte equals zero, {@code false} otherwise
     *         (including for an empty array)
     */
    protected boolean findZero(byte[] textData) {
        for (byte b : textData) {
            if (b == 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param name entry name to check
     * @return {@code true} if every char is in the range 32..126 inclusive,
     *         i.e. the name has no control chars, no DEL and no non-ascii chars
     */
    protected boolean niceAscFileName(String name) {
        for (char c : name.toCharArray()) {
            if (c >= 127 || c < 32) {
                //non-ascii char or control char
                return false;
            }
        }

        return true;
    }

    /**
     * Extracts every directory-listing entry of a CHM stream and validates it:
     * entry names must pass {@link #niceAscFileName(String)} and be unique
     * ignoring case, and entries ending in .html/.htm/.hhk/.hhc must contain no
     * zero byte and must match {@link #HTML_PAIR_PATTERN}.
     *
     * @param stream CHM content; not closed by this method
     * @throws TikaException if any of the checks above fails
     */
    protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
        ChmExtractor chmExtractor = new ChmExtractor(stream);
        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();

        //lower-cased names seen so far, for case-insensitive duplicate detection
        Set<String> seenNames = new HashSet<>();

        for (DirectoryListingEntry entry : entries.getDirectoryListingEntryList()) {
            byte[] data = chmExtractor.extractChmEntry(entry);
            final String entryName = entry.getName();

            //Entry names should be nice. Disable this if the test chm do have bad looking but
            // valid entry names.
            if (!niceAscFileName(entryName)) {
                throw new TikaException(
                        "Warning: File name contains a non ascii char : " + entryName);
            }

            final String lowName = entryName.toLowerCase(Locale.ROOT);

            //Set.add returns false when the name has been seen before
            if (!seenNames.add(lowName)) {
                throw new TikaException("Duplicate File name detected : " + entryName);
            }

            boolean isMarkup = lowName.endsWith(".html") || lowName.endsWith(".htm") ||
                    lowName.endsWith(".hhk") || lowName.endsWith(".hhc");
            if (!isMarkup) {
                continue;
            }

            if (findZero(data)) {
                throw new TikaException("Xhtml/text file contains '\\0' : " + entryName);
            }

            //validate html
            String html = new String(data, ISO_8859_1);
            if (!HTML_PAIR_PATTERN.matcher(html).find()) {
                System.err.println(lowName + " is invalid.");
                System.err.println(html);
                throw new TikaException("Invalid xhtml file : " + entryName);
            }
        }
    }


    /**
     * Parses every file in {@link #files} on {@code TestParameters.NTHREADS}
     * threads at once, sharing one parser instance, and requires non-empty
     * text from every parse.
     * <p>
     * Tasks are submitted as callables and their futures are awaited so that
     * an exception or assertion failure raised on a worker thread fails the
     * test instead of only being printed.
     *
     * @throws InterruptedException if the test thread is interrupted while
     *                              waiting for the workers
     */
    @Test //TODO: redo with new MultiThreadedTikaTest
    public void testMultiThreadedChmExtraction() throws InterruptedException {
        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
        List<Future<?>> futures = new ArrayList<>();
        try {
            for (int i = 0; i < TestParameters.NTHREADS; i++) {
                futures.add(executor.submit(() -> {
                    for (String fileName : files) {
                        try (TikaInputStream tis = getResourceAsStream(fileName)) {
                            BodyContentHandler handler = new BodyContentHandler(-1);
                            parser.parse(tis, handler, new Metadata(), new ParseContext());
                            assertFalse(handler.toString().isEmpty());
                        }
                    }
                    return null;
                }));
            }
            // Waits until all threads have finished, surfacing the first failure
            for (Future<?> future : futures) {
                try {
                    future.get();
                } catch (ExecutionException e) {
                    throw new AssertionError("CHM extraction failed on a worker thread",
                            e.getCause());
                }
            }
        } finally {
            //no-op when every task completed; cancels leftovers after a failure
            executor.shutdownNow();
        }
    }

    /**
     * Parses every file found in the {@code /test-documents/chm/} resource
     * folder and requires non-empty text for each.
     */
    @Test
    public void test_TIKA_1446() throws Exception {
        URL chmDir = getResourceAsUrl("/test-documents/chm/");
        File chmFolder = new File(chmDir.toURI());
        //File.list() returns null when the path is not a directory or on I/O error
        String[] fileNames = chmFolder.list();
        assertNotNull(fileNames, "Could not list directory: " + chmFolder);
        for (String fileName : fileNames) {
            File file = new File(chmFolder, fileName);
            //testingChm closes the stream
            testingChm(TikaInputStream.get(file.toPath()));
        }
    }

    /**
     * Requires that parsing {@code testChm_oom.chm} fails with a
     * {@link TikaException}.
     */
    @Test
    public void testOOM() {
        assertThrows(TikaException.class, () -> {
            getXML("testChm_oom.chm");
        });
    }

    /**
     * Runs the inherited multi-threaded harness over the {@code .chm} test
     * files, excluding {@code testChm2.chm} and {@code testChm_oom.chm}.
     */
    @Test
    public void testMultiThreaded() throws Exception {
        ParseContext[] parseContexts = new ParseContext[10];
        for (int i = 0; i < parseContexts.length; i++) {
            parseContexts[i] = new ParseContext();
        }

        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
        testMultiThreaded(wrapper, parseContexts, 10, 10, pathname -> {
            String name = pathname.getName();
            if (!name.toLowerCase(Locale.ROOT).endsWith(".chm")) {
                return false;
            }
            //testChm2.chm is a beast, skip it.
            //testChm_oom.chm throws an exception in the baseline and then
            //isn't included in the actual tests.
            //If we do want to include it we need to change the way
            //MultiThreadedTikaTest handles files that throw exceptions
            return !name.equals("testChm2.chm") && !name.equals("testChm_oom.chm");
        });
    }
}