TikaParserConfigTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Set;

import org.junit.jupiter.api.Test;

import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.executable.ExecutableParser;
import org.apache.tika.parser.xml.XMLParser;

/**
 * JUnit test class for parser configuration via JSON,
 * covering things that require the full set of parsers.
 */
public class TikaParserConfigTest extends TikaTest {

    /** Shared parse context handed to every {@code getSupportedTypes} call in these tests. */
    protected static ParseContext context = new ParseContext();

    /**
     * Builds a {@link TikaLoader} from a JSON config resource resolved relative to this class.
     *
     * @param config resource name of the config file, e.g. {@code "TIKA-1558-exclude.json"}
     * @return the loader created from the resolved file
     * @throws Exception if the resource URL cannot be turned into a path or loading fails
     */
    private TikaLoader getLoader(String config) throws Exception {
        java.net.URL resource = TikaParserConfigTest.class.getResource(config);
        // getResource yields null for a missing resource; report the name instead of a bare NPE.
        assertNotNull(resource, "Test config not found on classpath: " + config);
        Path path = Paths.get(resource.toURI());
        return TikaLoader.load(path);
    }

    /**
     * Searches a parser tree for the first parser that is an instance of {@code type}.
     * The search descends into the components of every {@link CompositeParser} and into the
     * wrapped parser of every {@link ParserDecorator}, at any nesting depth.
     *
     * @param root parser to start from; it is itself a candidate match
     * @param type parser class to look for (subclasses match too)
     * @return the first matching parser, or {@code null} when the tree contains none
     */
    private static Parser findParser(Parser root, Class<? extends Parser> type) {
        if (type.isInstance(root)) {
            return root;
        }
        if (root instanceof CompositeParser) {
            for (Parser child : ((CompositeParser) root).getAllComponentParsers()) {
                Parser match = findParser(child, type);
                if (match != null) {
                    return match;
                }
            }
        } else if (root instanceof ParserDecorator) {
            return findParser(((ParserDecorator) root).getWrappedParser(), type);
        }
        return null;
    }

    /**
     * Checks the parser tree produced by {@code TIKA-1558-exclude.json}: two decorated
     * components, the first restricted to PDF and the second offering neither PDF nor JPEG.
     */
    @Test
    public void testMimeExcludeInclude() throws Exception {
        TikaLoader loader = getLoader("TIKA-1558-exclude.json");
        Parser parser = loader.loadParsers();
        assertNotNull(parser);
        assertNotNull(loader.loadDetectors());

        MediaType pdf = MediaType.application("pdf");
        MediaType jpeg = MediaType.image("jpeg");

        // Has two parsers: EmptyParser (decorated) and CompositeParser of SPI parsers (decorated)
        assertEquals(CompositeParser.class, parser.getClass());
        List<Parser> components = ((CompositeParser) parser).getAllComponentParsers();
        assertEquals(2, components.size());

        // First parser should be EmptyParser decorated with mimeInclude for PDF
        Parser first = components.get(0);
        assertTrue(first instanceof ParserDecorator, "First parser should be decorated");
        ParserDecorator firstDecorator = (ParserDecorator) first;
        assertEquals(EmptyParser.class, firstDecorator.getWrappedParser().getClass());
        Set<MediaType> firstTypes = firstDecorator.getSupportedTypes(context);
        assertContains(pdf, firstTypes);
        assertEquals(1, firstTypes.size());

        // Second parser should be SPI parsers decorated with mimeExclude for PDF/JPEG
        Parser second = components.get(1);
        assertTrue(second instanceof ParserDecorator, "Second parser should be decorated");
        Set<MediaType> secondTypes = ((ParserDecorator) second).getSupportedTypes(context);
        assertNotContained(pdf, secondTypes);
        assertNotContained(jpeg, secondTypes);
    }

    /**
     * Checks that the parser loaded from {@code TIKA-1558-exclude.json} does not offer the
     * executable media types that a freshly constructed {@link DefaultParser} does offer.
     */
    @Test
    public void testParserExcludeFromDefault() throws Exception {
        TikaLoader loader = getLoader("TIKA-1558-exclude.json");
        Parser parser = loader.loadParsers();
        assertNotNull(parser);

        MediaType peExe = MediaType.application("x-msdownload");
        MediaType elf = MediaType.application("x-elf");

        // Get a fresh "default" DefaultParser for comparison
        DefaultParser normParser = new DefaultParser(TikaLoader.getMediaTypeRegistry());

        // The default one will offer the Executable Parser
        Set<MediaType> defaultTypes = normParser.getSupportedTypes(context);
        assertContains(peExe, defaultTypes);
        assertContains(elf, defaultTypes);

        boolean hasExec = normParser.getParsers().values().stream()
                .anyMatch(p -> p instanceof ExecutableParser);
        assertTrue(hasExec, "Default parser should include an ExecutableParser.");

        // The config-loaded parser should NOT support executable types
        // (ExecutableParser was excluded). No cast is needed: getSupportedTypes is
        // available on the Parser type itself.
        Set<MediaType> supportedTypes = parser.getSupportedTypes(context);
        assertNotContained(peExe, supportedTypes);
        assertNotContained(elf, supportedTypes);
    }

    /**
     * TIKA-1558 It should be possible to exclude Parsers from being picked up by
     * DefaultParser.
     * <p>
     * Both the default and the custom parser trees are searched with the same recursive
     * traversal, so an XMLParser hidden behind decorators or nested composites is found
     * regardless of depth.
     */
    @Test
    public void defaultParserExclude() throws Exception {
        // First verify default config includes XMLParser
        Parser defaultParsers = TikaLoader.loadDefault().loadParsers();
        assertTrue(defaultParsers instanceof CompositeParser,
                "Default config should load a CompositeParser.");
        assertTrue(findParser(defaultParsers, XMLParser.class) != null,
                "Default config should include an XMLParser.");

        // This custom config should exclude XMLParser
        Parser customParsers = getLoader("TIKA-1558-excludesub.json").loadParsers();
        assertTrue(customParsers instanceof CompositeParser,
                "Custom config should load a CompositeParser.");

        Parser offender = findParser(customParsers, XMLParser.class);
        if (offender != null) {
            fail("Custom config should not include an XMLParser (" + offender.getClass() + ").");
        }
    }

    /**
     * Checks that the default loader yields a non-empty {@link CompositeParser}.
     */
    @Test
    public void testDefaultLoaderIncludesAllParsers() throws Exception {
        Parser parser = TikaLoader.loadDefault().loadParsers();
        assertNotNull(parser);
        assertTrue(parser instanceof CompositeParser);

        // Should have many parsers loaded from SPI
        List<Parser> components = ((CompositeParser) parser).getAllComponentParsers();
        assertFalse(components.isEmpty());
    }
}