// MultipleParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.multiple;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.fail;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.junit.jupiter.api.Test;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.DummyParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ErrorParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.utils.ParserUtils;

public class MultipleParserTest {
    /**
     * Tests how {@link AbstractMultipleParser} works out which
     * mime types to offer, based on the types of the parsers
     */
    @Test
    public void testMimeTypeSupported() {
        // TODO
        // Some media types
        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
        Set<MediaType> octAndText =
                new HashSet<>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
        // TODO One with a subtype
    }

    /**
     * Test {@link FallbackParser}
     */
    @Test
    public void testFallback() throws Exception {
        ParseContext context = new ParseContext();
        BodyContentHandler handler;
        Metadata metadata;
        Parser p;
        String[] usedParsers;

        // Some media types
        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);

        // Some parsers
        ErrorParser pFail = new ErrorParser();
        DummyParser pContent =
                new DummyParser(onlyOct, new HashMap<>(), "Fell back!");
        EmptyParser pNothing = new EmptyParser();


        // With only one parser defined, works as normal
        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back!", handler.toString());

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(1, usedParsers.length);
        assertEquals(DummyParser.class.getName(), usedParsers[0]);


        // With a failing parser, will go to the working one
        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back!", handler.toString());

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(2, usedParsers.length);
        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
        assertEquals(DummyParser.class.getName(), usedParsers[1]);

        // Check we got an exception
        assertNotNull(metadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
        assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER));
        assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER));


        // Won't go past a working parser to a second one, stops after one works
        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent, pNothing);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back!", handler.toString());

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(2, usedParsers.length);
        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
        assertEquals(DummyParser.class.getName(), usedParsers[1]);


        // TODO Check merge policies - First vs Discard
    }

    /**
     * Test for {@link SupplementingParser}
     */
    @Test
    public void testSupplemental() throws Exception {
        ParseContext context = new ParseContext();
        BodyContentHandler handler;
        Metadata metadata;
        Parser p;
        String[] usedParsers;

        // Some media types
        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);

        // Some test metadata
        Map<String, String> m1 = new HashMap<>();
        m1.put("T1", "Test1");
        m1.put("TBoth", "Test1");
        Map<String, String> m2 = new HashMap<>();
        m2.put("T2", "Test2");
        m2.put("TBoth", "Test2");

        // Some parsers
        ErrorParser pFail = new ErrorParser();
        DummyParser pContent1 = new DummyParser(onlyOct, m1, "Fell back 1!");
        DummyParser pContent2 = new DummyParser(onlyOct, m2, "Fell back 2!");
        EmptyParser pNothing = new EmptyParser();


        // Supplemental doesn't support DISCARD
        try {
            new SupplementingParser(null, MetadataPolicy.DISCARD_ALL);
            fail("Discard shouldn't be supported");
        } catch (IllegalArgumentException e) {
            //swallow
        }


        // With only one parser defined, works as normal
        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back 1!", handler.toString());

        assertEquals("Test1", metadata.get("T1"));
        assertEquals("Test1", metadata.get("TBoth"));

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(1, usedParsers.length);
        assertEquals(DummyParser.class.getName(), usedParsers[0]);


        // Check the First, Last and All policies:
        // First Wins
        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2,
                pNothing);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back 1!Fell back 2!", handler.toString());

        assertEquals("Test1", metadata.get("T1"));
        assertEquals("Test2", metadata.get("T2"));
        assertEquals("Test1", metadata.get("TBoth"));

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(3, usedParsers.length);
        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
        assertEquals(DummyParser.class.getName(), usedParsers[1]);
        assertEquals(EmptyParser.class.getName(), usedParsers[2]);


        // Last Wins
        p = new SupplementingParser(null, MetadataPolicy.LAST_WINS, pFail, pContent1, pContent2,
                pNothing);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back 1!Fell back 2!", handler.toString());

        assertEquals("Test1", metadata.get("T1"));
        assertEquals("Test2", metadata.get("T2"));
        assertEquals("Test2", metadata.get("TBoth"));

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(3, usedParsers.length);
        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
        assertEquals(DummyParser.class.getName(), usedParsers[1]);
        assertEquals(EmptyParser.class.getName(), usedParsers[2]);


        // Merge
        p = new SupplementingParser(null, MetadataPolicy.KEEP_ALL, pFail, pContent1, pContent2,
                pNothing);

        metadata = new Metadata();
        handler = new BodyContentHandler();
        p.parse(TikaInputStream.get(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
        assertEquals("Fell back 1!Fell back 2!", handler.toString());

        assertEquals("Test1", metadata.get("T1"));
        assertEquals("Test2", metadata.get("T2"));
        assertEquals(2, metadata.getValues("TBoth").length);
        assertEquals("Test1", metadata.getValues("TBoth")[0]);
        assertEquals("Test2", metadata.getValues("TBoth")[1]);

        usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
        assertEquals(3, usedParsers.length);
        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
        assertEquals(DummyParser.class.getName(), usedParsers[1]);
        assertEquals(EmptyParser.class.getName(), usedParsers[2]);


        // Check the error details always come through, no matter the policy
        // TODO


        // Check that each parser gets its own ContentHandler if a factory was given
        // TODO
    }
}