AbstractPOIContainerExtractionTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;

import java.net.URL;

import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * Parent class of tests that the various POI powered parsers are
 * able to extract their embedded contents.
 * <p>
 * Declares {@link MediaType} constants for Office, text/PDF and image formats,
 * and offers helpers to open a test document from the classpath and to run a
 * {@link ContainerExtractor} over it.
 */
public abstract class AbstractPOIContainerExtractionTest extends TikaTest {
    // Microsoft Office document types
    public static final MediaType TYPE_DOC = MediaType.application("msword");
    public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
    public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
    public static final MediaType TYPE_DOCX =
            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
    public static final MediaType TYPE_PPTX =
            MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
    public static final MediaType TYPE_XLSX =
            MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");

    // Plain text and PDF
    public static final MediaType TYPE_TXT = MediaType.text("plain");
    public static final MediaType TYPE_PDF = MediaType.application("pdf");

    // Image types
    public static final MediaType TYPE_JPG = MediaType.image("jpeg");
    public static final MediaType TYPE_GIF = MediaType.image("gif");
    public static final MediaType TYPE_PNG = MediaType.image("png");
    public static final MediaType TYPE_EMF = MediaType.image("emf");
    public static final MediaType TYPE_WMF = MediaType.image("wmf");

    /**
     * Opens a test document located under {@code /test-documents/} on the classpath.
     * <p>
     * The test fails (via {@code assertNotNull}) if no such resource exists.
     *
     * @param filename name of the resource, relative to {@code /test-documents/}
     * @return a {@link TikaInputStream} over the resource; the caller is expected
     *         to close it
     * @throws Exception if the stream cannot be opened
     */
    protected static TikaInputStream getTestFile(String filename) throws Exception {
        URL input =
                AbstractPOIContainerExtractionTest.class.getResource("/test-documents/" + filename);
        assertNotNull(input, filename + " not found");

        return TikaInputStream.get(input);
    }

    /**
     * Runs the given extractor over a test document and returns the handler that
     * was passed to it, so that callers can check what happened.
     * <p>
     * Before extracting, asserts that the extractor reports the document as
     * supported.
     *
     * @param filename  name of the test document, resolved by {@link #getTestFile(String)}
     * @param extractor the extractor under test
     * @param recurse   if {@code true}, {@code extractor} is also handed to
     *                  {@code extract} as its second argument; otherwise
     *                  {@code null} is passed there
     * @return the {@code TrackingHandler} that was given to the extractor
     * @throws Exception if opening the document or the extraction fails
     */
    protected TrackingHandler process(String filename, ContainerExtractor extractor,
                                      boolean recurse) throws Exception {
        ParseContext context = new ParseContext();
        try (TikaInputStream tis = getTestFile(filename)) {
            // Name the file and extractor on failure; a bare "expected: <true>" does
            // not tell which test document was rejected.
            assertEquals(true, extractor.isSupported(tis, context),
                    filename + " not supported by " + extractor.getClass().getName());

            // Process it, handing the extractor to itself only when recursion is wanted
            TrackingHandler handler = new TrackingHandler();
            ContainerExtractor recurseExtractor = recurse ? extractor : null;
            extractor.extract(tis, recurseExtractor, handler, context);

            // So they can check what happened
            return handler;
        }
    }
}