// EmbeddedDocumentUtil.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.extractor;


import java.io.IOException;
import java.io.Serializable;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.StatefulParser;
import org.apache.tika.utils.ExceptionUtils;

/**
 * Utility class to handle common issues with embedded documents.
 * <p>
 * Use statically if all that is needed is getting the EmbeddedDocumentExtractor.
 * Otherwise, instantiate an instance.
 * <p>
 * Note: This is not thread safe.  Make sure to instantiate one per thread.
 */
public class EmbeddedDocumentUtil implements Serializable {

    /** Prefix carried by internal OCR routing image types, e.g. {@code image/ocr-png}. */
    private static final String OCR_IMAGE_PREFIX = "image/ocr-";

    /** Prefix that replaces {@link #OCR_IMAGE_PREFIX} when normalizing a media type. */
    private static final String IMAGE_PREFIX = "image/";

    private final ParseContext context;
    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
    //these are lazily initialized and can be null
    private MimeTypes mimeTypes;
    private Detector detector;

    /**
     * Creates a utility bound to the given context.  The extractor is resolved
     * immediately via {@link #getEmbeddedDocumentExtractor(ParseContext)}, which may
     * register a parser and an extractor in the context.
     *
     * @param context the parse context used for all lookups made by this instance
     */
    public EmbeddedDocumentUtil(ParseContext context) {
        this.context = context;
        this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
    }

    /**
     * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
     * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
     * embedded documents if no Parser.class is specified in the ParseContext.
     * <p>
     * If you'd prefer not to parse embedded documents, set Parser.class
     * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
     * <p>
     * When the context holds no extractor, a new
     * {@link ParsingEmbeddedDocumentExtractor} is created, stored in the context
     * and returned.
     *
     * @param context the parse context to read from and, if needed, to update
     * @return the extractor already in the context, or the newly registered one
     */
    public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
        if (extractor != null) {
            return extractor;
        }
        //ensure that an AutoDetectParser is
        //available for parsing embedded docs TIKA-2096
        Parser embeddedParser = context.get(Parser.class);
        if (embeddedParser == null) {
            context.set(Parser.class, new AutoDetectParser());
        }
        EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context);
        context.set(EmbeddedDocumentExtractor.class, ex);
        return ex;
    }

    /**
     * Utility function to get the Parser that was sent in to the
     * ParseContext to handle embedded documents.  If it is stateful,
     * unwrap it to get its stateless delegating parser.
     * <p>
     * If there is no Parser in the parser context, this will return null.
     *
     * @param context the parse context to read the parser from
     * @return the parser from the context, the wrapped parser if the context's
     *         parser is a {@link StatefulParser}, or {@code null} if none is set
     */
    public static Parser getStatelessParser(ParseContext context) {
        Parser p = context.get(Parser.class);
        if (p == null) {
            return null;
        }
        if (p instanceof StatefulParser) {
            return ((StatefulParser) p).getWrappedParser();
        }
        return p;
    }

    /**
     * @return the PasswordProvider registered in the context, or {@code null} if none
     */
    public PasswordProvider getPasswordProvider() {
        return context.get(PasswordProvider.class);
    }

    /**
     * Returns the detector to use for embedded documents.  A Detector in the
     * context always wins; otherwise a {@link DefaultDetector} built from
     * {@link #getMimeTypes()} is created on first use and cached on this instance.
     *
     * @return the detector from the context, or the cached default detector
     */
    public Detector getDetector() {
        //be as lazy as possible and cache
        Detector localDetector = context.get(Detector.class);
        if (localDetector != null) {
            return localDetector;
        }
        if (detector == null) {
            detector = new DefaultDetector(getMimeTypes());
        }
        return detector;
    }

    /**
     * Returns the MimeTypes registry to use.  A MimeTypes in the context always
     * wins; otherwise the default registry is fetched on first use and cached
     * on this instance.
     *
     * @return the MimeTypes from the context, or the cached default MimeTypes
     */
    public MimeTypes getMimeTypes() {
        MimeTypes localMimeTypes = context.get(MimeTypes.class);
        //be as lazy as possible and cache the mimeTypes
        if (localMimeTypes != null) {
            return localMimeTypes;
        }
        if (mimeTypes == null) {
            mimeTypes = MimeTypes.getDefaultMimeTypes();
        }
        return mimeTypes;
    }

    /**
     * Determines a file extension for the given stream.  The content type in
     * the metadata is tried first; if it is missing or cannot be resolved, the
     * stream is run through {@link #getDetector()} and, on success, the detected
     * type is written back to {@link Metadata#CONTENT_TYPE}.
     *
     * @param is       the stream to detect on when the metadata is not sufficient
     * @param metadata the metadata to read the content type from; updated with the
     *                 detected type when detection was needed and succeeded
     * @return the extension reported by the resolved MimeType, or {@code ".bin"}
     *         if no MimeType could be resolved
     */
    public String getExtension(TikaInputStream is, Metadata metadata) {
        String mimeString = metadata.get(Metadata.CONTENT_TYPE);

        //use the buffered mimetypes as default
        MimeTypes localMimeTypes = getMimeTypes();

        MimeType mimeType = null;
        boolean detected = false;
        if (mimeString != null) {
            try {
                mimeType = localMimeTypes.forName(mimeString);
            } catch (MimeTypeException ignored) {
                //the declared content type could not be resolved; fall through to detection
            }
        }
        if (mimeType == null) {
            try {
                MediaType mediaType = getDetector().detect(is, metadata, context);
                mimeType = localMimeTypes.forName(mediaType.toString());
                detected = true;
                is.reset();
            } catch (IOException | MimeTypeException ignored) {
                //best effort: whatever was resolved before the failure (possibly nothing)
                //is used below
            }
        }
        if (mimeType != null) {
            if (detected) {
                //set or correct the mime type
                metadata.set(Metadata.CONTENT_TYPE, mimeType.toString());
            }
            return mimeType.getExtension();
        }
        return ".bin";
    }

    /**
     * Normalizes internal OCR routing media types (e.g., {@code image/ocr-png})
     * back to standard media types (e.g., {@code image/png}).
     * Returns the input unchanged if it is not an OCR routing type.
     *
     * @param mediaType the media type string
     * @return the normalized media type string, or the original if no normalization needed
     */
    public static String normalizeMediaType(String mediaType) {
        if (mediaType != null && mediaType.startsWith(OCR_IMAGE_PREFIX)) {
            return IMAGE_PREFIX + mediaType.substring(OCR_IMAGE_PREFIX.length());
        }
        return mediaType;
    }

    /**
     * Looks up the file extension for a given media type string.
     * The media type is first passed through {@link #normalizeMediaType(String)}
     * and then resolved against the default MimeTypes registry.
     *
     * @param mediaType the media type string (e.g., "image/png"); may be {@code null}
     * @return the extension including the dot (e.g., ".png"), or empty string if unknown,
     *         if {@code mediaType} is {@code null}, or if it cannot be resolved
     */
    public static String getExtensionForMediaType(String mediaType) {
        if (mediaType == null) {
            return "";
        }
        String normalized = normalizeMediaType(mediaType);
        try {
            MimeType mimeType = MimeTypes.getDefaultMimeTypes().forName(normalized);
            return mimeType.getExtension();
        } catch (MimeTypeException ignored) {
            //an unresolvable media type simply has no extension
            return "";
        }
    }

    /**
     * Type of embedded resource, used for generating canonical resource names.
     */
    public enum EmbeddedResourcePrefix {
        EMBEDDED("embedded"),
        IMAGE("image"),
        THUMBNAIL("thumbnail");

        private final String prefix;

        EmbeddedResourcePrefix(String prefix) {
            this.prefix = prefix;
        }

        /**
         * @return the string placed at the start of a generated resource name
         */
        public String getPrefix() {
            return prefix;
        }
    }

    /**
     * Generates a canonical resource name from a type, counter, and media type.
     * For example: {@code generateResourceName(EmbeddedResourcePrefix.EMBEDDED, 0, "image/png")}
     * returns {@code "embedded-0.png"}.
     *
     * @param type      the embedded resource type
     * @param count     the counter value
     * @param mediaType the media type string, or null if unknown
     * @return the generated resource name with extension
     */
    public static String generateResourceName(EmbeddedResourcePrefix type, int count,
                                               String mediaType) {
        return type.getPrefix() + "-" + count + getExtensionForMediaType(mediaType);
    }

    /**
     * Sets a generated resource name on the metadata and marks the extension as inferred.
     *
     * @param metadata  the metadata to update
     * @param type      the embedded resource type
     * @param count     the counter value
     * @param mediaType the media type string, or null if unknown
     */
    public static void setGeneratedResourceName(Metadata metadata, EmbeddedResourcePrefix type,
                                                 int count, String mediaType) {
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                generateResourceName(type, count, mediaType));
        metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
    }

    /**
     * Adds the filtered stack trace of the throwable to the metadata under
     * {@link TikaCoreProperties#TIKA_META_EXCEPTION_WARNING}.
     *
     * @param t the throwable to record
     * @param m the metadata to add the stack trace to
     */
    public static void recordException(Throwable t, Metadata m) {
        String ex = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex);
    }

    /**
     * Adds the filtered stack trace of the throwable to the metadata under
     * {@link TikaCoreProperties#TIKA_META_EXCEPTION_EMBEDDED_STREAM}.
     *
     * @param t the throwable to record
     * @param m the metadata to add the stack trace to
     */
    public static void recordEmbeddedStreamException(Throwable t, Metadata m) {
        String ex = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM, ex);
    }

    /**
     * Asks the extractor resolved at construction time whether the embedded
     * document described by the metadata should be parsed.
     *
     * @param m the metadata of the embedded document
     * @return the extractor's decision
     */
    public boolean shouldParseEmbedded(Metadata m) {
        return embeddedDocumentExtractor.shouldParseEmbedded(m);
    }

    /**
     * Hands the embedded stream to the extractor resolved at construction time,
     * together with this instance's parse context.
     *
     * @param tis        the embedded document stream
     * @param handler    the handler that receives the extractor's output
     * @param metadata   the metadata of the embedded document
     * @param outputHtml passed through unchanged to the extractor
     * @throws IOException  if the extractor fails to read the stream
     * @throws SAXException if the extractor fails to write to the handler
     */
    public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                              boolean outputHtml) throws IOException, SAXException {
        embeddedDocumentExtractor.parseEmbedded(tis, handler, metadata, context, outputHtml);
    }

    /**
     * Tries to find an existing parser within the ParseContext.
     * It looks inside of CompositeParsers and ParserDecorators.
     * The use case is when a parser needs to parse an internal stream
     * that is _part_ of the document, e.g. rtf body inside an msg.
     * <p>
     * Can return <code>null</code> if the context contains no parser or
     * the correct parser can't be found.
     *
     * @param clazz   parser class to search for; only an exact class match counts,
     *                subclasses do not
     * @param context the parse context whose Parser is searched
     * @return the parser whose class is exactly {@code clazz}, or {@code null}
     */
    public static Parser tryToFindExistingLeafParser(Class<?> clazz, ParseContext context) {
        Parser p = context.get(Parser.class);
        if (p == null) {
            return null;
        }
        if (isExactClass(p, clazz)) {
            return p;
        }
        if (p instanceof ParserDecorator) {
            //peel off the decorators; the result may still be a composite
            p = findInDecorated((ParserDecorator) p, clazz);
        }
        if (isExactClass(p, clazz)) {
            return p;
        }
        if (p instanceof CompositeParser) {
            //findInComposite only ever returns an exact match or null
            return findInComposite((CompositeParser) p, clazz, context);
        }
        return null;
    }

    /**
     * Unwraps a chain of decorators.  Stops early at the first wrapped parser
     * whose class is exactly {@code clazz}; otherwise returns the innermost
     * non-decorator parser, which the caller must still check.
     */
    private static Parser findInDecorated(ParserDecorator p, Class<?> clazz) {
        Parser candidate = p.getWrappedParser();
        if (isExactClass(candidate, clazz)) {
            return candidate;
        }
        if (candidate instanceof ParserDecorator) {
            candidate = findInDecorated((ParserDecorator) candidate, clazz);
        }
        return candidate;
    }

    /**
     * Searches the component parsers of a composite, descending into decorators
     * and nested composites.  Returns the first parser whose class is exactly
     * {@code clazz}, or {@code null} if there is none.
     */
    private static Parser findInComposite(CompositeParser p, Class<?> clazz,
                                          ParseContext context) {
        for (Parser candidate : p.getAllComponentParsers()) {
            if (isExactClass(candidate, clazz)) {
                return candidate;
            }
            if (candidate instanceof ParserDecorator) {
                candidate = findInDecorated((ParserDecorator) candidate, clazz);
            }
            if (isExactClass(candidate, clazz)) {
                return candidate;
            }
            if (candidate instanceof CompositeParser) {
                candidate = findInComposite((CompositeParser) candidate, clazz, context);
            }
            if (isExactClass(candidate, clazz)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Returns true only if the parser is non-null and its runtime class is
     * exactly {@code clazz}.
     */
    private static boolean isExactClass(Parser parser, Class<?> clazz) {
        if (parser == null) {
            return false;
        }
        return parser.getClass().equals(clazz);
    }
}