EmbeddedDocumentUtil.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.extractor;
import java.io.IOException;
import java.io.Serializable;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.StatefulParser;
import org.apache.tika.utils.ExceptionUtils;
/**
 * Utility class to handle common issues with embedded documents.
 * <p>
 * Use statically if all that is needed is getting the EmbeddedDocumentExtractor.
 * Otherwise, instantiate an instance.
 * <p>
 * Note: This is not thread safe. Make sure to instantiate one per thread.
 */
public class EmbeddedDocumentUtil implements Serializable {

    /** Prefix of the internal OCR routing image types, e.g. {@code image/ocr-png}. */
    private static final String OCR_IMAGE_PREFIX = "image/ocr-";

    private final ParseContext context;
    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
    //these are lazily initialized and can be null
    private MimeTypes mimeTypes;
    private Detector detector;

    /**
     * Creates a utility bound to the given context. The embedded document extractor is
     * resolved immediately via {@link #getEmbeddedDocumentExtractor(ParseContext)}, which
     * may add a parser and an extractor to the context.
     *
     * @param context the parse context used for all lookups made by this instance
     */
    public EmbeddedDocumentUtil(ParseContext context) {
        this.context = context;
        this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
    }

    /**
     * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
     * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
     * embedded documents if no Parser.class is specified in the ParseContext.
     * <p>
     * If you'd prefer not to parse embedded documents, set Parser.class
     * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
     * <p>
     * Side effects: when the context has no extractor, a new
     * {@link ParsingEmbeddedDocumentExtractor} is created and stored in the context, and
     * an {@link AutoDetectParser} is stored under Parser.class if none was present.
     *
     * @param context the parse context to read from (and, if needed, populate)
     * @return the extractor already in the context, or the newly created one
     */
    public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
        EmbeddedDocumentExtractor existing = context.get(EmbeddedDocumentExtractor.class);
        if (existing != null) {
            return existing;
        }
        //ensure that an AutoDetectParser is
        //available for parsing embedded docs TIKA-2096
        if (context.get(Parser.class) == null) {
            context.set(Parser.class, new AutoDetectParser());
        }
        EmbeddedDocumentExtractor created = new ParsingEmbeddedDocumentExtractor(context);
        context.set(EmbeddedDocumentExtractor.class, created);
        return created;
    }

    /**
     * Utility function to get the Parser that was sent in to the
     * ParseContext to handle embedded documents. If it is stateful,
     * unwrap it to get its stateless delegating parser.
     * <p>
     * If there is no Parser in the parser context, this will return null.
     *
     * @param context the parse context to look the parser up in
     * @return the context's parser, unwrapped once if it is a {@link StatefulParser},
     *         or {@code null} if the context holds no parser
     */
    public static Parser getStatelessParser(ParseContext context) {
        Parser p = context.get(Parser.class);
        if (p == null) {
            return null;
        }
        if (p instanceof StatefulParser) {
            return ((StatefulParser) p).getWrappedParser();
        }
        return p;
    }

    /**
     * @return the PasswordProvider stored in this instance's context, or whatever the
     *         context returns when none is set
     */
    public PasswordProvider getPasswordProvider() {
        return context.get(PasswordProvider.class);
    }

    /**
     * Returns the detector to use: the one in the context if present, otherwise a
     * lazily created and cached {@link DefaultDetector} built from {@link #getMimeTypes()}.
     *
     * @return a detector, never {@code null}
     */
    public Detector getDetector() {
        //be as lazy as possible and cache
        Detector fromContext = context.get(Detector.class);
        if (fromContext != null) {
            return fromContext;
        }
        if (detector == null) {
            detector = new DefaultDetector(getMimeTypes());
        }
        return detector;
    }

    /**
     * Returns the MimeTypes registry to use: the one in the context if present, otherwise
     * the lazily fetched and cached result of {@link MimeTypes#getDefaultMimeTypes()}.
     *
     * @return the mime types registry
     */
    public MimeTypes getMimeTypes() {
        MimeTypes fromContext = context.get(MimeTypes.class);
        //be as lazy as possible and cache the mimeTypes
        if (fromContext != null) {
            return fromContext;
        }
        if (mimeTypes == null) {
            mimeTypes = MimeTypes.getDefaultMimeTypes();
        }
        return mimeTypes;
    }

    /**
     * Works out a file extension for an embedded stream.
     * <p>
     * The Content-Type already present in the metadata is tried first. If it is missing
     * or cannot be resolved, the detector is run against the stream, the stream is reset,
     * and the detected type is written back to the metadata's Content-Type.
     *
     * @param is       the stream to run detection on when the metadata has no usable type
     * @param metadata the metadata to read the Content-Type from; updated when detection ran
     * @return the extension reported by the resolved mime type, or {@code ".bin"} if no
     *         mime type could be resolved
     */
    public String getExtension(TikaInputStream is, Metadata metadata) {
        String mimeString = metadata.get(Metadata.CONTENT_TYPE);

        //use the buffered mimetypes as default
        MimeTypes localMimeTypes = getMimeTypes();

        MimeType mimeType = null;
        boolean detected = false;
        if (mimeString != null) {
            try {
                mimeType = localMimeTypes.forName(mimeString);
            } catch (MimeTypeException ignored) {
                //best effort: an unusable declared type falls through to detection
            }
        }
        if (mimeType == null) {
            try {
                MediaType mediaType = getDetector().detect(is, metadata, context);
                mimeType = localMimeTypes.forName(mediaType.toString());
                //flag is set before reset() so a failing reset still records the type
                detected = true;
                is.reset();
            } catch (IOException | MimeTypeException ignored) {
                //best effort: fall back to whatever was resolved so far (possibly nothing)
            }
        }
        if (mimeType == null) {
            return ".bin";
        }
        if (detected) {
            //set or correct the mime type
            metadata.set(Metadata.CONTENT_TYPE, mimeType.toString());
        }
        return mimeType.getExtension();
    }

    /**
     * Normalizes internal OCR routing media types (e.g., {@code image/ocr-png})
     * back to standard media types (e.g., {@code image/png}).
     * Returns the input unchanged if it is not an OCR routing type.
     *
     * @param mediaType the media type string, may be {@code null}
     * @return the normalized media type string, or the original (including {@code null})
     *         if no normalization is needed
     */
    public static String normalizeMediaType(String mediaType) {
        if (mediaType != null && mediaType.startsWith(OCR_IMAGE_PREFIX)) {
            return "image/" + mediaType.substring(OCR_IMAGE_PREFIX.length());
        }
        return mediaType;
    }

    /**
     * Looks up the file extension for a given media type string. OCR routing types are
     * first normalized with {@link #normalizeMediaType(String)}, and the lookup uses the
     * default MimeTypes registry.
     *
     * @param mediaType the media type string (e.g., "image/png"), may be {@code null}
     * @return the extension including the dot (e.g., ".png"), or empty string if the
     *         media type is {@code null} or cannot be resolved
     */
    public static String getExtensionForMediaType(String mediaType) {
        if (mediaType == null) {
            return "";
        }
        String normalized = normalizeMediaType(mediaType);
        try {
            return MimeTypes.getDefaultMimeTypes().forName(normalized).getExtension();
        } catch (MimeTypeException ignored) {
            //an unparseable media type simply has no known extension
            return "";
        }
    }

    /**
     * Type of embedded resource, used for generating canonical resource names.
     */
    public enum EmbeddedResourcePrefix {
        EMBEDDED("embedded"),
        IMAGE("image"),
        THUMBNAIL("thumbnail");

        private final String prefix;

        EmbeddedResourcePrefix(String prefix) {
            this.prefix = prefix;
        }

        /**
         * @return the string placed at the start of generated resource names
         */
        public String getPrefix() {
            return prefix;
        }
    }

    /**
     * Generates a canonical resource name from a type, counter, and media type.
     * For example: {@code generateResourceName(EmbeddedResourcePrefix.EMBEDDED, 0, "image/png")}
     * returns {@code "embedded-0.png"}.
     *
     * @param type      the embedded resource type
     * @param count     the counter value
     * @param mediaType the media type string, or null if unknown
     * @return the generated resource name with extension
     */
    public static String generateResourceName(EmbeddedResourcePrefix type, int count,
                                              String mediaType) {
        return type.getPrefix() + "-" + count + getExtensionForMediaType(mediaType);
    }

    /**
     * Sets a generated resource name on the metadata and marks the extension as inferred.
     *
     * @param metadata  the metadata to update
     * @param type      the embedded resource type
     * @param count     the counter value
     * @param mediaType the media type string, or null if unknown
     */
    public static void setGeneratedResourceName(Metadata metadata, EmbeddedResourcePrefix type,
                                                int count, String mediaType) {
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                generateResourceName(type, count, mediaType));
        metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
    }

    /**
     * Adds the filtered stack trace of {@code t} to the metadata as an exception warning.
     *
     * @param t the throwable to record
     * @param m the metadata to add the stack trace to
     */
    public static void recordException(Throwable t, Metadata m) {
        String ex = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex);
    }

    /**
     * Adds the filtered stack trace of {@code t} to the metadata as an embedded stream
     * exception.
     *
     * @param t the throwable to record
     * @param m the metadata to add the stack trace to
     */
    public static void recordEmbeddedStreamException(Throwable t, Metadata m) {
        String ex = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM, ex);
    }

    /**
     * Delegates to the extractor resolved at construction time.
     *
     * @param m the metadata of the embedded document
     * @return the extractor's decision on whether the embedded document should be parsed
     */
    public boolean shouldParseEmbedded(Metadata m) {
        return embeddedDocumentExtractor.shouldParseEmbedded(m);
    }

    /**
     * Delegates parsing of an embedded document to the extractor resolved at construction
     * time, passing along this instance's context.
     *
     * @param tis        the embedded document stream
     * @param handler    the content handler to receive the output
     * @param metadata   the metadata of the embedded document
     * @param outputHtml passed through to the extractor unchanged
     * @throws IOException  if the extractor reports an I/O failure
     * @throws SAXException if the extractor reports a SAX failure
     */
    public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                              boolean outputHtml) throws IOException, SAXException {
        embeddedDocumentExtractor.parseEmbedded(tis, handler, metadata, context, outputHtml);
    }

    /**
     * Tries to find an existing parser within the ParseContext.
     * It looks inside of CompositeParsers and ParserDecorators.
     * The use case is when a parser needs to parse an internal stream
     * that is _part_ of the document, e.g. rtf body inside an msg.
     * <p>
     * Can return <code>null</code> if the context contains no parser or
     * the correct parser can't be found.
     *
     * @param clazz   parser class to search for; matched by exact class, not by subtype
     * @param context the parse context whose Parser.class entry is searched
     * @return the first parser found whose class is exactly {@code clazz}, or {@code null}
     */
    public static Parser tryToFindExistingLeafParser(Class<?> clazz, ParseContext context) {
        Parser p = context.get(Parser.class);
        if (p == null) {
            return null;
        }
        if (isExactly(p, clazz)) {
            return p;
        }
        if (p instanceof ParserDecorator) {
            //peel off every decorator layer (or stop early at a match)
            p = findInDecorated((ParserDecorator) p, clazz);
        }
        if (isExactly(p, clazz)) {
            return p;
        }
        if (p instanceof CompositeParser) {
            //findInComposite only ever returns an exact match or null
            return findInComposite((CompositeParser) p, clazz);
        }
        return null;
    }

    /**
     * Unwraps nested decorators. Returns the first wrapped parser whose class is exactly
     * {@code clazz}; otherwise returns the innermost non-decorator parser (which may not
     * match and may be {@code null}), so callers must re-check the result.
     */
    private static Parser findInDecorated(ParserDecorator p, Class<?> clazz) {
        Parser candidate = p.getWrappedParser();
        if (isExactly(candidate, clazz)) {
            return candidate;
        }
        if (candidate instanceof ParserDecorator) {
            candidate = findInDecorated((ParserDecorator) candidate, clazz);
        }
        return candidate;
    }

    /**
     * Searches the component parsers of a composite, looking through decorators and
     * nested composites. Returns an exact match for {@code clazz} or {@code null}.
     */
    private static Parser findInComposite(CompositeParser p, Class<?> clazz) {
        for (Parser candidate : p.getAllComponentParsers()) {
            if (isExactly(candidate, clazz)) {
                return candidate;
            }
            if (candidate instanceof ParserDecorator) {
                candidate = findInDecorated((ParserDecorator) candidate, clazz);
            }
            if (isExactly(candidate, clazz)) {
                return candidate;
            }
            if (candidate instanceof CompositeParser) {
                candidate = findInComposite((CompositeParser) candidate, clazz);
            }
            if (isExactly(candidate, clazz)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Null-safe check that the parser's runtime class is exactly {@code clazz}.
     */
    private static boolean isExactly(Parser parser, Class<?> clazz) {
        if (parser == null) {
            return false;
        }
        return parser.getClass().equals(clazz);
    }
}