// ParsingEmbeddedDocumentExtractor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.extractor;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EmbeddedLimitReachedException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DelegatingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParseRecord;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.SAXOutputConfig;
/**
 * Helper class for parsers of package archives or other compound document
 * formats that support embedded or attached component documents.
 * <p>
 * Embedded documents are handed to a shared {@link DelegatingParser}. Before
 * parsing, limits carried by a {@link ParseRecord} in the {@link ParseContext}
 * (if one is present) are enforced, and optional {@link DocumentSelector} /
 * {@link FilenameFilter} instances in the context are consulted.
 *
 * @since Apache Tika 0.8
 */
public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {

    /** Placeholder directory handed to {@link FilenameFilter#accept(File, String)}. */
    private static final File ABSTRACT_PATH = new File("");

    /** Shared parser used for every embedded document. */
    private static final Parser DELEGATING_PARSER = new DelegatingParser();

    /** Context supplied at construction; source of the ParseRecord, selectors and config. */
    protected final ParseContext context;

    /**
     * Creates an extractor bound to the given parse context.
     *
     * @param context the context that is queried for limits, selectors and output
     *                configuration, and that is passed to the delegating parser
     */
    public ParsingEmbeddedDocumentExtractor(ParseContext context) {
        this.context = context;
    }

    /**
     * Decides whether an embedded document should be parsed.
     * <p>
     * The checks run in this order, and the first one that applies decides:
     * <ol>
     *   <li>limits from the {@link ParseRecord} in the context, if any;</li>
     *   <li>a {@link DocumentSelector} in the context, if any (its answer is final);</li>
     *   <li>a {@link FilenameFilter} in the context, if any and if the metadata
     *       carries a resource name.</li>
     * </ol>
     * If none applies, the document is parsed.
     *
     * @param metadata metadata of the embedded document
     * @return {@code true} if the embedded document should be parsed
     */
    @Override
    public boolean shouldParseEmbedded(Metadata metadata) {
        // Check ParseRecord for depth/count limits first
        ParseRecord parseRecord = context.get(ParseRecord.class);
        if (parseRecord != null && !checkEmbeddedLimits(parseRecord)) {
            return false;
        }

        // Then check DocumentSelector for content-based filtering
        DocumentSelector selector = context.get(DocumentSelector.class);
        if (selector != null) {
            return selector.select(metadata);
        }

        // Then check FilenameFilter; without a resource name there is nothing to filter on
        FilenameFilter filter = context.get(FilenameFilter.class);
        if (filter != null) {
            String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
            if (name != null) {
                return filter.accept(ABSTRACT_PATH, name);
            }
        }
        return true;
    }

    /**
     * Checks embedded document limits from ParseRecord.
     * <p>
     * If throwOnMaxDepth or throwOnMaxCount is configured and the respective limit is hit,
     * an EmbeddedLimitReachedException is thrown. Otherwise, returns false and sets the
     * appropriate limit flag on the ParseRecord.
     * <p>
     * Note: The count limit is a hard stop (once hit, no more embedded docs are parsed).
     * The depth limit only affects documents at that depth - sibling documents at
     * shallower depths will still be parsed.
     * <p>
     * A negative maximum (count or depth) disables the corresponding check.
     * <p>
     * Subclasses that override parseEmbedded() should call this method to enforce limits.
     *
     * @param parseRecord the parse record to check
     * @return true if the embedded document should be parsed, false if limits are exceeded
     * @throws EmbeddedLimitReachedException if a limit is exceeded and throwing is configured
     */
    protected boolean checkEmbeddedLimits(ParseRecord parseRecord) {
        // Count limit is a hard stop - once we've hit max, no more embedded parsing
        if (parseRecord.isEmbeddedCountLimitReached()) {
            return false;
        }
        int maxCount = parseRecord.getMaxEmbeddedCount();
        if (maxCount >= 0 && parseRecord.getEmbeddedCount() >= maxCount) {
            parseRecord.setEmbeddedCountLimitReached(true);
            if (parseRecord.isThrowOnMaxCount()) {
                throw new EmbeddedLimitReachedException(
                        EmbeddedLimitReachedException.LimitType.MAX_COUNT, maxCount);
            }
            return false;
        }

        // Depth limit only applies to current depth - siblings at shallower levels
        // can still be parsed. The flag is set for reporting purposes.
        // depth is 1-indexed (main doc is depth 1), so embedded depth limit of N
        // means we allow parsing up to depth N+1
        int maxDepth = parseRecord.getMaxEmbeddedDepth();
        // Compute N+1 in long arithmetic: with maxDepth == Integer.MAX_VALUE an int
        // addition would wrap to Integer.MIN_VALUE and reject every document.
        long deepestAllowed = (long) maxDepth + 1L;
        if (maxDepth >= 0 && parseRecord.getDepth() > deepestAllowed) {
            parseRecord.setEmbeddedDepthLimitReached(true);
            if (parseRecord.isThrowOnMaxDepth()) {
                throw new EmbeddedLimitReachedException(
                        EmbeddedLimitReachedException.LimitType.MAX_DEPTH, maxDepth);
            }
            return false;
        }
        return true;
    }

    /**
     * Parses an embedded document with the delegating parser, optionally wrapping the
     * output in a {@code <div class="package-entry">} element and emitting the resource
     * name as an {@code <h1>}.
     * <p>
     * Limits from the {@link ParseRecord} are enforced here as well, so they hold even
     * for callers that did not call {@link #shouldParseEmbedded(Metadata)}. When a limit
     * is exceeded (and throwing is not configured) the method returns without writing
     * anything to the handler.
     * <p>
     * {@link EncryptedDocumentException} and other {@link TikaException}s raised by the
     * delegate are recorded via {@link #recordException(Exception, ParseContext)} and
     * parsing of the parent continues; a {@link CorruptedFileException} is rethrown
     * wrapped in an {@link IOException}.
     *
     * @param tis          stream of the embedded document
     * @param handler      handler receiving the SAX events
     * @param metadata     metadata of the embedded document
     * @param parseContext NOTE(review): currently not read; the context given to the
     *                     constructor is used instead - confirm this is intended
     * @param outputHtml   whether to emit the wrapping div and the file name heading
     * @throws SAXException if the handler fails
     * @throws IOException  if reading fails or the embedded file is reported as corrupted
     */
    @Override
    public void parseEmbedded(
            TikaInputStream tis, ContentHandler handler, Metadata metadata,
            ParseContext parseContext, boolean outputHtml)
            throws SAXException, IOException {
        // Check and enforce embedded limits even if caller didn't call shouldParseEmbedded()
        // This guarantees limits are enforced for all callers
        ParseRecord parseRecord = context.get(ParseRecord.class);
        if (parseRecord != null) {
            if (!checkEmbeddedLimits(parseRecord)) {
                return;
            }
            // Increment embedded count for tracking
            parseRecord.incrementEmbeddedCount();
        }

        if (outputHtml) {
            AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
            handler.startElement(XHTML, "div", "div", attributes);
        }

        String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        if (isWriteFileNameToContent() && name != null && !name.isEmpty() && outputHtml) {
            handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
            char[] chars = name.toCharArray();
            handler.characters(chars, 0, chars.length);
            handler.endElement(XHTML, "h1", "h1");
        }

        // Use the delegate parser to parse this entry
        try {
            // presumably keeps the delegate from closing the caller's stream - confirm
            // against TikaInputStream
            tis.setCloseShield();
            DELEGATING_PARSER.parse(tis,
                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
                    metadata, context);
        } catch (EncryptedDocumentException ede) {
            recordException(ede, context);
        } catch (CorruptedFileException e) {
            //necessary to stop the parse to avoid infinite loops
            //on corrupt sqlite3 files
            throw new IOException(e);
        } catch (TikaException e) {
            recordException(e, context);
        } finally {
            tis.removeCloseShield();
        }

        if (outputHtml) {
            handler.endElement(XHTML, "div", "div");
        }
    }

    /**
     * Adds the exception to the {@link ParseRecord} found in the given context.
     * Does nothing when the context holds no ParseRecord.
     *
     * @param e       the exception raised while parsing an embedded document
     * @param context the context to look the ParseRecord up in
     */
    protected void recordException(Exception e, ParseContext context) {
        ParseRecord record = context.get(ParseRecord.class);
        if (record == null) {
            return;
        }
        record.addException(e);
    }

    /**
     * Returns the parser used for embedded documents.
     *
     * @return the shared delegating parser
     */
    public Parser getDelegatingParser() {
        return DELEGATING_PARSER;
    }

    /**
     * Returns whether to write file names to content based on {@link SAXOutputConfig}
     * in the ParseContext. Defaults to {@code true} if no config is present.
     *
     * @return true if file names should be written to content
     */
    public boolean isWriteFileNameToContent() {
        SAXOutputConfig config = context.get(SAXOutputConfig.class);
        return config == null || config.isWriteFileNameToContent();
    }
}