UnpackExtractor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.core.extractor;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
import org.apache.tika.extractor.EmbeddedStreamTranslator;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.extractor.UnpackHandler;
import org.apache.tika.extractor.UnpackSelector;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParseRecord;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
/**
* Embedded document extractor that parses and unpacks embedded documents,
* extracting both text/metadata and raw bytes.
*
* @since Apache Tika 3.0.0
*/
public class UnpackExtractor extends ParsingEmbeddedDocumentExtractor {
private static final Logger LOGGER =
LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
private static final File ABSTRACT_PATH = new File("");
private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator();
private long bytesExtracted = 0;
private final long maxEmbeddedBytesForExtraction;
public UnpackExtractor(ParseContext context) {
super(context);
// Get maxUnpackBytes from UnpackConfig, defaulting to 10GB if not configured
// or using Long.MAX_VALUE if set to -1 (unlimited)
UnpackConfig unpackConfig = context.get(UnpackConfig.class);
if (unpackConfig != null) {
long configuredMax = unpackConfig.getMaxUnpackBytes();
// -1 means no limit (use Long.MAX_VALUE)
this.maxEmbeddedBytesForExtraction = configuredMax >= 0 ? configuredMax : Long.MAX_VALUE;
} else {
this.maxEmbeddedBytesForExtraction = UnpackConfig.DEFAULT_MAX_UNPACK_BYTES;
}
}
@Override
public void parseEmbedded(
TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext parseContext, boolean outputHtml)
throws SAXException, IOException {
// Check and enforce embedded limits even if caller didn't call shouldParseEmbedded()
// This guarantees limits are enforced for all callers
ParseRecord parseRecord = context.get(ParseRecord.class);
if (parseRecord != null && !checkEmbeddedLimits(parseRecord)) {
return;
}
// Increment embedded count for tracking (needed for EmbeddedLimits)
if (parseRecord != null) {
parseRecord.incrementEmbeddedCount();
}
if (outputHtml) {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
handler.startElement(XHTML, "div", "div", attributes);
}
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if (isWriteFileNameToContent() && name != null && name.length() > 0 && outputHtml) {
handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
char[] chars = name.toCharArray();
handler.characters(chars, 0, chars.length);
handler.endElement(XHTML, "h1", "h1");
}
// Use the delegate parser to parse this entry
try {
UnpackHandler bytesHandler = context.get(UnpackHandler.class);
tis.setCloseShield();
if (bytesHandler != null) {
parseWithBytes(tis, handler, metadata);
} else {
parse(tis, handler, metadata);
}
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
} catch (CorruptedFileException e) {
//necessary to stop the parse to avoid infinite loops
//on corrupt sqlite3 files
throw new IOException(e);
} catch (TikaException e) {
recordException(e, context);
} finally {
tis.removeCloseShield();
}
if (outputHtml) {
handler.endElement(XHTML, "div", "div");
}
}
private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException {
//trigger spool to disk
Path rawBytes = tis.getPath();
//There may be a "translated" path for OLE2 etc
Path translated = null;
try {
//translate the stream or not
if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
translated = Files.createTempFile("tika-tmp-", ".bin");
try (OutputStream os = Files.newOutputStream(translated)) {
embeddedStreamTranslator.translate(tis, metadata, os);
}
}
parse(tis, handler, metadata);
} finally {
try {
if (translated != null) {
storeEmbeddedBytes(translated, metadata);
} else {
storeEmbeddedBytes(rawBytes, metadata);
}
} finally {
if (translated != null) {
Files.delete(translated);
}
}
}
}
private void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata)
throws TikaException, IOException, SAXException {
getDelegatingParser().parse(tis,
new EmbeddedContentHandler(new BodyContentHandler(handler)),
metadata, context);
}
private void storeEmbeddedBytes(Path p, Metadata metadata) {
if (p == null) {
return;
}
// Get UnpackSelector from ParseContext - if configured, use it to filter
// If no selector configured, accept all embedded documents
UnpackSelector selector = context.get(UnpackSelector.class);
if (selector != null && !selector.select(metadata)) {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("skipping embedded bytes {} <-> {}",
metadata.get(Metadata.CONTENT_TYPE),
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}
return;
}
UnpackHandler unpackHandler =
context.get(UnpackHandler.class);
int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
try (InputStream is = Files.newInputStream(p)) {
if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
throw new IOException("Bytes extracted (" + bytesExtracted +
") >= max allowed (" + maxEmbeddedBytesForExtraction + ")");
}
long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) {
unpackHandler.add(id, metadata, boundedIs);
bytesExtracted += boundedIs.getPos();
if (boundedIs.hasHitBound()) {
throw new IOException("Bytes extracted (" + bytesExtracted +
") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " +
"bytes");
}
}
} catch (IOException e) {
LOGGER.warn("problem writing out embedded bytes", e);
}
}
}