RecursiveParserWrapperTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.ClosedInputStream;
import org.apache.commons.io.input.ProxyInputStream;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
import org.apache.tika.config.EmbeddedLimits;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
public class RecursiveParserWrapperTest extends TikaTest {
@Test
public void testBasicXML() throws Exception {
List<Metadata> list = getMetadata(new Metadata(),
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
Metadata container = list.get(0);
String content = container.get(TikaCoreProperties.TIKA_CONTENT);
//not much differentiates html from xml in this test file
assertTrue(content.contains("<p class=\"header\" />"));
}
@Test
public void testBasicHTML() throws Exception {
List<Metadata> list = getMetadata(new Metadata(),
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
Metadata container = list.get(0);
String content = container.get(TikaCoreProperties.TIKA_CONTENT);
//not much differentiates html from xml in this test file
assertTrue(content.contains("<p class=\"header\"></p>"));
}
@Test
public void testBasicText() throws Exception {
List<Metadata> list = getMetadata(new Metadata(),
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
Metadata container = list.get(0);
String content = container.get(TikaCoreProperties.TIKA_CONTENT);
assertFalse(content.contains("<p "));
assertTrue(content.contains("embed_0"));
}
@Test
public void testIgnoreContent() throws Exception {
List<Metadata> list = getMetadata(new Metadata(),
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
Metadata container = list.get(0);
String content = container.get(TikaCoreProperties.TIKA_CONTENT);
assertNull(content);
}
@Test
public void testCharLimit() throws Exception {
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
70));
try (TikaInputStream tis =
getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
wrapper.parse(tis, handler, metadata, context);
}
List<Metadata> list = handler.getMetadataList();
assertEquals(2, list.size());
int wlr = 0;
for (Metadata m : list) {
String limitReached = m.get(TikaCoreProperties.WRITE_LIMIT_REACHED);
if (limitReached != null && limitReached.equals("true")) {
wlr++;
}
}
assertEquals(2, wlr);
}
@Test
public void testOne() throws Exception {
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
int writeLimit = 100;
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
writeLimit, false, context));
try (TikaInputStream tis = getResourceAsStream(
"/test-documents/test_recursive_embedded" + ".docx")) {
wrapper.parse(tis, handler, metadata, context);
}
List<Metadata> list = handler.getMetadataList();
assertEquals(12, list.size());
}
@Test
public void testTarball() throws Exception {
List<Metadata> list = getRecursiveMetadata("test-documents.tgz");
List<String> actualInternalPaths =
list.stream()
.map(m -> m.get(TikaCoreProperties.INTERNAL_PATH))
.collect(Collectors.toList());
List<String> expectedInternalPaths = Arrays.asList(null,
"test-documents/testEXCEL.xls",
"test-documents/testHTML.html",
"Thumbnails/thumbnail.png",
"Thumbnails/thumbnail.pdf",
"test-documents/testOpenOffice2.odt",
"test-documents/testPDF.pdf",
"test-documents/testPPT.ppt",
"test-documents/testRTF.rtf",
"test-documents/testTXT.txt",
"test-documents/testWORD.doc",
"test-documents/testXML.xml",
"test-documents.tar");
assertEquals(expectedInternalPaths, actualInternalPaths);
List<String> actualEmbeddedPaths =
list.stream()
.map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))
.collect(Collectors.toList());
assertEquals(Arrays.asList(null,
"/test-documents.tar/testEXCEL.xls",
"/test-documents.tar/testHTML.html",
"/test-documents.tar/testOpenOffice2.odt/thumbnail.png",
"/test-documents.tar/testOpenOffice2.odt/thumbnail.pdf",
"/test-documents.tar/testOpenOffice2.odt",
"/test-documents.tar/testPDF.pdf",
"/test-documents.tar/testPPT.ppt",
"/test-documents.tar/testRTF.rtf",
"/test-documents.tar/testTXT.txt",
"/test-documents.tar/testWORD.doc",
"/test-documents.tar/testXML.xml",
"/test-documents.tar"), actualEmbeddedPaths);
}
@Test
public void testCharLimitNoThrowOnWriteLimit() throws Exception {
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
int writeLimit = 510;
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
writeLimit, false, context));
try (TikaInputStream tis = getResourceAsStream("/test-documents/test_recursive_embedded" +
".docx")) {
wrapper.parse(tis, handler, metadata, context);
}
List<Metadata> list = handler.getMetadataList();
assertEquals(12, list.size());
assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
// Verify that content was extracted (the specific order of embedded documents
// may vary based on ZIP entry iteration order)
int totalContentLength = 0;
for (Metadata m : list) {
String content = m.get(TikaCoreProperties.TIKA_CONTENT);
if (content != null) {
totalContentLength += content.length();
}
}
// With a 510 char limit, we should have extracted some content but not unlimited
assertTrue(totalContentLength > 0, "Should have extracted some content");
assertTrue(totalContentLength <= writeLimit + 100,
"Total content length should be near the write limit");
}
@Test
public void testSpecificLimit() throws Exception {
int writeLimit = 60;
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
writeLimit, false, context));
try (TikaInputStream tis = getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")) {
wrapper.parse(tis, handler, metadata, context);
}
List<Metadata> list = handler.getMetadataList();
assertTrue(writeLimit >= getContentLength(list),
"writeLimit=" + writeLimit + " contentLength=" + getContentLength(list));
}
private int getContentLength(List<Metadata> metadataList) {
int sz = 0;
for (Metadata metadata : metadataList) {
String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
if (content != null) {
sz += content.length();
}
}
return sz;
}
@Test
public void testMaxEmbedded() throws Exception {
int maxEmbedded = 4;
int totalNoLimit = 12;//including outer container file
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
String limitReached = null;
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
//test default
try (TikaInputStream tis = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
wrapper.parse(tis, handler, metadata, context);
List<Metadata> list = handler.getMetadataList();
assertEquals(totalNoLimit, list.size());
limitReached = list.get(0)
.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
assertNull(limitReached);
}
//test setting value via EmbeddedLimits
metadata = new Metadata();
ParseContext limitContext = new ParseContext();
EmbeddedLimits limits = new EmbeddedLimits();
limits.setMaxCount(maxEmbedded);
limitContext.set(EmbeddedLimits.class, limits);
try (TikaInputStream tis = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
wrapper.parse(tis, handler, metadata, limitContext);
List<Metadata> list = handler.getMetadataList();
//add 1 for outer container file
assertEquals(maxEmbedded + 1, list.size());
limitReached = list.get(0)
.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
assertEquals("true", limitReached);
}
//test setting value < 0 (unlimited)
metadata = new Metadata();
try (TikaInputStream tis = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
wrapper.parse(tis, handler, metadata, context);
List<Metadata> list = handler.getMetadataList();
assertEquals(totalNoLimit, list.size());
limitReached = list.get(0)
.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
assertNull(limitReached);
}
}
@Test
public void testEmbeddedResourcePath() throws Exception {
Set<String> targets = new HashSet<>();
targets.add("/embed1.zip");
targets.add("/embed1.zip/embed2.zip");
targets.add("/embed1.zip/embed2.zip/embed3.zip");
targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
targets.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
targets.add("/embed1.zip/embed2.zip/embed2a.txt");
targets.add("/embed1.zip/embed2.zip/embed2b.txt");
targets.add("/embed1.zip/embed1b.txt");
targets.add("/embed1.zip/embed1a.txt");
targets.add("/image1.emf");
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
List<Metadata> list = getMetadata(metadata,
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
Metadata container = list.get(0);
String content = container.get(TikaCoreProperties.TIKA_CONTENT);
assertTrue(content.contains("<p class=\"header\" />"));
Set<String> seen = new HashSet<>();
for (Metadata m : list) {
String path = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
if (path != null) {
seen.add(path);
}
}
assertEquals(targets, seen);
}
@Test
public void testEmbeddedNPE() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
List<Metadata> list = getMetadata(metadata,
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
//default behavior (user doesn't specify whether or not to catch embedded exceptions
//is to catch the exception
assertEquals(13, list.size());
Metadata mockNPEMetadata = list.get(10);
assertContains("java.lang.NullPointerException",
mockNPEMetadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
list = getMetadata(metadata,
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
false, false);
//Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
//and just doesn't bother to report that there was an exception.
assertEquals(13, list.size());
}
@Test
public void testPrimaryExcWEmbedded() throws Exception {
//if embedded content is handled and then
//the parser hits an exception in the container document,
//that the first element of the returned list is the container document
//and the second is the embedded content
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
ParseContext context = new ParseContext();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER, true);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
String path = "/test-documents/mock/embedded_then_npe.xml";
TikaInputStream tis = null;
boolean npe = false;
try {
tis = getResourceAsStream(path);
wrapper.parse(tis, handler, metadata, context);
} catch (TikaException e) {
if (e.getCause().getClass().equals(NullPointerException.class)) {
npe = true;
}
} finally {
IOUtils.closeQuietly(tis);
}
assertTrue(npe, "npe");
List<Metadata> metadataList = handler.getMetadataList();
assertEquals(2, metadataList.size());
Metadata outerMetadata = metadataList.get(0);
Metadata embeddedMetadata = metadataList.get(1);
assertContains("main_content", outerMetadata.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("embedded_then_npe.xml",
outerMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
assertContains("some_embedded_content",
embeddedMetadata.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("embed1.xml", embeddedMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
}
@Test
public void testDigesters() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
List<Metadata> list = getMetadata(metadata,
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
true, true);
String md5Key = "X-TIKA:digest:MD5";
assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
//while we're at it, also test the embedded path id
assertEquals("/2/5/8/9", list.get(6).get(TikaCoreProperties.EMBEDDED_ID_PATH));
assertEquals("/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
list.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals(9, list.get(6).getInt(TikaCoreProperties.EMBEDDED_ID));
assertEquals(4, list.get(6).getInt(TikaCoreProperties.EMBEDDED_DEPTH));
}
@Test
public void testStreamClosedAfterSpill() throws Exception {
// When TikaInputStream spills to a temp file (via getPath()/getFile()),
// the source stream should be closed promptly since all bytes have been
// consumed and cached - there's no reason to keep it open.
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER, true);
String path = "/test-documents/test_recursive_embedded.docx";
ContentHandlerFactory contentHandlerFactory =
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(contentHandlerFactory);
try (CloseCountingInputStream stream =
new CloseCountingInputStream(getResourceAsStream(path))) {
TikaInputStream tis = TikaInputStream.get(stream);
tis.setCloseShield();
wrapper.parse(tis, handler, metadata, context);
// Source stream should not be closed after spilling to file
assertEquals(0, stream.counter);
tis.removeCloseShield();
tis.close();
}
}
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory,
boolean catchEmbeddedExceptions,
boolean digest) throws Exception {
ParseContext context;
Parser wrapped;
if (digest) {
TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-md5-digest.json");
wrapped = loader.loadAutoDetectParser();
context = loader.loadParseContext();
} else {
wrapped = AUTO_DETECT_PARSER;
context = new ParseContext();
}
RecursiveParserWrapper wrapper =
new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if (path == null) {
path = "/test-documents/test_recursive_embedded.docx";
} else {
path = "/test-documents/" + path;
}
TikaInputStream tis = null;
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(contentHandlerFactory);
try {
tis = TikaInputStream.get(getResourceAsUri(path));
wrapper.parse(tis, handler, metadata, context);
} finally {
IOUtils.closeQuietly(tis);
}
return handler.getMetadataList();
}
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory)
throws Exception {
return getMetadata(metadata, contentHandlerFactory, true, false);
}
private static class CloseCountingInputStream extends ProxyInputStream {
int counter = 0;
public CloseCountingInputStream(InputStream in) {
super(in);
}
/**
* Replaces the underlying input stream with a {@link ClosedInputStream}
* sentinel. The original input stream will remain open, but this proxy
* will appear closed.
*/
@Override
public void close() throws IOException {
in.close();
counter++;
}
}
}