// AutoDetectParserConfigTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
/**
 * Tests parsers obtained from {@link TikaLoader#loadAutoDetectParser()} for a set of JSON test
 * configurations resolved through {@link TikaLoaderHelper}.
 * <p>
 * Each test loads one configuration, parses a test document and asserts on the resulting XML
 * or on the per-document {@link Metadata} list.
 */
public class AutoDetectParserConfigTest extends TikaTest {

    /** Test document parsed by most of the tests in this class. */
    private static final String PPTX_WITH_EMBEDDED_PDF = "testPPT_EmbeddedPDF.pptx";

    /** Metadata key under which the MD5 digest is asserted. */
    private static final String MD5_KEY = "X-TIKA:digest:MD5";

    /** SHA256 with BASE32 encoding includes the encoding in the key. */
    private static final String SHA256_BASE32_KEY = "X-TIKA:digest:SHA256:BASE32";

    /**
     * Fails with a readable message, rather than an {@link IndexOutOfBoundsException}, when
     * a metadata list is shorter than the indices a test is about to read.
     *
     * @param metadataList list returned by one of the {@code getRecursiveMetadata} helpers
     * @param minSize smallest list size that makes the following {@code get(...)} calls valid
     */
    private static void assertMinSize(List<Metadata> metadataList, int minSize) {
        assertTrue(metadataList.size() >= minSize,
                "expected at least " + minSize + " metadata objects but found "
                        + metadataList.size());
    }

    /**
     * Parses the same document with two configurations and checks that the
     * {@code <h1>image3.jpg</h1>} heading is absent from the XML with the "no-names"
     * configuration and present with the "with-names" configuration.
     */
    @Test
    public void testConfiguringEmbeddedDocExtractor() throws Exception {
        TikaLoader noNamesLoader = TikaLoaderHelper.getLoader("tika-config-no-names.json");
        Parser noNamesParser = noNamesLoader.loadAutoDetectParser();
        ParseContext noNamesContext = noNamesLoader.loadParseContext();
        String noNamesXml =
                getXML(PPTX_WITH_EMBEDDED_PDF, noNamesParser, new Metadata(), noNamesContext).xml;
        assertNotContained("<h1>image3.jpg</h1>", noNamesXml);

        TikaLoader withNamesLoader = TikaLoaderHelper.getLoader("tika-config-with-names.json");
        Parser withNamesParser = withNamesLoader.loadAutoDetectParser();
        ParseContext withNamesContext = withNamesLoader.loadParseContext();
        String withNamesXml =
                getXML(PPTX_WITH_EMBEDDED_PDF, withNamesParser, new Metadata(),
                        withNamesContext).xml;
        assertContains("<h1>image3.jpg</h1>", withNamesXml);
    }

    /**
     * With the "upcasing" handler-decorator configuration, the content stored for the
     * metadata entries at indices 4 and 5 must contain the expected upper-case phrases.
     */
    @Test
    public void testContentHandlerDecoratorFactory() throws Exception {
        Parser p = TikaLoaderHelper
                .getLoader("tika-config-upcasing-custom-handler-decorator.json")
                .loadAutoDetectParser();
        List<Metadata> metadataList = getRecursiveMetadata(PPTX_WITH_EMBEDDED_PDF, p);
        assertMinSize(metadataList, 6);

        Metadata pdfMetadata1 = metadataList.get(4);
        assertContains("APACHE TIKA", pdfMetadata1.get(TikaCoreProperties.TIKA_CONTENT));
        Metadata pdfMetadata2 = metadataList.get(5);
        assertContains("HELLO WORLD", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * With the "doubling" handler-decorator configuration, checks how often selected phrases
     * occur in the content of the metadata entries at indices 0, 4 and 5.
     */
    @Test
    public void testRecursiveContentHandlerDecoratorFactory() throws Exception {
        Parser p = TikaLoaderHelper
                .getLoader("tika-config-doubling-custom-handler-decorator.json")
                .loadAutoDetectParser();
        List<Metadata> metadataList = getRecursiveMetadata(PPTX_WITH_EMBEDDED_PDF, p);
        assertMinSize(metadataList, 6);

        assertContainsCount("IMAGE2.EMF",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 2);
        assertContainsCount("15.9.2007 11:02",
                metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT), 2);
        assertContainsCount("HELLO WORLD",
                metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT), 4);
    }

    /**
     * Test to make sure that the decorator is only applied once for legacy
     * (e.g. not RecursiveParserWrapperHandler) parsing.
     */
    @Test
    public void testXMLContentHandlerDecoratorFactory() throws Exception {
        Parser p = TikaLoaderHelper
                .getLoader("tika-config-doubling-custom-handler-decorator.json")
                .loadAutoDetectParser();
        String txt = getXML(PPTX_WITH_EMBEDDED_PDF, p).xml;
        assertContainsCount("THE APACHE TIKA PROJECT WAS FORMALLY", txt, 2);
        assertContainsCount("15.9.2007 11:02", txt, 2);
    }

    /**
     * With the write-filter configuration, every metadata key of every returned document
     * must either start with one of the allowed prefixes or be {@code dc:creator}.
     */
    @Test
    public void testWriteFilter() throws Exception {
        TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-write-filter.json");
        Parser p = loader.loadAutoDetectParser();
        ParseContext parseContext = loader.loadParseContext();
        Metadata metadata = Metadata.newInstance(parseContext);
        List<Metadata> metadataList =
                getRecursiveMetadata(PPTX_WITH_EMBEDDED_PDF, p, metadata, parseContext, true);
        for (Metadata m : metadataList) {
            for (String k : m.names()) {
                boolean allowed = k.startsWith("X-TIKA:")
                        || k.startsWith("access_permission:")
                        || k.startsWith("Content-")
                        || k.equals("dc:creator");
                assertTrue(allowed, "unexpected key: " + k);
            }
        }
    }

    /**
     * With the digests configuration, both the first metadata entry (index 0) and the entry
     * at index 6 must carry the expected SHA256/BASE32 and MD5 digests.
     */
    @Test
    public void testDigests() throws Exception {
        TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests.json");
        Parser p = loader.loadAutoDetectParser();
        ParseContext context = loader.loadParseContext();
        List<Metadata> metadataList = getRecursiveMetadata(PPTX_WITH_EMBEDDED_PDF, p, context);
        assertMinSize(metadataList, 7);

        Metadata first = metadataList.get(0);
        assertEquals("SO67W5OGGMOFPMFQTHTNL5YU5EQXWPMNEPU7HKOZX2ULHRQICRZA====",
                first.get(SHA256_BASE32_KEY));
        assertEquals("a16f14215ebbfa47bd995e799f03cb18", first.get(MD5_KEY));

        Metadata seventh = metadataList.get(6);
        assertEquals("Q7D3RFV6DNGZ4BQIS6UKNWX4CDIKPIGDU2D7ADBUDVOBYSZHF7FQ====",
                seventh.get(SHA256_BASE32_KEY));
        assertEquals("90a8b249a6d6b6cb127c59e01cef3aaa", seventh.get(MD5_KEY));
    }

    /**
     * With the "skip-container" digests configuration, the first metadata entry (index 0)
     * must have no digests while the entry at index 6 still carries the expected ones.
     */
    @Test
    public void testDigestsSkipContainer() throws Exception {
        TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json");
        Parser p = loader.loadAutoDetectParser();
        ParseContext context = loader.loadParseContext();
        List<Metadata> metadataList = getRecursiveMetadata(PPTX_WITH_EMBEDDED_PDF, p, context);
        assertMinSize(metadataList, 7);

        Metadata first = metadataList.get(0);
        assertNull(first.get(SHA256_BASE32_KEY));
        assertNull(first.get(MD5_KEY));

        Metadata seventh = metadataList.get(6);
        assertEquals("Q7D3RFV6DNGZ4BQIS6UKNWX4CDIKPIGDU2D7ADBUDVOBYSZHF7FQ====",
                seventh.get(SHA256_BASE32_KEY));
        assertEquals("90a8b249a6d6b6cb127c59e01cef3aaa", seventh.get(MD5_KEY));
    }

    /**
     * TIKA-3939 -- ensure that digesting happens even with EmptyParser.
     */
    @Test
    public void testDigestsEmptyParser() throws Exception {
        TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests-pdf-only.json");
        Parser p = loader.loadAutoDetectParser();
        ParseContext context = loader.loadParseContext();
        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p, context);
        assertEquals(1, metadataList.size());

        Metadata only = metadataList.get(0);
        assertEquals("4ef0d3bdb12ba603f4caf7d2e2c6112e", only.get(MD5_KEY));
        assertEquals("org.apache.tika.parser.EmptyParser", only.get("X-TIKA:Parsed-By"));
    }

    /**
     * Parses a freshly created, empty temporary file and expects the MD5 digest of empty
     * input together with a content length of {@code "0"}.
     */
    @Test
    public void testContainerZeroBytes() throws Exception {
        Path tmp = Files.createTempFile("tika-test", "");
        try {
            TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests.json");
            Parser p = loader.loadAutoDetectParser();
            ParseContext context = loader.loadParseContext();
            List<Metadata> metadataList = getRecursiveMetadata(tmp, p, context, true);
            assertMinSize(metadataList, 1);

            Metadata container = metadataList.get(0);
            assertEquals("d41d8cd98f00b204e9800998ecf8427e", container.get(MD5_KEY));
            assertEquals("0", container.get(Metadata.CONTENT_LENGTH));
        } finally {
            // deleteIfExists: a missing temp file must not replace a real test failure
            Files.deleteIfExists(tmp);
        }
    }
}