TikaCLITest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.cli;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.net.URI;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.serialization.JsonMetadataList;
import org.apache.tika.utils.StringUtils;
/**
* Tests for Tika's command-line interface ({@link TikaCLI}).
*/
public class TikaCLITest {
// Logger used by the unpack/extract helpers to dump the file names they found.
private static final Logger LOG = LoggerFactory.getLogger(TikaCLITest.class);
// Directory with the sample documents (path is relative to the working directory).
static final File TEST_DATA_FILE = new File("src/test/resources/test-data");
// Directory with the Tika configuration files passed to the CLI via --config.
static final File CONFIGS_DIR = new File("src/test/resources/configs");
// URI form of TEST_DATA_FILE; setUp() derives resourcePrefix from it.
private final URI testDataURI = TEST_DATA_FILE.toURI();
// Output directory handed to the -z / -Z extraction tests.
@TempDir
private Path extractDir;
/* Test members */
// Buffers that capture what is written to System.out / System.err (installed by resetContent()).
private ByteArrayOutputStream outContent = null;
private ByteArrayOutputStream errContent = null;
// The original System.out / System.err, saved in setUp() and restored in tearDown().
private PrintStream stdout = null;
private PrintStream stderr = null;
// String form of testDataURI; tests append a document name to it.
private String resourcePrefix;
/**
 * Asserts that {@code p} exists, is not a directory and has a size greater than zero.
 *
 * @param p        path of the file that is expected to have been extracted
 * @param allFiles description of the available files, used in the "not found" message
 * @throws IOException if the size of the file cannot be read
 */
protected static void assertExtracted(Path p, String allFiles) throws IOException {
    final Path name = p.getFileName();
    final String label = "File " + name;
    assertTrue(Files.exists(p), label + " not found in " + allFiles);
    assertFalse(Files.isDirectory(p), label + " is a directory!");
    final long size = Files.size(p);
    assertTrue(size > 0, label + " wasn't extracted with contents");
}
/**
 * Prepares the fixture before every test:
 * remembers the current System.out and System.err so that tearDown() can restore them,
 * points resourcePrefix at the test-data directory
 * and lets resetContent() install the capturing streams.
 */
@BeforeEach
public void setUp() throws Exception {
    this.stdout = System.out;
    this.stderr = System.err;
    this.resourcePrefix = testDataURI.toString();
    resetContent();
}
/**
 * Restores the System.out and System.err streams that were saved in setUp().
 */
@AfterEach
public void tearDown() {
    System.setErr(this.stderr);
    System.setOut(this.stdout);
}
/**
 * Gives each capture buffer a fresh start when needed.
 * A buffer is replaced (and re-installed as System.out / System.err through a
 * UTF-8, auto-flushing PrintStream) when it does not exist yet or already holds data;
 * an existing empty buffer is left in place.
 */
private void resetContent() throws Exception {
    final boolean staleOut = outContent == null || outContent.size() > 0;
    final boolean staleErr = errContent == null || errContent.size() > 0;
    if (staleOut) {
        final ByteArrayOutputStream freshOut = new ByteArrayOutputStream();
        outContent = freshOut;
        System.setOut(new PrintStream(freshOut, true, UTF_8.name()));
    }
    if (staleErr) {
        final ByteArrayOutputStream freshErr = new ByteArrayOutputStream();
        errContent = freshErr;
        System.setErr(new PrintStream(freshErr, true, UTF_8.name()));
    }
}
/**
 * Checks that both spellings of the option, --list-parser-detail and
 * --list-parser-details, print the OpenDocument "text-web" media type.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testListParserDetail() throws Exception {
    final String expectedType = "application/vnd.oasis.opendocument.text-web";
    final String[] options = {"--list-parser-detail", "--list-parser-details"};
    for (String option : options) {
        final String listing = getParamOutContent(option);
        assertTrue(listing.contains(expectedType));
    }
}
/**
 * Checks that the --list-parser output names the iWork package parser class.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testListParsers() throws Exception {
    final String listing = getParamOutContent("--list-parser");
    final boolean mentionsIWork =
            listing.contains("org.apache.tika.parser.iwork.IWorkPackageParser");
    assertTrue(mentionsIWork);
}
/**
 * Runs the CLI with -x on alice.cli.test and checks for the XML declaration;
 * then repeats the run with --digest=sha256 and checks for the SHA256 digest meta element.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testXMLOutput() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";

    final String plainXml = getParamOutContent("-x", document);
    assertTrue(plainXml.contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));

    final String digestedXml = getParamOutContent("-x", "--digest=sha256", document);
    assertTrue(digestedXml.contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
}
/**
 * Runs the CLI with -h on alice.cli.test and checks for the XHTML namespace and an
 * expanded (non self-closing) title element; then repeats the run with --digest=sha384
 * and checks for the SHA384 digest meta element.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testHTMLOutput() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";

    final String plainHtml = getParamOutContent("-h", document);
    assertTrue(plainHtml.contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
    assertTrue(plainHtml.contains("<title></title>"), "Expanded <title></title> element should be present");

    final String digestedHtml = getParamOutContent("-h", "--digest=sha384", document);
    assertTrue(digestedHtml.contains("<meta name=\"X-TIKA:digest:SHA384\" content=\"c69ea023f5da95a026"));
}
/**
 * Runs the CLI with -t on alice.cli.test and checks that a known phrase of the
 * document shows up in the output.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testTextOutput() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";
    final String text = getParamOutContent("-t", document);
    assertTrue(text.contains("finished off the cake"));
}
/**
 * Tests the -A option of the cli: the output for testJsonMultipleInts.html
 * must contain the title text and the word "body".
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testContentAllOutput() throws Exception {
    // Use the shared helper like every other output test instead of driving
    // TikaCLI.main and decoding the capture buffer by hand.
    String out = getParamOutContent("-A", resourcePrefix + "testJsonMultipleInts.html");
    assertTrue(out.contains("this is a title"), "Title text missing from -A output");
    assertTrue(out.contains("body"), "\"body\" missing from -A output");
}
/**
 * Runs the CLI with -m on alice.cli.test and checks that text/plain is reported;
 * then repeats the run with --digest=SHA512 and additionally checks the digest line.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testMetadataOutput() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";

    final String plainMetadata = getParamOutContent("-m", document);
    assertTrue(plainMetadata.contains("text/plain"));

    final String digestedMetadata = getParamOutContent("-m", "--digest=SHA512", document);
    assertTrue(digestedMetadata.contains("text/plain"));
    assertTrue(digestedMetadata.contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
}
/**
 * Basic test of the --json option, combined with --digest=MD2, on
 * testJsonMultipleInts.html.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testJsonMetadataOutput() throws Exception {
    final String document = resourcePrefix + "testJsonMultipleInts.html";
    final String output = getParamOutContent("--json", "--digest=MD2", document);
    //TIKA-1310
    final boolean hasAdmins = output.contains("\"html:fb:admins\":\"1,2,3,4\",");
    assertTrue(hasAdmins);
    final boolean hasDigest = output.contains("\"X-TIKA:digest:MD2\":");
    assertTrue(hasDigest);
}
/**
 * Checks the pretty-printed JSON obtained with "--json -r": the parser chain must be
 * listed and the keys must appear in alphabetic order
 * (Content-Encoding before dc:title before fb:admins).
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testJsonMetadataPrettyPrintOutput() throws Exception {
    final String output = getParamOutContent("--json", "-r", resourcePrefix + "testJsonMultipleInts.html");
    assertTrue(output.contains("org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser"));

    //test pretty-print alphabetic sort of keys
    final int encodingAt = output.indexOf("\"Content-Encoding\"");
    final int adminsAt = output.indexOf("fb:admins");
    final int titleAt = output.indexOf("\"dc:title\"");
    assertTrue(encodingAt > -1 && adminsAt > -1 && encodingAt < adminsAt);
    assertTrue(titleAt > -1 && adminsAt > -1 && titleAt < adminsAt);
}
/**
 * Runs the CLI with -J on testPDF_incrementalUpdates.pdf and checks that the output
 * reports an incremental update count of 2 and an embedded resource of type VERSION.
 */
@Test
public void testDefaultPDFIncrementalUpdateSettings() throws Exception {
    final String document = resourcePrefix + "testPDF_incrementalUpdates.pdf";
    final String output = getParamOutContent("-J", document);
    assertTrue(output.contains("pdf:incrementalUpdateCount\":\"2\""));
    assertTrue(output.contains("embeddedResourceType\":\"VERSION\""));
}
/**
 * Runs the CLI with -J on testPDFPackage.pdf and checks that the output contains the
 * PDActionJavaScript marker as well as the strings MACRO and NAMES_TREE.
 */
@Test
public void testExtractJavascript() throws Exception {
    final String output = getParamOutContent("-J", resourcePrefix + "testPDFPackage.pdf");
    final String[] expectedSnippets = {
            "type=\\\"PDActionJavaScript\\\"",
            "MACRO",
            "NAMES_TREE"};
    for (String snippet : expectedSnippets) {
        assertTrue(output.contains(snippet));
    }
}
/**
 * Runs the CLI with -J on testPPT_macros.ppt and checks that the output mentions
 * MACRO and Module1.
 */
@Test
public void testMacros() throws Exception {
    final String document = resourcePrefix + "testPPT_macros.ppt";
    final String output = getParamOutContent("-J", document);
    assertTrue(output.contains("MACRO"));
    assertTrue(output.contains("Module1"));
}
/**
 * Recursively unpacks testPDFPackage.pdf and checks for the JSON metadata file and
 * the numbered embedded files.
 */
@Test
public void testRUnpack() throws Exception {
    //TODO -- rework this to use two separate emitters
    //one for bytes and one for json
    // TODO: the .bin extension of 00000001.bin may be wrong
    // (noted in the author's local file unpack-discussion/mime-todo.txt)
    final String[] expected = {
            "testPDFPackage.pdf.json",
            //the first two test that the default single file config is working
            "testPDFPackage.pdf-embed/00000001.bin",
            "testPDFPackage.pdf-embed/00000002.jpg",
            "testPDFPackage.pdf-embed/00000003.pdf",
            "testPDFPackage.pdf-embed/00000004.pdf"};
    // 2 = top-level entries of the extract directory (the .json file and the -embed directory)
    final int topLevelEntries = 2;
    testRecursiveUnpack("testPDFPackage.pdf", expected, topLevelEntries);
}
/**
 * Recursively unpacks testPST.pst, checks the extracted files and then verifies that
 * every metadata object in the generated JSON file carries non-blank content.
 */
@Test
public void testPSTRUnpack() throws Exception {
    // TODO: The .bin extensions for embedded .msg files are wrong - they should be .msg
    // CONTENT_TYPE is not being set for embedded documents
    // (noted in the author's local file unpack-discussion/mime-todo.txt)
    final String[] expected = {
            "testPST.pst.json",
            "testPST.pst-embed/00000007.bin",
            "testPST.pst-embed/00000001.bin",
            "testPST.pst-embed/00000008.bin",
            "testPST.pst-embed/00000004.bin",
            "testPST.pst-embed/00000003.bin",
            "testPST.pst-embed/00000002.bin",
            "testPST.pst-embed/00000005.bin",
            "testPST.pst-embed/00000009.docx",
            "testPST.pst-embed/00000006.bin"};
    testRecursiveUnpack("testPST.pst", expected, 2);

    final Path metadataJson = extractDir.resolve("testPST.pst.json");
    try (Reader in = Files.newBufferedReader(metadataJson)) {
        final List<Metadata> entries = JsonMetadataList.fromJson(in);
        for (Metadata entry : entries) {
            final String text = entry.get(TikaCoreProperties.TIKA_CONTENT);
            assertFalse(StringUtils.isBlank(text));
        }
    }
}
/**
 * Runs the CLI with -l on alice.cli.test and checks that the output contains "en".
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testLanguageOutput() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";
    final String language = getParamOutContent("-l", document);
    assertTrue(language.contains("en"));
}
/**
 * Runs the CLI with -d on alice.cli.test and checks that text/plain is reported.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testDetectOutput() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";
    final String detected = getParamOutContent("-d", document);
    assertTrue(detected.contains("text/plain"));
}
/**
 * Runs the CLI with --list-met-models followed by alice.cli.test and checks that
 * the output contains text/plain.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testListMetModels() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";
    final String output = getParamOutContent("--list-met-models", document);
    assertTrue(output.contains("text/plain"));
}
/**
 * Runs the CLI with --list-supported-types followed by alice.cli.test and checks that
 * at least one type is listed with application/octet-stream as its supertype.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testListSupportedTypes() throws Exception {
    final String document = resourcePrefix + "alice.cli.test";
    final String output = getParamOutContent("--list-supported-types", document);
    assertTrue(output.contains("supertype: application/octet-stream"));
}
/**
 * Extracts coffee.xls with -z: 9 files are expected in total, of which the JSON
 * metadata file and three embedded files are checked by name.
 */
@Test
public void testExtractSimple() throws Exception {
    // New pipes-based output format: json metadata + embedded files in subdirectory
    final String[] expected = {
            "coffee.xls.json",
            "coffee.xls-embed/00000001.emf",
            "coffee.xls-embed/00000006.cdx",
            "coffee.xls-embed/00000005.png"};
    final int totalFiles = 9;
    testExtract("/coffee.xls", expected, totalFiles);
}
/**
 * Extracts testZip_absolutePath.zip with -z: 3 files are expected in total, of which
 * the JSON metadata file and the first numbered embedded file are checked by name.
 */
@Test
public void testExtractAbsolute() throws Exception {
    // New pipes format: json metadata + embedded files in subdirectory with numbered names
    final String[] expected = {
            "testZip_absolutePath.zip.json",
            "testZip_absolutePath.zip-embed/00000001.bin"};
    final int totalFiles = 3;
    testExtract("testZip_absolutePath.zip", expected, totalFiles);
}
/**
 * Extracts testZip_relative.zip with -z: 2 files are expected in total and the JSON
 * metadata file is checked by name.
 */
@Test
public void testExtractRelative() throws Exception {
    // New pipes format
    final String[] expected = {"testZip_relative.zip.json"};
    final int totalFiles = 2;
    testExtract("testZip_relative.zip", expected, totalFiles);
}
/**
 * Extracts testZip_overlappingNames.zip with -z: 3 files are expected in total and the
 * JSON metadata file is checked by name.
 */
@Test
public void testExtractOverlapping() throws Exception {
    // New pipes format - overlapping names are handled by numbering
    final String[] expected = {"testZip_overlappingNames.zip.json"};
    final int totalFiles = 3;
    testExtract("testZip_overlappingNames.zip", expected, totalFiles);
}
/**
 * Extracts testZip_zeroByte.zip with -z: 2 files are expected in total and the JSON
 * metadata file is checked by name.
 */
@Test
public void testExtract0x00() throws Exception {
    // New pipes format
    final String[] expected = {"testZip_zeroByte.zip.json"};
    final int totalFiles = 2;
    testExtract("testZip_zeroByte.zip", expected, totalFiles);
}
/**
 * Convenience overload that uses the number of expected names as the expected count.
 * NOTE(review): the three-argument variant compares that count with the number of
 * top-level entries of the extract directory, so this default only fits when every
 * expected name is a top-level entry - confirm before using it.
 *
 * @param targetFile                name of the document under the test-data directory
 * @param expectedChildrenFileNames relative paths that must be present after unpacking
 */
private void testRecursiveUnpack(String targetFile, String[] expectedChildrenFileNames) throws Exception {
    final int expectedCount = expectedChildrenFileNames.length;
    testRecursiveUnpack(targetFile, expectedChildrenFileNames, expectedCount);
}
/**
 * Runs the CLI in -Z mode on a test document, writing into the per-test extract
 * directory, and checks the result.
 * <p>
 * {@code expectedLength} is compared with the number of <em>top-level</em> entries
 * (files and directories, not recursive) of the extract directory, whereas the
 * expected names are matched against the recursive listing of regular files.
 *
 * @param targetFile                name of the document under the test-data directory
 * @param expectedChildrenFileNames relative paths (with '/' separators) that must be present
 * @param expectedLength            expected number of top-level entries in the extract directory
 * @throws Exception if the CLI run or the directory walk fails
 */
private void testRecursiveUnpack(String targetFile, String[] expectedChildrenFileNames, int expectedLength) throws Exception {
    Path input = Paths.get(new URI(resourcePrefix + "/" + targetFile));
    Path pluginsDir = Paths.get("target/plugins");
    String[] params = {"-Z",
            "-p", pluginsDir.toAbsolutePath().toString(),
            input.toAbsolutePath().toString(),
            extractDir.toAbsolutePath().toString()};
    TikaCLI.main(params);

    Set<String> fileNames = getFileNames(extractDir);
    // File.list() returns null when the directory cannot be listed.
    String[] topLevelEntries = extractDir.toFile().list();
    assertNotNull(topLevelEntries, "Could not list extract directory " + extractDir);

    // Debug: log actual files found
    LOG.info("=== Actual files found ===");
    for (String name : fileNames) {
        LOG.info(" {}", name);
    }
    LOG.info("=== End actual files ===");

    assertEquals(expectedLength, topLevelEntries.length,
            "Unexpected number of top-level entries [" + String.join(", ", topLevelEntries)
                    + "]; all extracted files: " + fileNames);
    for (String expectedChildName : expectedChildrenFileNames) {
        assertContainsFile(fileNames, expectedChildName);
    }
}
/**
 * Walks {@code extractDir} recursively and collects every visited file as a path
 * relative to {@code extractDir}, with backslashes replaced by forward slashes.
 * Directories themselves are not collected, and files that cannot be visited are skipped.
 *
 * @param extractDir root of the tree to walk
 * @return the relative file names found below {@code extractDir}
 * @throws IOException if walking the tree fails
 */
private Set<String> getFileNames(Path extractDir) throws IOException {
    final Set<String> relativeNames = new HashSet<>();
    final FileVisitor<Path> collector = new FileVisitor<Path>() {
        @Override
        public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            final String relative = extractDir.relativize(file).toString();
            relativeNames.add(relative.replace('\\', '/'));
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult visitFileFailed(Path file, IOException failure) throws IOException {
            // deliberately ignored: unreadable entries are simply left out of the result
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult postVisitDirectory(Path dir, IOException failure) throws IOException {
            return FileVisitResult.CONTINUE;
        }
    };
    Files.walkFileTree(extractDir, collector);
    return relativeNames;
}
/**
 * Extensions (lower case, including the dot) that assertContainsFile treats as images.
 */
private static final Set<String> IMAGE_EXTENSIONS = Set.of(".jpg", ".jpeg", ".png",
        ".gif", ".bmp", ".tiff", ".tif", ".jp2");

/**
 * Asserts that {@code fileNames} contains {@code expected}.
 * <p>
 * When tesseract is available, image types get an "ocr-" prefix (e.g., image/ocr-jpeg)
 * which has no registered extension, so extracted files fall back to ".bin".
 * For an expected name with an image extension, the same name with a ".bin"
 * extension is therefore accepted as well.
 *
 * @param fileNames relative names of the files that were actually extracted
 * @param expected  relative name that must be present
 */
private void assertContainsFile(Set<String> fileNames, String expected) {
    if (fileNames.contains(expected)) {
        return;
    }
    // From here on the exact name is known to be absent.
    // Check if this is an image file that might have .bin extension due to OCR
    int dotIndex = expected.lastIndexOf('.');
    if (dotIndex > 0) {
        String ext = expected.substring(dotIndex).toLowerCase(java.util.Locale.ROOT);
        if (IMAGE_EXTENSIONS.contains(ext)) {
            String binVariant = expected.substring(0, dotIndex) + ".bin";
            // Only the ".bin" variant can still satisfy the expectation.
            assertTrue(fileNames.contains(binVariant),
                    "Expected " + expected + " or " + binVariant + " in " + fileNames);
            return;
        }
    }
    // No fallback applies: this assertion fails and reports the files that were found.
    assertTrue(fileNames.contains(expected), "Expected " + expected + " in " + fileNames);
}
/**
 * Convenience overload that expects exactly as many extracted files as there are
 * expected names.
 *
 * @param targetFile                name of the document under the test-data directory
 * @param expectedChildrenFileNames relative paths that must be present after extraction
 */
private void testExtract(String targetFile, String[] expectedChildrenFileNames) throws Exception {
    final int expectedCount = expectedChildrenFileNames.length;
    testExtract(targetFile, expectedChildrenFileNames, expectedCount);
}
/**
 * Runs the CLI in -z mode on a test document, writing into the per-test extract
 * directory, then checks the total number of extracted files (counted recursively)
 * and the presence of each expected name.
 *
 * @param targetFile                name of the document under the test-data directory
 * @param expectedChildrenFileNames relative paths (with '/' separators) that must be present
 * @param expectedLength            expected total number of files below the extract directory
 * @throws Exception if the CLI run or the directory walk fails
 */
private void testExtract(String targetFile, String[] expectedChildrenFileNames, int expectedLength) throws Exception {
    final Path source = Paths.get(new URI(resourcePrefix + "/" + targetFile));
    final Path plugins = Paths.get("target/plugins");
    final String[] cliArgs = new String[4 + 0];
    cliArgs[0] = "-z";
    cliArgs[1] = "-p";
    cliArgs[2] = plugins.toAbsolutePath().toString();
    cliArgs[3] = source.toAbsolutePath().toString();
    final String destination = extractDir.toAbsolutePath().toString();
    TikaCLI.main(new String[]{cliArgs[0], cliArgs[1], cliArgs[2], cliArgs[3], destination});

    final Set<String> extracted = getFileNames(extractDir);
    // Debug: log actual files found
    LOG.info("=== Actual files found for -z ===");
    extracted.forEach(name -> LOG.info(" {}", name));
    LOG.info("=== End actual files ===");

    assertEquals(expectedLength, extracted.size());
    for (int i = 0; i < expectedChildrenFileNames.length; i++) {
        assertContainsFile(extracted, expectedChildrenFileNames[i]);
    }
}
/**
 * Extracts test-documents.tgz with -z and checks that at least one file was written
 * to the extract directory.
 */
@Test
public void testExtractTgz() throws Exception {
    //TIKA-2564
    final Path source = Paths.get(new URI(resourcePrefix + "/test-documents.tgz"));
    final Path plugins = Paths.get("target/plugins");
    final String pluginsArg = plugins.toAbsolutePath().toString();
    final String sourceArg = source.toAbsolutePath().toString();
    final String destinationArg = extractDir.toAbsolutePath().toString();
    TikaCLI.main(new String[]{"-z", "-p", pluginsArg, sourceArg, destinationArg});

    final Set<String> extracted = getFileNames(extractDir);
    assertTrue(extracted.size() > 0, "Should have extracted some files");
}
/**
 * TIKA-920: every value of the multi-valued sheetNames entry must be printed by -m
 * for testMultipleSheets.numbers.
 */
@Test
public void testMultiValuedMetadata() throws Exception {
    final String metadata = getParamOutContent("-m", resourcePrefix + "testMultipleSheets.numbers");
    final String[] expectedLines = {
            "sheetNames: Checking",
            "sheetNames: Secon sheet",
            "sheetNames: Logical Sheet 3",
            "sheetNames: Sheet 4"};
    for (String line : expectedLines) {
        assertTrue(metadata.contains(line));
    }
}
/**
 * TIKA-1031: extracts testWithSubdirs.zip with -z and checks that a .json file and
 * at least one file below an "-embed/" directory were written.
 */
@Test
public void testZipWithSubdirs() throws Exception {
    final Path source = Paths.get(new URI(resourcePrefix + "/testWithSubdirs.zip"));
    final Path plugins = Paths.get("target/plugins");
    final String pluginsArg = plugins.toAbsolutePath().toString();
    final String sourceArg = source.toAbsolutePath().toString();
    final String destinationArg = extractDir.toAbsolutePath().toString();
    TikaCLI.main(new String[]{"-z", "-p", pluginsArg, sourceArg, destinationArg});

    final Set<String> extracted = getFileNames(extractDir);
    // Async mode creates: .json metadata file + -embed/ directory with extracted bytes
    boolean hasJson = false;
    boolean hasEmbedded = false;
    for (String name : extracted) {
        hasJson |= name.endsWith(".json");
        hasEmbedded |= name.contains("-embed/");
    }
    assertTrue(hasJson, "Should have a .json metadata file, got: " + extracted);
    assertTrue(hasEmbedded,
            "Should have extracted embedded files in -embed/ directory, got: " + extracted);
}
/**
 * Extracts testPDF_childAttachments.pdf with -z and checks that a .json file plus at
 * least one more file were written to the extract directory.
 */
@Test
public void testExtractInlineImages() throws Exception {
    final Path source = Paths.get(new URI(resourcePrefix + "/testPDF_childAttachments.pdf"));
    final Path plugins = Paths.get("target/plugins");
    final String pluginsArg = plugins.toAbsolutePath().toString();
    final String sourceArg = source.toAbsolutePath().toString();
    final String destinationArg = extractDir.toAbsolutePath().toString();
    TikaCLI.main(new String[]{"-z", "-p", pluginsArg, sourceArg, destinationArg});

    final Set<String> extracted = getFileNames(extractDir);
    // New pipes format: should have json plus embedded files in subdirectory
    boolean hasJson = false;
    for (String name : extracted) {
        if (name.endsWith(".json")) {
            hasJson = true;
            break;
        }
    }
    assertTrue(hasJson, "Should have a .json metadata file in " + extracted);
    final int count = extracted.size();
    assertTrue(count >= 2,
            "Should have at least 2 files (json + embedded), got " + count + ": " + extracted);
}
/**
 * Test that the --extract-dir option sets the output directory for -z (shallow)
 * extraction; the -Z (recursive) case is covered by testExtractDirOptionRecursive().
 */
@Test
public void testExtractDirOption() throws Exception {
    final Path source = Paths.get(new URI(resourcePrefix + "/test_recursive_embedded.docx"));
    final Path plugins = Paths.get("target/plugins");
    // Test with -z (shallow extraction)
    final String extractDirArg = "--extract-dir=" + extractDir.toAbsolutePath();
    final String pluginsArg = plugins.toAbsolutePath().toString();
    final String sourceArg = source.toAbsolutePath().toString();
    TikaCLI.main(new String[]{"-z", extractDirArg, "-p", pluginsArg, sourceArg});

    final Set<String> extracted = getFileNames(extractDir);
    // Should have extracted files in the specified directory, not current dir
    boolean hasJson = false;
    boolean hasEmbedded = false;
    for (String name : extracted) {
        hasJson |= name.endsWith(".json");
        hasEmbedded |= name.contains("-embed/");
    }
    assertTrue(hasJson, "Should have a .json metadata file in extractDir, got: " + extracted);
    assertTrue(hasEmbedded, "Should have extracted embedded files in extractDir, got: " + extracted);
}
/**
 * Test that the --extract-dir option sets the output directory for -Z (recursive)
 * extraction.
 */
@Test
public void testExtractDirOptionRecursive() throws Exception {
    final Path source = Paths.get(new URI(resourcePrefix + "/test_recursive_embedded.docx"));
    final Path plugins = Paths.get("target/plugins");
    // Test with -Z (recursive extraction)
    final String extractDirArg = "--extract-dir=" + extractDir.toAbsolutePath();
    final String pluginsArg = plugins.toAbsolutePath().toString();
    final String sourceArg = source.toAbsolutePath().toString();
    TikaCLI.main(new String[]{"-Z", extractDirArg, "-p", pluginsArg, sourceArg});

    final Set<String> extracted = getFileNames(extractDir);
    // Should have extracted files in the specified directory
    boolean hasJson = false;
    boolean hasEmbedded = false;
    for (String name : extracted) {
        hasJson |= name.endsWith(".json");
        hasEmbedded |= name.contains("-embed/");
    }
    assertTrue(hasJson, "Should have a .json metadata file in extractDir, got: " + extracted);
    assertTrue(hasEmbedded, "Should have extracted embedded files in extractDir, got: " + extracted);
}
/**
 * Verifies that running the CLI on bad_xml.xml with the default configuration lets
 * a {@link TikaException} escape from {@code TikaCLI.main}.
 * <p>
 * This and testConfig() are broken into separate tests so that setUp() and
 * tearDown() are called each time.
 *
 * @throws Exception kept for consistency with the other tests; not expected here
 */
@Test
public void testDefaultConfigException() throws Exception {
    String[] params = {resourcePrefix + "bad_xml.xml"};
    //default xml parser will throw TikaException
    assertThrows(TikaException.class, () -> TikaCLI.main(params));
}
/**
 * Runs the CLI on bad_xml.xml with tika-config1.json from the configs directory and
 * checks that the output contains "apple" and names the JSoupParser class.
 */
@Test
public void testConfig() throws Exception {
    final String configArg = "--config=" + CONFIGS_DIR.toString() + "/tika-config1.json";
    final String document = resourcePrefix + "bad_xml.xml";
    final String output = getParamOutContent(configArg, document);
    assertTrue(output.contains("apple"));
    assertTrue(output.contains("org.apache.tika.parser.html.JSoupParser"));
}
/**
 * Runs the CLI with "-m -J -r" on test_recursive_embedded.docx: the expected metadata
 * entries must be present and no X-TIKA:content key may appear.
 */
@Test
public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
    final String document = resourcePrefix + "test_recursive_embedded.docx";
    final String output = getParamOutContent("-m", "-J", "-r", document);
    final String[] expectedEntries = {
            "\"extended-properties:AppVersion\" : \"15.0000\",",
            "\"extended-properties:Application\" : \"Microsoft Office Word\",",
            "\"X-TIKA:embedded_resource_path\" : \"/embed1.zip\""};
    for (String entry : expectedEntries) {
        assertTrue(output.contains(entry));
    }
    assertFalse(output.contains("X-TIKA:content"));
}
/**
 * Runs the CLI with "-J -r" on test_recursive_embedded.docx and checks that the
 * X-TIKA:content value starts with an XHTML html element.
 */
@Test
public void testJsonRecursiveMetadataParserDefault() throws Exception {
    final String document = resourcePrefix + "test_recursive_embedded.docx";
    final String output = getParamOutContent("-J", "-r", document);
    final boolean hasXhtmlContent =
            output.contains("\"X-TIKA:content\" : \"<html xmlns=\\\"http://www.w3.org/1999/xhtml");
    assertTrue(hasXhtmlContent);
}
/**
 * Runs the CLI with "-J -r -t" on test_recursive_embedded.docx and checks that the
 * text of embed_4 and embed_0 appears with escaped newlines in the JSON output.
 */
@Test
public void testJsonRecursiveMetadataParserText() throws Exception {
    final String document = resourcePrefix + "test_recursive_embedded.docx";
    final String output = getParamOutContent("-J", "-r", "-t", document);
    assertTrue(output.contains("\\n\\nembed_4\\n"));
    assertTrue(output.contains("\\n\\nembed_0"));
}
/**
 * Runs the CLI with "-J -r -t --digest=md5" on test_recursive_embedded.docx and checks
 * that two specific MD5 digests are present in the JSON output.
 */
@Test
public void testDigestInJson() throws Exception {
    final String document = resourcePrefix + "test_recursive_embedded.docx";
    final String output = getParamOutContent("-J", "-r", "-t", "--digest=md5", document);
    assertTrue(output.contains("\"X-TIKA:digest:MD5\" : \"59f626e09a8c16ab6dbc2800c685f772\","));
    assertTrue(output.contains("\"X-TIKA:digest:MD5\" : \"f9627095ef86c482e61d99f0cc1cf87d\""));
}
/**
 * Currently disabled. Checks the output of --dump-static-config and
 * --dump-current-config for a few detector and parser elements.
 */
@Test
@Disabled("until we re-implement serialization")
public void testConfigSerializationStaticAndCurrent() throws Exception {
    final String staticConfig = getParamOutContent("--dump-static-config");
    //make sure at least one detector is there
    assertTrue(staticConfig.contains("<detector class=\"org.apache.tika.detect.microsoft.POIFSContainerDetector\"/>"));
    //make sure Executable is there because follow on tests of custom config
    //test that it has been turned off.
    assertTrue(staticConfig.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));

    final String currentConfig = getParamOutContent("--dump-current-config");
    //make sure at least one detector is there
    assertTrue(currentConfig.contains("<detector class=\"org.apache.tika.detect.DefaultDetector\"/>"));
    //and at least one parser
    assertTrue(currentConfig.contains("<parser class=\"org.apache.tika.parser.DefaultParser\"/>"));
}
/**
 * Currently disabled. Dumps the minimal config for tika-config2.json, collapses every
 * run of whitespace to a single space and compares against the expected parser section.
 */
@Test
@Disabled("until we re-implement serialization")
public void testConfigSerializationCustomMinimal() throws Exception {
    final String configArg = "--config=" + CONFIGS_DIR.toString() + "/tika-config2.json";
    final String dumped = getParamOutContent(configArg, "--dump-minimal-config");
    final String normalized = dumped.replaceAll("[\r\n\t ]+", " ");
    final String expected =
            "<parser class=\"org.apache.tika.parser.DefaultParser\">"
                    + " <mime-exclude>application/pdf</mime-exclude>"
                    + " <mime-exclude>image/jpeg</mime-exclude> "
                    + "</parser> "
                    + "<parser class=\"org.apache.tika.parser.EmptyParser\">"
                    + " <mime>application/pdf</mime> "
                    + "</parser>";
    assertTrue(normalized.contains(expected));
}
/**
 * Currently disabled. Dumps the static config for tika-config2.json and checks that
 * the Executable parser is not mentioned.
 */
@Test
@Disabled("until we re-implement serialization")
public void testConfigSerializationCustomStatic() throws Exception {
    // NOTE(review): this reads tika-config2.json from TEST_DATA_FILE, while
    // testConfigSerializationCustomMinimal reads it from CONFIGS_DIR - confirm which
    // location is intended before re-enabling this test.
    final String configArg = "--config=" + TEST_DATA_FILE.toString() + "/tika-config2.json";
    final String dumped = getParamOutContent(configArg, "--dump-static-config");
    assertFalse(dumped.contains("org.apache.tika.parser.executable.Executable"));
}
/**
 * Checks that both spellings of the option, --list-detector and --list-detectors,
 * name the DefaultDetector class.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testListDetectors() throws Exception {
    final String expectedDetector = "org.apache.tika.detect.DefaultDetector";
    final String[] options = {"--list-detector", "--list-detectors"};
    for (String option : options) {
        final String listing = getParamOutContent(option);
        assertTrue(listing.contains(expectedDetector));
    }
}
/**
 * Checks that both spellings of the option, --list-parser-detail-apt and
 * --list-parser-details-apt, print the OpenDocument "text-web" media type.
 *
 * @throws Exception if running the CLI fails
 */
@Test
public void testListParserDetailApt() throws Exception {
    final String expectedType = "application/vnd.oasis.opendocument.text-web";
    final String[] options = {"--list-parser-detail-apt", "--list-parser-details-apt"};
    for (String option : options) {
        final String listing = getParamOutContent(option);
        assertTrue(listing.contains(expectedType));
    }
}
/**
 * Runs TikaCLI with the given arguments and returns what it wrote to System.out.
 * <p>
 * The capture buffers are reset first (if they already hold data), so the returned
 * text belongs to this invocation only.
 *
 * @param params command-line arguments passed to {@code TikaCLI.main}
 * @return the captured standard output, decoded as UTF-8
 * @throws Exception if resetting the streams or running the CLI fails
 */
String getParamOutContent(String... params) throws Exception {
    resetContent();
    TikaCLI.main(params);
    // Decode with the same charset constant that resetContent() uses for the PrintStream.
    return outContent.toString(UTF_8.name());
}
}