// PipesForkParserTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.fork;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.sax.BasicContentHandlerFactory;
public class PipesForkParserTest {
/** Plugins directory handed to every parser config; a relative path, so it resolves against the working directory. */
private static final Path PLUGINS_DIR = Paths.get("target/plugins");
/** Temporary directory injected through JUnit's {@code @TempDir}; all test fixtures are written beneath it. */
@TempDir
Path tempDir;
/**
 * Prints a warning to stderr once, before any test runs, when the plugins
 * directory is missing. The check is advisory only: it neither fails nor
 * skips the tests.
 */
@BeforeAll
static void checkPluginsDir() {
    if (Files.isDirectory(PLUGINS_DIR)) {
        return;
    }
    Path expectedLocation = PLUGINS_DIR.toAbsolutePath();
    System.err.println("WARNING: Plugins directory not found at " + expectedLocation
            + ". Tests may fail. Run 'mvn process-test-resources' first.");
}
/**
 * Writes a zip archive into the temporary directory.
 *
 * @param zipName file name of the archive, resolved against {@code tempDir}
 * @param entries alternating entry names and entry contents
 *                ({@code name1, content1, name2, content2, ...}); each content
 *                string is written as UTF-8 bytes
 * @return path of the archive that was written
 * @throws IllegalArgumentException if {@code entries} does not hold complete
 *                                  name/content pairs
 * @throws IOException if the archive cannot be created or written
 */
private Path createZipWithEmbeddedFiles(String zipName, String... entries) throws IOException {
    // Reject a dangling entry name up front; otherwise the loop would fail on
    // entries[i + 1] only after a partially written archive is already on disk.
    if (entries.length % 2 != 0) {
        throw new IllegalArgumentException(
                "entries must be name/content pairs, got " + entries.length + " element(s)");
    }
    Path zipPath = tempDir.resolve(zipName);
    try (OutputStream fos = Files.newOutputStream(zipPath);
            ZipOutputStream zos = new ZipOutputStream(fos)) {
        for (int i = 0; i < entries.length; i += 2) {
            zos.putNextEntry(new ZipEntry(entries[i]));
            zos.write(entries[i + 1].getBytes(StandardCharsets.UTF_8));
            zos.closeEntry();
        }
    }
    return zipPath;
}
/**
 * Parses a two-line plain-text file passed as a {@code TikaInputStream}
 * (RMETA mode, TEXT handler, one extra JVM argument) and checks that the
 * parse succeeds, metadata is returned and the expected text is present.
 */
@Test
public void testParseTextFile() throws Exception {
    Path textFile = tempDir.resolve("test.txt");
    Files.writeString(textFile, "Hello, this is a test document.\nIt has multiple lines.");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA)
            .addJvmArg("-Xmx256m");

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(textFile)) {
        PipesForkResult outcome = forkParser.parse(input);

        assertTrue(outcome.isSuccess(), "Parse should succeed. Status: " + outcome.getStatus()
                + ", message: " + outcome.getMessage());
        assertFalse(outcome.isProcessCrash(), "Should not be a process crash");

        List<Metadata> allMetadata = outcome.getMetadataList();
        assertNotNull(allMetadata, "Metadata list should not be null");
        assertFalse(allMetadata.isEmpty(), "Metadata list should not be empty");

        String text = outcome.getContent();
        assertNotNull(text, "Content should not be null");
        // Same two fragment checks as before, in the same order.
        for (String fragment : new String[] {"Hello", "test document"}) {
            assertTrue(text.contains(fragment), "Content should contain '" + fragment + "'");
        }
    }
}
/**
 * Parses a small HTML document through the
 * {@code parse(TikaInputStream, Metadata)} overload and checks that both
 * metadata and the paragraph text come back.
 */
@Test
public void testParseWithMetadata() throws Exception {
    Path htmlFile = tempDir.resolve("test.html");
    Files.writeString(htmlFile, "<html><head><title>Test Title</title></head>"
            + "<body><p>Test paragraph content.</p></body></html>");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(htmlFile)) {
        // The caller-supplied metadata is deliberately left empty.
        PipesForkResult outcome = forkParser.parse(input, new Metadata());
        assertTrue(outcome.isSuccess(), "Parse should succeed");

        assertNotNull(outcome.getMetadata(), "Metadata should not be null");

        String text = outcome.getContent();
        assertNotNull(text, "Content should not be null");
        assertTrue(text.contains("Test paragraph"), "Content should contain paragraph text");
    }
}
/**
 * Reuses a single {@code PipesForkParser} for two text files in a row and
 * checks that each parse succeeds and returns that file's own content.
 */
@Test
public void testParseMultipleFiles() throws Exception {
    String[] fileNames = {"test1.txt", "test2.txt"};
    String[] fileBodies = {"Content of first file", "Content of second file"};
    String[] expectedFragments = {"first file", "second file"};

    // Write all fixtures before the parser is created.
    Path[] inputs = new Path[fileNames.length];
    for (int i = 0; i < fileNames.length; i++) {
        inputs[i] = tempDir.resolve(fileNames[i]);
    }
    for (int i = 0; i < inputs.length; i++) {
        Files.writeString(inputs[i], fileBodies[i]);
    }

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig)) {
        for (int i = 0; i < inputs.length; i++) {
            // Each input stream is opened and closed around its own parse call.
            try (TikaInputStream input = TikaInputStream.get(inputs[i])) {
                PipesForkResult outcome = forkParser.parse(input);
                assertTrue(outcome.isSuccess());
                assertTrue(outcome.getContent().contains(expectedFragments[i]));
            }
        }
    }
}
/**
 * Parses a zip with two embedded text files in CONCATENATE mode and checks
 * that exactly one metadata object is returned whose content holds the text
 * of both embedded files.
 */
@Test
public void testConcatenateMode() throws Exception {
    Path archive = createZipWithEmbeddedFiles("test_with_embedded.zip",
            "embedded1.txt", "Content from first embedded file",
            "embedded2.txt", "Content from second embedded file");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.CONCATENATE);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(archive)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess(), "Parse should succeed");

        // One metadata object is expected even though the zip has several entries.
        int metadataCount = outcome.getMetadataList().size();
        assertEquals(1, metadataCount, "CONCATENATE mode should return single metadata");

        String merged = outcome.getContent();
        assertNotNull(merged);
        for (String ordinal : new String[] {"first", "second"}) {
            assertTrue(merged.contains(ordinal + " embedded"),
                    "Content should contain text from " + ordinal + " embedded file");
        }
    }
}
/**
 * Runs a text file through NO_PARSE mode and checks that a single metadata
 * object with a {@code text/plain} content type is returned while no
 * content is extracted.
 */
@Test
public void testNoParseMode() throws Exception {
    Path textFile = tempDir.resolve("test_no_parse.txt");
    Files.writeString(textFile, "This content should NOT be extracted in NO_PARSE mode.");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.NO_PARSE);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(textFile)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess(), "Parse should succeed. Status: " + outcome.getStatus()
                + ", message: " + outcome.getMessage());

        List<Metadata> allMetadata = outcome.getMetadataList();
        assertEquals(1, allMetadata.size(), "NO_PARSE mode should return single metadata");

        // The content type must still be reported.
        String mimeType = allMetadata.get(0).get(Metadata.CONTENT_TYPE);
        assertNotNull(mimeType, "Content type should be detected");
        assertTrue(mimeType.contains("text/plain"),
                "Content type should be text/plain, got: " + mimeType);

        // A null or blank body both count as "nothing extracted".
        String text = outcome.getContent();
        boolean nothingExtracted = text == null || text.isBlank();
        assertTrue(nothingExtracted, "NO_PARSE mode should not extract content, got: " + text);
    }
}
/**
 * Runs a zip with two embedded files through NO_PARSE mode and checks that
 * only one metadata object is returned, its content type mentions
 * {@code zip}, and no content is extracted.
 */
@Test
public void testNoParseModeWithZip() throws Exception {
    Path archive = createZipWithEmbeddedFiles("test_no_parse.zip",
            "embedded1.txt", "Content from first embedded file",
            "embedded2.txt", "Content from second embedded file");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.NO_PARSE);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(archive)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess(), "Parse should succeed");

        // Embedded entries must not add metadata objects of their own.
        List<Metadata> allMetadata = outcome.getMetadataList();
        assertEquals(1, allMetadata.size(),
                "NO_PARSE mode should return only container metadata, not embedded files");

        String mimeType = allMetadata.get(0).get(Metadata.CONTENT_TYPE);
        assertNotNull(mimeType, "Content type should be detected");
        assertTrue(mimeType.contains("zip"), "Content type should be zip, got: " + mimeType);

        String text = outcome.getContent();
        boolean nothingExtracted = text == null || text.isBlank();
        assertTrue(nothingExtracted, "NO_PARSE mode should not extract content");
    }
}
/**
 * Parses a zip with two embedded text files in RMETA mode and checks that
 * at least three metadata objects are returned (the container plus one per
 * embedded file).
 */
@Test
public void testRmetaModeWithEmbedded() throws Exception {
    Path archive = createZipWithEmbeddedFiles("test_rmeta_embedded.zip",
            "file1.txt", "First file content",
            "file2.txt", "Second file content");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(archive)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess(), "Parse should succeed");

        // Lower bound only: container + two embedded files.
        List<Metadata> allMetadata = outcome.getMetadataList();
        boolean coversContainerAndEntries = allMetadata.size() >= 3;
        assertTrue(coversContainerAndEntries,
                "RMETA mode should return metadata for container + embedded files, got: "
                        + allMetadata.size());
    }
}
/**
 * Parses the same zip twice, once with an explicit TEXT/RMETA config and
 * once with a config that only sets the plugins directory, and checks that
 * both runs return the same number of metadata objects.
 */
@Test
public void testDefaultConfigMatchesExplicitRmeta() throws Exception {
    Path archive = createZipWithEmbeddedFiles("test_default_config.zip",
            "file1.txt", "First file content",
            "file2.txt", "Second file content");

    // Baseline run: handler type and parse mode spelled out.
    PipesForkParserConfig rmetaConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);
    int baselineCount;
    try (PipesForkParser forkParser = new PipesForkParser(rmetaConfig);
            TikaInputStream input = TikaInputStream.get(archive)) {
        PipesForkResult baseline = forkParser.parse(input);
        assertTrue(baseline.isSuccess());
        baselineCount = baseline.getMetadataList().size();
    }

    // Second run: everything except the plugins directory is left at its default.
    PipesForkParserConfig bareConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR);
    try (PipesForkParser forkParser = new PipesForkParser(bareConfig);
            TikaInputStream input = TikaInputStream.get(archive)) {
        PipesForkResult withDefaults = forkParser.parse(input);
        assertTrue(withDefaults.isSuccess(), "Parse with default config should succeed");
        assertEquals(baselineCount, withDefaults.getMetadataList().size(),
                "Default config should produce same metadata count as explicit RMETA config");
    }
}
/**
 * Parses one HTML file with the TEXT handler and again with the XML handler,
 * then checks that the TEXT output is free of markup, the XML output
 * contains {@code <p>} elements, and the XML output is the longer of the two.
 */
@Test
public void testTextVsXhtmlHandlerType() throws Exception {
    Path htmlFile = tempDir.resolve("test_handler.html");
    Files.writeString(htmlFile, "<html><head><title>Test Title</title></head>"
            + "<body><p>Paragraph one.</p><p>Paragraph two.</p></body></html>");

    // First pass: TEXT handler.
    PipesForkParserConfig plainConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);
    String plainText;
    try (PipesForkParser forkParser = new PipesForkParser(plainConfig);
            TikaInputStream input = TikaInputStream.get(htmlFile)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess(), "TEXT parse should succeed");
        plainText = outcome.getContent();
        assertNotNull(plainText, "TEXT content should not be null");
        for (String tag : new String[] {"<p>", "<html>"}) {
            assertFalse(plainText.contains(tag), "TEXT content should not contain " + tag + " tags");
        }
        assertTrue(plainText.contains("Paragraph one"), "TEXT content should contain text");
    }

    // Second pass: XML handler.
    PipesForkParserConfig markupConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.XML)
            .setParseMode(ParseMode.RMETA);
    String markupText;
    try (PipesForkParser forkParser = new PipesForkParser(markupConfig);
            TikaInputStream input = TikaInputStream.get(htmlFile)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess(), "XML parse should succeed");
        markupText = outcome.getContent();
        assertNotNull(markupText, "XML content should not be null");
        // Accept a paragraph element with or without attributes.
        boolean hasParagraphTag = markupText.contains("<p>") || markupText.contains("<p ");
        assertTrue(hasParagraphTag, "XML content should contain <p> tags");
        assertTrue(markupText.contains("Paragraph one"), "XML content should contain text");
    }

    assertTrue(markupText.length() > plainText.length(),
            "XML content should be longer than TEXT content due to markup");
}
/**
 * Parses a 1000-line text file with the write limit set to 100 and checks
 * only that a result object is returned.
 */
@Test
public void testWriteLimit() throws Exception {
    StringBuilder body = new StringBuilder();
    for (int lineNo = 0; lineNo < 1000; lineNo++) {
        body.append("This is line " + lineNo + " of the test document.\n");
    }
    Path bigFile = tempDir.resolve("longfile.txt");
    Files.writeString(bigFile, body.toString());

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA)
            .setWriteLimit(100);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig);
            TikaInputStream input = TikaInputStream.get(bigFile)) {
        PipesForkResult outcome = forkParser.parse(input);
        // NOTE(review): per the original author's note the outcome depends on a
        // throwOnWriteLimitReached setting (presumably true by default, so an
        // exception may be recorded) - confirm. Only non-nullness is asserted.
        assertNotNull(outcome);
    }
}
/**
 * Parses a text file with a config that sets nothing but the plugins
 * directory and checks that the parse succeeds with non-null content.
 */
@Test
public void testDefaultConfiguration() throws Exception {
    Path textFile = tempDir.resolve("default.txt");
    Files.writeString(textFile, "Testing default configuration");

    PipesForkParserConfig bareConfig = new PipesForkParserConfig().setPluginsDir(PLUGINS_DIR);

    try (PipesForkParser forkParser = new PipesForkParser(bareConfig);
            TikaInputStream input = TikaInputStream.get(textFile)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess());
        assertNotNull(outcome.getContent());
    }
}
/**
 * Checks that {@code TikaInputStream.get(Path)} throws
 * {@code NoSuchFileException} for a path that was never created.
 */
@Test
public void testFileNotFoundThrowsException() throws Exception {
    Path missing = tempDir.resolve("does_not_exist.txt");

    // The failure is expected while the stream is opened, before any parser is involved.
    assertThrows(java.nio.file.NoSuchFileException.class, () -> TikaInputStream.get(missing));
}
/**
 * With a parser already open, first triggers a {@code NoSuchFileException}
 * by opening a missing file, then checks that an existing file still parses
 * successfully on the same parser.
 */
@Test
public void testExceptionOnOneFileDoesNotPreventNextParse() throws Exception {
    Path missing = tempDir.resolve("does_not_exist.txt");
    Path present = tempDir.resolve("real_file.txt");
    Files.writeString(present, "This file exists");

    PipesForkParserConfig bareConfig = new PipesForkParserConfig().setPluginsDir(PLUGINS_DIR);

    try (PipesForkParser forkParser = new PipesForkParser(bareConfig)) {
        // Opening the stream fails; the parser itself is never handed the missing file.
        assertThrows(java.nio.file.NoSuchFileException.class, () -> TikaInputStream.get(missing));

        try (TikaInputStream input = TikaInputStream.get(present)) {
            PipesForkResult outcome = forkParser.parse(input);
            assertTrue(outcome.isSuccess(), "Should succeed for existing file");
            assertTrue(outcome.getContent().contains("This file exists"));
        }
    }
}
/**
 * Parses a trivial text file with the default configuration and checks that
 * the reported status is one of the success statuses
 * ({@code PARSE_SUCCESS}, {@code PARSE_SUCCESS_WITH_EXCEPTION} or
 * {@code EMIT_SUCCESS_PASSBACK}).
 */
@Test
public void testParseSuccessWithExceptionStatus() throws Exception {
    Path testFile = tempDir.resolve("parse_with_warning.txt");
    Files.writeString(testFile, "Simple content");
    PipesForkParserConfig config = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR);
    try (PipesForkParser parser = new PipesForkParser(config);
            TikaInputStream tis = TikaInputStream.get(testFile)) {
        PipesForkResult result = parser.parse(tis);
        // Assert success instead of guarding with an if: a failed parse would
        // otherwise skip the status check and let the test pass vacuously.
        assertTrue(result.isSuccess(), "Parse should succeed. Status: " + result.getStatus()
                + ", message: " + result.getMessage());
        assertTrue(
                result.getStatus() == PipesResult.RESULT_STATUS.PARSE_SUCCESS
                        || result.getStatus() == PipesResult.RESULT_STATUS.PARSE_SUCCESS_WITH_EXCEPTION
                        || result.getStatus() == PipesResult.RESULT_STATUS.EMIT_SUCCESS_PASSBACK,
                "Success status should be one of the success types, got: " + result.getStatus());
    }
}
/**
 * Parses a text file and checks that exactly one of the result category
 * flags (success, process crash, fatal, initialization failure, task
 * exception) is set.
 */
@Test
public void testResultCategorization() throws Exception {
    Path textFile = tempDir.resolve("categorize.txt");
    Files.writeString(textFile, "Test categorization");

    PipesForkParserConfig bareConfig = new PipesForkParserConfig().setPluginsDir(PLUGINS_DIR);

    try (PipesForkParser forkParser = new PipesForkParser(bareConfig);
            TikaInputStream input = TikaInputStream.get(textFile)) {
        PipesForkResult outcome = forkParser.parse(input);

        boolean[] categoryFlags = {
            outcome.isSuccess(),
            outcome.isProcessCrash(),
            outcome.isFatal(),
            outcome.isInitializationFailure(),
            outcome.isTaskException()
        };
        int flagsSet = 0;
        for (boolean flag : categoryFlags) {
            if (flag) {
                flagsSet++;
            }
        }

        // At least one category must apply ...
        assertTrue(flagsSet > 0, "Result should have a valid category");
        // ... and the categories must not overlap.
        assertEquals(1, flagsSet, "Exactly one category should be true");
    }
}
/**
 * Parses a text file through the {@code parse(Path)} overload, with no
 * {@code TikaInputStream} created by the test, and checks success,
 * metadata and extracted text.
 */
@Test
public void testParseWithPath() throws Exception {
    Path textFile = tempDir.resolve("test_path.txt");
    Files.writeString(textFile, "Hello from path-based parsing!");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig)) {
        PipesForkResult outcome = forkParser.parse(textFile);

        assertTrue(outcome.isSuccess(), "Parse should succeed. Status: " + outcome.getStatus()
                + ", message: " + outcome.getMessage());
        assertFalse(outcome.isProcessCrash(), "Should not be a process crash");

        List<Metadata> allMetadata = outcome.getMetadataList();
        assertNotNull(allMetadata, "Metadata list should not be null");
        assertFalse(allMetadata.isEmpty(), "Metadata list should not be empty");

        String text = outcome.getContent();
        assertNotNull(text, "Content should not be null");
        assertTrue(text.contains("path-based parsing"),
                "Content should contain 'path-based parsing'");
    }
}
/**
 * Parses a text file through the {@code parse(Path, Metadata)} overload,
 * passing metadata that carries one custom key, and checks success,
 * non-null metadata and the extracted text.
 */
@Test
public void testParseWithPathAndMetadata() throws Exception {
    Path textFile = tempDir.resolve("test_path_metadata.txt");
    Files.writeString(textFile, "Content for metadata test");

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig)) {
        Metadata suppliedMetadata = new Metadata();
        suppliedMetadata.set("custom-key", "custom-value");

        PipesForkResult outcome = forkParser.parse(textFile, suppliedMetadata);

        assertTrue(outcome.isSuccess(), "Parse should succeed");
        assertNotNull(outcome.getMetadata(), "Metadata should not be null");
        assertTrue(outcome.getContent().contains("metadata test"));
    }
}
/**
 * Reuses a single {@code PipesForkParser} for two files passed as
 * {@code Path} and checks that each parse succeeds and returns that file's
 * own content.
 */
@Test
public void testParseMultipleFilesWithPath() throws Exception {
    String[] fileNames = {"path1.txt", "path2.txt"};
    String[] fileBodies = {"Content of first path file", "Content of second path file"};
    String[] expectedFragments = {"first path file", "second path file"};

    // Write all fixtures before the parser is created.
    Path[] inputs = new Path[fileNames.length];
    for (int i = 0; i < fileNames.length; i++) {
        inputs[i] = tempDir.resolve(fileNames[i]);
    }
    for (int i = 0; i < inputs.length; i++) {
        Files.writeString(inputs[i], fileBodies[i]);
    }

    PipesForkParserConfig parserConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    try (PipesForkParser forkParser = new PipesForkParser(parserConfig)) {
        for (int i = 0; i < inputs.length; i++) {
            PipesForkResult outcome = forkParser.parse(inputs[i]);
            assertTrue(outcome.isSuccess());
            assertTrue(outcome.getContent().contains(expectedFragments[i]));
        }
    }
}
/**
 * Parses the same file once via {@code parse(Path)} and once via
 * {@code parse(TikaInputStream)}, each on its own parser instance built from
 * the same config, and checks that the extracted content is equal.
 */
@Test
public void testParsePathMatchesTikaInputStream() throws Exception {
    Path textFile = tempDir.resolve("compare.txt");
    Files.writeString(textFile, "Content for comparison test");

    PipesForkParserConfig sharedConfig = new PipesForkParserConfig()
            .setPluginsDir(PLUGINS_DIR)
            .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
            .setParseMode(ParseMode.RMETA);

    // Run 1: hand the parser the Path directly.
    String contentViaPath;
    try (PipesForkParser forkParser = new PipesForkParser(sharedConfig)) {
        PipesForkResult outcome = forkParser.parse(textFile);
        assertTrue(outcome.isSuccess());
        contentViaPath = outcome.getContent();
    }

    // Run 2: wrap the same file in a TikaInputStream.
    String contentViaStream;
    try (PipesForkParser forkParser = new PipesForkParser(sharedConfig);
            TikaInputStream input = TikaInputStream.get(textFile)) {
        PipesForkResult outcome = forkParser.parse(input);
        assertTrue(outcome.isSuccess());
        contentViaStream = outcome.getContent();
    }

    assertEquals(contentViaPath, contentViaStream,
            "parse(Path) and parse(TikaInputStream) should produce same content");
}
}