TestParseContextSerialization.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.serialization;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import org.junit.jupiter.api.Test;
import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.extractor.SkipEmbeddedDocumentSelector;
import org.apache.tika.metadata.filter.AttachmentCountingListFilter;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.MockUpperCaseFilter;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.SimplePasswordProvider;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.serdes.ParseContextDeserializer;
import org.apache.tika.serialization.serdes.ParseContextSerializer;
/**
* Tests for ParseContext serialization/deserialization.
* <p>
* JSON configs are stored in ParseContext's jsonConfigs map.
* Components are resolved at runtime via ParseContextUtils.resolveAll() or ConfigDeserializer.
*/
public class TestParseContextSerialization {

    /**
     * Builds the mapper used by these tests: the mapper supplied by
     * {@link TikaObjectMapperFactory} with the ParseContext serializer and
     * deserializer registered on top.
     * <p>
     * NOTE(review): the module is registered directly on whatever instance
     * {@code getMapper()} returns; if that instance is shared, every call here
     * mutates it -- confirm this is intended.
     *
     * @return a mapper that can read and write {@link ParseContext}
     */
    private ObjectMapper createMapper() {
        // Start with the properly configured mapper that has polymorphic type handling
        ObjectMapper mapper = TikaObjectMapperFactory.getMapper();
        // Register our custom serializer/deserializer on top
        SimpleModule module = new SimpleModule();
        module.addDeserializer(ParseContext.class, new ParseContextDeserializer());
        module.addSerializer(ParseContext.class, new ParseContextSerializer());
        mapper.registerModule(module);
        return mapper;
    }

    /**
     * Serializes a ParseContext by invoking {@link ParseContextSerializer} directly
     * on a generator created from the test mapper's factory.
     *
     * @param pc the context to serialize
     * @return the JSON text written by the serializer
     * @throws Exception if the serializer rejects the context or writing fails
     */
    private String serializeParseContext(ParseContext pc) throws Exception {
        ObjectMapper mapper = createMapper();
        try (Writer writer = new StringWriter()) {
            try (JsonGenerator jsonGenerator = mapper
                    .getFactory()
                    .createGenerator(writer)) {
                ParseContextSerializer serializer = new ParseContextSerializer();
                // The serializer is driven directly, so no SerializerProvider is passed.
                serializer.serialize(pc, jsonGenerator, null);
            }
            // The generator has been closed before the buffered text is read back.
            return writer.toString();
        }
    }

    /**
     * Reports whether the message of {@code thrown}, or of any throwable in its
     * cause chain, contains {@code needle}.
     * <p>
     * {@link Throwable#getMessage()} may return {@code null}; such links are skipped
     * so that a missing message produces an assertion failure rather than a
     * NullPointerException inside the test.
     *
     * @param thrown the outermost throwable, may be {@code null}
     * @param needle the text to look for
     * @return {@code true} if any message in the chain contains {@code needle}
     */
    private static boolean causeChainMentions(Throwable thrown, String needle) {
        for (Throwable link = thrown; link != null; link = link.getCause()) {
            String message = link.getMessage();
            if (message != null && message.contains(needle)) {
                return true;
            }
        }
        return false;
    }

    /**
     * An empty ParseContext serializes to an object with no fields and can be
     * read back.
     */
    @Test
    public void testEmptyParseContext() throws Exception {
        ParseContext pc = new ParseContext();
        String json = serializeParseContext(pc);
        // Empty ParseContext should serialize to empty object
        ObjectMapper mapper = createMapper();
        JsonNode root = mapper.readTree(json);
        assertEquals(0, root.size(), "Empty ParseContext should have no fields");
        // Verify round-trip
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertNotNull(deserialized);
    }

    /**
     * JSON configs set under friendly names are written as top-level fields
     * (no "objects" wrapper) and survive a round-trip.
     */
    @Test
    public void testFriendlyNameFormat() throws Exception {
        // Test the friendly-name format
        ParseContext pc = new ParseContext();
        // Add friendly-named configurations via setJsonConfig
        pc.setJsonConfig("pdf-parser", "{\"ocrStrategy\":\"AUTO\",\"extractInlineImages\":true}");
        pc.setJsonConfig("html-parser", "{\"extractScripts\":false}");
        String json = serializeParseContext(pc);
        // Verify JSON structure - should have flat friendly names
        ObjectMapper mapper = createMapper();
        JsonNode root = mapper.readTree(json);
        assertFalse(root.has("objects"), "Should NOT have objects field");
        assertTrue(root.has("pdf-parser"), "Should have pdf-parser field");
        assertTrue(root.has("html-parser"), "Should have html-parser field");
        assertEquals("AUTO", root
                .get("pdf-parser")
                .get("ocrStrategy")
                .asText());
        assertFalse(root
                .get("html-parser")
                .get("extractScripts")
                .asBoolean());
        // Verify round-trip
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertTrue(deserialized.hasJsonConfig("pdf-parser"));
        assertTrue(deserialized.hasJsonConfig("html-parser"));
    }

    /**
     * A "timeout-limits" config is serialized with its numeric fields, survives a
     * round-trip, and resolves to a {@link TimeoutLimits} carrying the same values.
     */
    @Test
    public void testTimeoutLimitsFormat() throws Exception {
        // Test serializing timeout-limits configuration
        ParseContext pc = new ParseContext();
        pc.setJsonConfig("timeout-limits",
                "{\"progressTimeoutMillis\":30000,\"totalTaskTimeoutMillis\":120000}");
        String json = serializeParseContext(pc);
        ObjectMapper mapper = createMapper();
        JsonNode root = mapper.readTree(json);
        assertTrue(root.has("timeout-limits"), "Should have timeout-limits field");
        assertEquals(30000, root
                .get("timeout-limits")
                .get("progressTimeoutMillis")
                .asInt());
        assertEquals(120000, root
                .get("timeout-limits")
                .get("totalTaskTimeoutMillis")
                .asInt());
        // Verify round-trip
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertTrue(deserialized.hasJsonConfig("timeout-limits"));
        // Resolve and verify
        ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
        TimeoutLimits limits = deserialized.get(TimeoutLimits.class);
        assertNotNull(limits, "TimeoutLimits should be resolved");
        assertEquals(30000, limits.getProgressTimeoutMillis());
        assertEquals(120000, limits.getTotalTaskTimeoutMillis());
    }

    /**
     * {@code ConfigDeserializer.hasConfig} reports presence by name, and the stored
     * JSON can be read back from the ParseContext.
     */
    @Test
    public void testConfigDeserializerHelper() throws Exception {
        // Test the ConfigDeserializer helper utility
        ParseContext pc = new ParseContext();
        // Simulate a PDFParserConfig as JSON
        String pdfConfig = "{\"extractInlineImages\":true,\"ocrStrategy\":\"AUTO\"}";
        pc.setJsonConfig("pdf-parser", pdfConfig);
        // Test hasConfig
        assertTrue(ConfigDeserializer.hasConfig(pc, "pdf-parser"));
        assertFalse(ConfigDeserializer.hasConfig(pc, "non-existent"));
        // Test getJsonConfig retrieves JSON correctly
        String retrievedConfig = pc.getJsonConfig("pdf-parser").json();
        assertNotNull(retrievedConfig);
        assertTrue(retrievedConfig.contains("extractInlineImages"));
    }

    /**
     * A raw JSON document in friendly-name format deserializes into JSON configs
     * whose content is preserved.
     */
    @Test
    public void testDeserializeFriendlyNameFromJSON() throws Exception {
        // Test deserializing friendly-name format from raw JSON string
        String json = """
                {
                "pdf-parser": {
                "ocrStrategy": "AUTO",
                "extractInlineImages": true
                },
                "html-parser": {
                "extractScripts": false
                }
                }
                """;
        ObjectMapper mapper = createMapper();
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertTrue(deserialized.hasJsonConfig("pdf-parser"));
        assertTrue(deserialized.hasJsonConfig("html-parser"));
        // Verify the JSON content is preserved
        String pdfParserJson = deserialized.getJsonConfig("pdf-parser").json();
        assertTrue(pdfParserJson.contains("AUTO"));
        assertTrue(pdfParserJson.contains("extractInlineImages"));
    }

    /**
     * Configs nested under the optional "parse-context" wrapper are still picked up.
     */
    @Test
    public void testDeserializeWithParseContextWrapper() throws Exception {
        // Test deserializing with optional "parse-context" wrapper
        String json = """
                {
                "parse-context": {
                "pdf-parser": {
                "ocrStrategy": "NO_OCR"
                }
                }
                }
                """;
        ObjectMapper mapper = createMapper();
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertTrue(deserialized.hasJsonConfig("pdf-parser"));
    }

    /**
     * Four differently named configs are all serialized and all come back after
     * a round-trip.
     */
    @Test
    public void testMultipleConfigs() throws Exception {
        // Test with multiple different config types
        ParseContext pc = new ParseContext();
        pc.setJsonConfig("pdf-parser", "{\"ocrStrategy\":\"AUTO\"}");
        pc.setJsonConfig("html-parser", "{\"extractScripts\":true}");
        pc.setJsonConfig("timeout-limits",
                "{\"progressTimeoutMillis\":5000,\"totalTaskTimeoutMillis\":60000}");
        pc.setJsonConfig("my-custom-config", "{\"enabled\":true,\"maxRetries\":3}");
        String json = serializeParseContext(pc);
        // Verify all are present
        ObjectMapper mapper = createMapper();
        JsonNode root = mapper.readTree(json);
        assertEquals(4, root.size(), "Should have 4 config fields");
        assertTrue(root.has("pdf-parser"));
        assertTrue(root.has("html-parser"));
        assertTrue(root.has("timeout-limits"));
        assertTrue(root.has("my-custom-config"));
        // Verify round-trip
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertEquals(4, deserialized.getJsonConfigs().size());
    }

    /**
     * Serializing a ParseContext that holds an object stored under
     * {@code String.class} is expected to fail with an {@link IOException}.
     */
    @Test
    public void testUnregisteredObjectFailsSerialization() throws Exception {
        // Unregistered objects must fail serialization with a clear error
        ParseContext pc = new ParseContext();
        pc.set(String.class, "test-value");
        assertThrows(IOException.class, () -> serializeParseContext(pc),
                "Unregistered components should fail serialization");
    }

    /**
     * A "metadata-filters" array config round-trips through the mapper and resolves
     * to a {@link CompositeMetadataFilter} holding the named filters in order.
     */
    @Test
    public void testMetadataFiltersFromJson() throws Exception {
        ParseContext parseContext = new ParseContext();
        parseContext.setJsonConfig("metadata-filters", """
                [
                "attachment-counting-list-filter",
                "mock-upper-case-filter"
                ]
                """);
        ObjectMapper mapper = createMapper();
        String json = mapper.writeValueAsString(parseContext);
        ParseContext deser = mapper.readValue(json, ParseContext.class);
        // Resolve the array config
        ParseContextUtils.resolveAll(deser, Thread.currentThread().getContextClassLoader());
        MetadataFilter resolvedFilter = deser.get(MetadataFilter.class);
        assertNotNull(resolvedFilter, "MetadataFilter should be resolved");
        assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass());
        CompositeMetadataFilter deserFilter = (CompositeMetadataFilter) resolvedFilter;
        assertEquals(AttachmentCountingListFilter.class, deserFilter.getFilters().get(0).getClass());
        assertEquals(MockUpperCaseFilter.class, deserFilter.getFilters().get(1).getClass());
    }

    /**
     * A component declaring a contextKey is retrievable from the ParseContext under
     * that key after resolution.
     */
    @Test
    public void testContextKeyDeserialization() throws Exception {
        // Test that components with @TikaComponent(contextKey=...) are stored
        // in ParseContext with the contextKey, not the component class.
        // SkipEmbeddedDocumentSelector has contextKey=DocumentSelector.class
        String json = """
                {
                "skip-embedded-document-selector": {}
                }
                """;
        ObjectMapper mapper = createMapper();
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        // Resolve the config
        ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
        // Should be accessible via DocumentSelector.class (the contextKey)
        DocumentSelector selector = deserialized.get(DocumentSelector.class);
        assertNotNull(selector, "DocumentSelector should be found via contextKey");
        assertTrue(selector instanceof SkipEmbeddedDocumentSelector,
                "Should be SkipEmbeddedDocumentSelector instance");
        // The selector should skip all embedded documents (return false)
        assertFalse(selector.select(new org.apache.tika.metadata.Metadata()),
                "SkipEmbeddedDocumentSelector should return false for all documents");
    }

    /**
     * A "simple-password-provider" config resolves to a {@link SimplePasswordProvider}
     * stored under {@link PasswordProvider} and returns the configured password.
     */
    @Test
    public void testSimplePasswordProviderDeserialization() throws Exception {
        // Test that SimplePasswordProvider with contextKey=PasswordProvider.class
        // is stored in ParseContext with the contextKey
        String json = """
                {
                "simple-password-provider": {
                "password": "secret123"
                }
                }
                """;
        ObjectMapper mapper = createMapper();
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        // Resolve the config
        ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
        // Should be accessible via PasswordProvider.class (the contextKey)
        PasswordProvider provider = deserialized.get(PasswordProvider.class);
        assertNotNull(provider, "PasswordProvider should be found via contextKey");
        assertTrue(provider instanceof SimplePasswordProvider,
                "Should be SimplePasswordProvider instance");
        assertEquals("secret123", provider.getPassword(null),
                "Password should match the configured value");
    }

    /**
     * Test that BasicContentHandlerFactory can be configured via JSON,
     * deserialized, and resolved via ParseContextUtils.resolveAll().
     * This verifies the fix for TIKA-4582 where ContentHandlerFactory was not being
     * resolved because it wasn't in the "parse-context" registry.
     */
    @Test
    public void testContentHandlerFactoryRoundTrip() throws Exception {
        // Create ParseContext with BasicContentHandlerFactory configuration
        String json = """
                {
                "basic-content-handler-factory": {
                "type": "XML",
                "writeLimit": 50000
                }
                }
                """;
        ObjectMapper mapper = createMapper();
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        // Verify JSON config is present
        assertTrue(deserialized.hasJsonConfig("basic-content-handler-factory"),
                "Should have basic-content-handler-factory JSON config");
        // Resolve the config - this should now work with ComponentNameResolver
        ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
        // Should be accessible via ContentHandlerFactory.class (the contextKey)
        ContentHandlerFactory factory = deserialized.get(ContentHandlerFactory.class);
        assertNotNull(factory, "ContentHandlerFactory should be resolved");
        assertTrue(factory instanceof BasicContentHandlerFactory,
                "Should be BasicContentHandlerFactory instance");
        // Verify the configuration was applied
        BasicContentHandlerFactory basicFactory = (BasicContentHandlerFactory) factory;
        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.XML, basicFactory.getType(),
                "Handler type should be XML");
        assertEquals(50000, basicFactory.getWriteLimit(),
                "Write limit should be 50000");
    }

    /**
     * Test full round-trip: create ParseContext with ContentHandlerFactory,
     * serialize to JSON, deserialize back, resolve, and verify.
     */
    @Test
    public void testContentHandlerFactoryFullRoundTrip() throws Exception {
        // Create original ParseContext with JSON config
        ParseContext original = new ParseContext();
        original.setJsonConfig("basic-content-handler-factory", """
                {
                "type": "HTML",
                "writeLimit": 10000,
                "throwOnWriteLimitReached": false
                }
                """);
        // Serialize
        ObjectMapper mapper = createMapper();
        String json = mapper.writeValueAsString(original);
        // Verify JSON structure
        JsonNode root = mapper.readTree(json);
        assertTrue(root.has("basic-content-handler-factory"),
                "Serialized JSON should have basic-content-handler-factory");
        // Deserialize
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertTrue(deserialized.hasJsonConfig("basic-content-handler-factory"),
                "Deserialized should have JSON config");
        // Resolve
        ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
        // Verify resolution
        ContentHandlerFactory factory = deserialized.get(ContentHandlerFactory.class);
        assertNotNull(factory, "ContentHandlerFactory should be resolved after round-trip");
        // Check the type first so a mismatch is reported as an assertion failure
        // with a message rather than as a ClassCastException from the cast below.
        assertTrue(factory instanceof BasicContentHandlerFactory,
                "Should be BasicContentHandlerFactory instance");
        BasicContentHandlerFactory basicFactory = (BasicContentHandlerFactory) factory;
        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.HTML, basicFactory.getType());
        assertEquals(10000, basicFactory.getWriteLimit());
        assertFalse(basicFactory.isThrowOnWriteLimitReached());
    }

    /**
     * Test that duplicate context keys within a single JSON document are detected and rejected.
     * Both BasicContentHandlerFactory and UppercasingContentHandlerFactory resolve to
     * ContentHandlerFactory.class as their context key, so configuring both should fail.
     */
    @Test
    public void testDuplicateContextKeyDetection() throws Exception {
        // Both of these resolve to ContentHandlerFactory.class as the context key
        String json = """
                {
                "basic-content-handler-factory": {
                "type": "XML",
                "writeLimit": 50000
                },
                "uppercasing-content-handler-factory": {}
                }
                """;
        ObjectMapper mapper = createMapper();
        // Should throw an exception due to duplicate context key
        Exception ex = assertThrows(Exception.class, () ->
                mapper.readValue(json, ParseContext.class));
        // Verify the error message mentions the duplicate. The whole cause chain is
        // searched null-safely, because the mapper may wrap the original exception
        // and any link in the chain may have no message.
        assertTrue(causeChainMentions(ex, "Duplicate"),
                "Exception should mention duplicate context key: " + ex.getMessage());
        assertTrue(causeChainMentions(ex, "ContentHandlerFactory"),
                "Exception should mention the conflicting key: " + ex.getMessage());
    }

    /**
     * Test that multiple self-configuring components (e.g., parsers) with the same
     * context key are allowed. Self-configuring components stay as JSON configs and
     * are accessed by string key at runtime, so they never conflict in the context map.
     */
    @Test
    public void testSelfConfiguringComponentsAllowDuplicateContextKeys() throws Exception {
        // Both parsers resolve to Parser.class as context key, but Parser extends
        // SelfConfiguring, so they should be allowed to coexist.
        String json = """
                {
                "configurable-test-parser": {
                "maxItems": 5
                },
                "minimal-test-parser": {}
                }
                """;
        ObjectMapper mapper = createMapper();
        // Should NOT throw - self-configuring components skip duplicate detection
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        assertTrue(deserialized.hasJsonConfig("configurable-test-parser"),
                "configurable-test-parser should be stored as JSON config");
        assertTrue(deserialized.hasJsonConfig("minimal-test-parser"),
                "minimal-test-parser should be stored as JSON config");
    }

    /**
     * Test that a single component per context key is allowed (no false positives).
     */
    @Test
    public void testNoDuplicateWhenDifferentContextKeys() throws Exception {
        // These have different context keys, so both should be allowed
        String json = """
                {
                "basic-content-handler-factory": {
                "type": "TEXT",
                "writeLimit": 10000
                },
                "skip-embedded-document-selector": {}
                }
                """;
        ObjectMapper mapper = createMapper();
        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
        // Both should be present as JSON configs
        assertTrue(deserialized.hasJsonConfig("basic-content-handler-factory"));
        assertTrue(deserialized.hasJsonConfig("skip-embedded-document-selector"));
        // Resolve and verify both work
        ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
        assertNotNull(deserialized.get(ContentHandlerFactory.class));
        assertNotNull(deserialized.get(DocumentSelector.class));
    }
}