TikaModule.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.serialization;
import java.io.IOException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.BeanDescription;
import com.fasterxml.jackson.databind.DeserializationConfig;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationConfig;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.deser.Deserializers;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.ser.Serializers;
import org.apache.tika.config.loader.ComponentInstantiator;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.serialization.serdes.DefaultDetectorSerializer;
import org.apache.tika.serialization.serdes.DefaultParserSerializer;
import org.apache.tika.serialization.serdes.MetadataDeserializer;
import org.apache.tika.serialization.serdes.MetadataSerializer;
import org.apache.tika.serialization.serdes.ParseContextDeserializer;
import org.apache.tika.serialization.serdes.ParseContextSerializer;
/**
* Jackson module that provides compact serialization for Tika components.
* <p>
* Uses {@link ComponentNameResolver} for friendly name resolution (e.g., "pdf-parser").
* <p>
* Supports two formats:
* <ol>
* <li>Simple string: {@code "text-parser"} &rarr; instance with defaults</li>
* <li>Object with type as key: {@code {"pdf-parser": {"ocrStrategy": "AUTO"}}} &rarr; instance with config</li>
* </ol>
* <p>
* For components implementing {@link SelfConfiguring}, uses the {@link JsonConfig} constructor.
* For other components, uses Jackson bean deserialization (readerForUpdating).
*/
public class TikaModule extends SimpleModule {
private static ObjectMapper sharedMapper;
public TikaModule() {
    super("TikaModule");

    // MediaType is written as the string produced by MediaType#toString()
    // and read back through MediaType#parse(String).
    JsonSerializer<MediaType> mediaTypeWriter = new JsonSerializer<MediaType>() {
        @Override
        public void serialize(MediaType mediaType, JsonGenerator generator,
                              SerializerProvider provider) throws IOException {
            generator.writeString(mediaType.toString());
        }
    };
    addSerializer(MediaType.class, mediaTypeWriter);

    JsonDeserializer<MediaType> mediaTypeReader = new JsonDeserializer<MediaType>() {
        @Override
        public MediaType deserialize(JsonParser parser, DeserializationContext context)
                throws IOException {
            return MediaType.parse(parser.getValueAsString());
        }
    };
    addDeserializer(MediaType.class, mediaTypeReader);

    // Metadata uses its dedicated serializer/deserializer pair.
    addSerializer(Metadata.class, new MetadataSerializer());
    addDeserializer(Metadata.class, new MetadataDeserializer());

    // ParseContext uses its dedicated serializer/deserializer pair.
    addSerializer(ParseContext.class, new ParseContextSerializer());
    addDeserializer(ParseContext.class, new ParseContextDeserializer());
}
/**
 * Stores the ObjectMapper that the compact-format component deserializer
 * reads when it instantiates components.
 * <p>
 * The component deserializer in this class fails with an IOException while
 * this value is still {@code null}, so call this before deserializing
 * components.
 *
 * @param mapper the ObjectMapper with TikaModule registered
 */
public static void setSharedMapper(ObjectMapper mapper) {
    TikaModule.sharedMapper = mapper;
}
/**
 * Returns the ObjectMapper previously stored via
 * {@link #setSharedMapper(ObjectMapper)}.
 *
 * @return the shared mapper, or {@code null} if none has been set
 */
public static ObjectMapper getSharedMapper() {
    return TikaModule.sharedMapper;
}
/**
 * Performs the standard SimpleModule setup and then registers the
 * Tika-specific deserializer and serializer providers with the context.
 *
 * @param context the Jackson setup context this module is being added to
 */
@Override
public void setupModule(SetupContext context) {
    super.setupModule(context);

    Deserializers componentDeserializers = new TikaDeserializers();
    context.addDeserializers(componentDeserializers);

    Serializers componentSerializers = new TikaSerializers();
    context.addSerializers(componentSerializers);
}
/**
 * Deserializer provider for Tika component types.
 * <p>
 * A compact-format deserializer is supplied only when the requested type is an
 * interface or abstract class AND it is either one of the context-key
 * interfaces or a compact-format type according to {@link ComponentNameResolver}.
 * For every other type this provider returns {@code null}, leaving concrete
 * implementations to Jackson's regular bean deserialization.
 */
private static class TikaDeserializers extends Deserializers.Base {

    @Override
    public JsonDeserializer<?> findBeanDeserializer(JavaType type, DeserializationConfig config,
                                                    BeanDescription beanDesc) {
        final Class<?> target = type.getRawClass();

        // Concrete classes are never handled here.
        final boolean abstractType =
                target.isInterface() || Modifier.isAbstract(target.getModifiers());
        if (!abstractType) {
            return null;
        }

        final boolean compactFormat =
                ComponentNameResolver.getContextKeyInterfaces().contains(target)
                        || ComponentNameResolver.usesCompactFormat(target);
        return compactFormat ? new TikaComponentDeserializer(target) : null;
    }
}
/**
 * Serializer provider for Tika component types.
 * <p>
 * Selection order:
 * <ol>
 * <li>{@link DefaultParser} and {@link DefaultDetector} get their dedicated serializers.</li>
 * <li>{@code ParserDecorator.MimeFilteringDecorator} gets the compact component serializer.</li>
 * <li>Any type that uses the compact format and has a registered friendly name
 * gets the compact component serializer.</li>
 * <li>Everything else returns {@code null}.</li>
 * </ol>
 */
private static class TikaSerializers extends Serializers.Base {

    @Override
    public JsonSerializer<?> findSerializer(SerializationConfig config, JavaType type,
                                            BeanDescription beanDesc) {
        final Class<?> target = type.getRawClass();

        // SPI composite types have their own serializers.
        if (target == DefaultParser.class) {
            return new DefaultParserSerializer();
        }
        if (target == DefaultDetector.class) {
            return new DefaultDetectorSerializer();
        }

        // The mime-filtering decorator is always written in compact form; other types
        // need both a compact-format interface and a registered friendly name.
        final boolean mimeFilteringDecorator =
                target == ParserDecorator.MimeFilteringDecorator.class;
        if (mimeFilteringDecorator
                || (ComponentNameResolver.usesCompactFormat(target)
                        && ComponentNameResolver.getFriendlyName(target) != null)) {
            return new TikaComponentSerializer();
        }
        return null;
    }
}
/**
 * Deserializer that handles both string and object formats for Tika components.
 * Delegates to {@link ComponentInstantiator#instantiateComponent} for instantiation.
 * <p>
 * Accepted inputs:
 * <ul>
 * <li>a JSON string: the text is the type name and no config is passed;</li>
 * <li>a JSON object: the first field name is the type name and its value is the config;</li>
 * <li>an empty JSON object: the expected type is instantiated through its no-arg constructor.</li>
 * </ul>
 * Any other node type is rejected with an {@link IOException}.
 */
private static class TikaComponentDeserializer extends JsonDeserializer<Object> {
    private final Class<?> expectedType;

    /**
     * @param expectedType the declared (abstract) type the result must be compatible with
     */
    TikaComponentDeserializer(Class<?> expectedType) {
        this.expectedType = expectedType;
    }

    /**
     * Reads the current value as a tree and turns it into a component instance.
     *
     * @param p    parser positioned at the component value
     * @param ctxt Jackson deserialization context (not used directly)
     * @return the instantiated component
     * @throws IOException if the shared mapper is not set, the JSON shape is unsupported,
     *                     the empty-object fallback cannot be constructed, or the
     *                     instantiator reports a {@link TikaConfigException}
     */
    @Override
    public Object deserialize(JsonParser p, DeserializationContext ctxt) throws IOException {
        JsonNode node = p.readValueAsTree();
        ObjectMapper mapper = sharedMapper;
        if (mapper == null) {
            throw new IOException("Shared ObjectMapper not configured. " +
                    "Call TikaModule.setSharedMapper() before deserializing.");
        }

        String typeName;
        JsonNode configNode;
        if (node.isTextual()) {
            typeName = node.asText();
            configNode = null;
        } else if (node.isObject()) {
            Iterator<Map.Entry<String, JsonNode>> fields = node.fields();
            if (!fields.hasNext()) {
                // "{}" carries no type name: fall back to the expected type's no-arg constructor.
                try {
                    return expectedType.getDeclaredConstructor().newInstance();
                } catch (ReflectiveOperationException e) {
                    // Keep the reflective failure as the cause so the real reason is not lost.
                    throw new IOException("Empty object for abstract type " +
                            expectedType.getSimpleName() + " - specify a concrete type name", e);
                }
            }
            // Only the first entry is consulted; any further top-level entries are ignored.
            Map.Entry<String, JsonNode> entry = fields.next();
            typeName = entry.getKey();
            configNode = entry.getValue();
        } else {
            throw new IOException("Expected string or object for " +
                    expectedType.getSimpleName() + ", got: " + node.getNodeType());
        }

        try {
            return ComponentInstantiator.instantiateComponent(typeName, configNode,
                    mapper, Thread.currentThread().getContextClassLoader(), expectedType);
        } catch (TikaConfigException e) {
            throw new IOException(e.getMessage(), e);
        }
    }
}
/**
 * Serializer that produces compact output for Tika components.
 * <p>
 * Writes a plain string (the type name) when no property differs from the defaults,
 * otherwise an object of the form {@code {"<type-name>": {...differing properties...}}}.
 * For a {@code ParserDecorator.MimeFilteringDecorator} the wrapped parser is serialized
 * and non-empty include/exclude sets are added under {@code _mime-include} and
 * {@code _mime-exclude}.
 */
private static class TikaComponentSerializer extends JsonSerializer<Object> {

    TikaComponentSerializer() {
    }

    /**
     * Writes {@code value} in compact form.
     *
     * @param value       the component to serialize
     * @param gen         generator to write to
     * @param serializers Jackson serializer provider (not used directly)
     * @throws IOException on write failure, or if a default instance needed for the
     *                     comparison cannot be created reflectively
     */
    @Override
    public void serialize(Object value, JsonGenerator gen, SerializerProvider serializers)
            throws IOException {
        // Handle MimeFilteringDecorator specially for round-trip: remember the filters
        // and serialize the wrapped parser instead of the decorator.
        Set<MediaType> includeTypes = null;
        Set<MediaType> excludeTypes = null;
        if (value instanceof ParserDecorator.MimeFilteringDecorator mfd) {
            includeTypes = mfd.getIncludeTypes();
            excludeTypes = mfd.getExcludeTypes();
            value = mfd.getWrappedParser();
        }

        // Prefer the friendly name; fall back to the fully qualified class name.
        String typeName = ComponentNameResolver.getFriendlyName(value.getClass());
        if (typeName == null) {
            typeName = value.getClass().getName();
        }

        // The mapper is only used here as a node factory, so a generator whose codec is
        // not an ObjectMapper must not fail with a ClassCastException.
        ObjectMapper mapper = (gen.getCodec() instanceof ObjectMapper codecMapper)
                ? codecMapper
                : TikaObjectMapperFactory.getPlainMapper();

        // Get configured properties (only non-default values)
        ObjectNode configNode = getConfiguredProperties(value, mapper);

        // Add mime filter fields if present
        if (includeTypes != null && !includeTypes.isEmpty()) {
            configNode.set("_mime-include", mimeTypesToArray(includeTypes, mapper));
        }
        if (excludeTypes != null && !excludeTypes.isEmpty()) {
            configNode.set("_mime-exclude", mimeTypesToArray(excludeTypes, mapper));
        }

        if (configNode.isEmpty()) {
            // No config differences - output simple string
            gen.writeString(typeName);
        } else {
            // Has config - output object with type as key
            gen.writeStartObject();
            gen.writeObjectField(typeName, configNode);
            gen.writeEndObject();
        }
    }

    /** Converts media types to a JSON array of their string forms, in the set's iteration order. */
    private JsonNode mimeTypesToArray(Set<MediaType> types, ObjectMapper mapper) {
        var arrayNode = mapper.createArrayNode();
        for (MediaType type : types) {
            arrayNode.add(type.toString());
        }
        return arrayNode;
    }

    /**
     * Collects the properties of {@code value} that differ from a freshly constructed default.
     * <p>
     * If the component exposes a public {@code getConfig()} method, the returned config object
     * is compared against a default instance of the config's class; a {@code null} config
     * yields an empty node. Otherwise the component itself is compared against a default
     * instance of its own class.
     *
     * @param value  the component being serialized
     * @param mapper mapper used to create the result node
     * @return an object node holding only the non-default properties (possibly empty)
     * @throws IOException if {@code getConfig()} cannot be invoked or a default instance
     *                     cannot be created via a no-arg constructor
     */
    private ObjectNode getConfiguredProperties(Object value, ObjectMapper mapper) throws IOException {
        try {
            // Check for getConfig() method (common pattern for config objects)
            Method getConfigMethod = findGetConfigMethod(value.getClass());
            if (getConfigMethod == null) {
                // No config object - compare the component itself with a default instance
                Object defaultInstance = value.getClass().getDeclaredConstructor().newInstance();
                return diffFromDefaults(value, defaultInstance, mapper);
            }

            Object config = getConfigMethod.invoke(value);
            if (config == null) {
                return mapper.createObjectNode();
            }
            // Create default config to compare against
            Object defaultConfig = config.getClass().getDeclaredConstructor().newInstance();
            return diffFromDefaults(config, defaultConfig, mapper);
        } catch (ReflectiveOperationException e) {
            throw new IOException("Failed to serialize config", e);
        }
    }

    /**
     * Converts both objects to trees with the plain mapper and returns the fields of
     * {@code actual} whose value is not equal to the same-named field of {@code defaults}
     * (fields absent from {@code defaults} are kept).
     */
    private ObjectNode diffFromDefaults(Object actual, Object defaults, ObjectMapper mapper) {
        ObjectMapper plainMapper = TikaObjectMapperFactory.getPlainMapper();
        ObjectNode actualNode = plainMapper.valueToTree(actual);
        ObjectNode defaultNode = plainMapper.valueToTree(defaults);

        ObjectNode result = mapper.createObjectNode();
        Iterator<Map.Entry<String, JsonNode>> fields = actualNode.fields();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> field = fields.next();
            JsonNode defaultValue = defaultNode.get(field.getKey());
            if (!field.getValue().equals(defaultValue)) {
                result.set(field.getKey(), field.getValue());
            }
        }
        return result;
    }

    /**
     * Looks up a public no-arg {@code getConfig()} method with a non-void return type.
     *
     * @return the method, or {@code null} if the class has no such method
     */
    private Method findGetConfigMethod(Class<?> clazz) {
        try {
            Method method = clazz.getMethod("getConfig");
            if (method.getReturnType() != void.class) {
                return method;
            }
        } catch (NoSuchMethodException ignored) {
            // No getConfig method - the caller compares the component itself instead.
        }
        return null;
    }
}
}