TikaModule.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.serialization;

import java.io.IOException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.BeanDescription;
import com.fasterxml.jackson.databind.DeserializationConfig;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationConfig;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.deser.Deserializers;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.ser.Serializers;

import org.apache.tika.config.loader.ComponentInstantiator;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.serialization.serdes.DefaultDetectorSerializer;
import org.apache.tika.serialization.serdes.DefaultParserSerializer;
import org.apache.tika.serialization.serdes.MetadataDeserializer;
import org.apache.tika.serialization.serdes.MetadataSerializer;
import org.apache.tika.serialization.serdes.ParseContextDeserializer;
import org.apache.tika.serialization.serdes.ParseContextSerializer;

/**
 * Jackson module that provides compact serialization for Tika components.
 * <p>
 * Uses {@link ComponentNameResolver} for friendly name resolution (e.g., "pdf-parser").
 * <p>
 * Supports two formats:
 * <ol>
 *   <li>Simple string: {@code "text-parser"} &rarr; instance with defaults</li>
 *   <li>Object with type as key: {@code {"pdf-parser": {"ocrStrategy": "AUTO"}}} &rarr; instance with config</li>
 * </ol>
 * <p>
 * For components implementing {@link SelfConfiguring}, uses the {@link JsonConfig} constructor.
 * For other components, uses Jackson bean deserialization (readerForUpdating).
 */
public class TikaModule extends SimpleModule {

    // Process-wide mapper read by TikaComponentDeserializer and assigned via setSharedMapper().
    // Stays null until setSharedMapper() is called.
    // NOTE(review): plain static field (not volatile/synchronized) - confirm it is always set
    // before any concurrent deserialization starts.
    private static ObjectMapper sharedMapper;

    /**
     * Creates the module under the name {@code "TikaModule"} and registers the fixed
     * serializer/deserializer pairs for {@link MediaType}, {@link Metadata} and
     * {@link ParseContext}.
     */
    public TikaModule() {
        super("TikaModule");

        // MediaType is written as its toString() form and read back through MediaType.parse().
        JsonSerializer<MediaType> mediaTypeWriter = new JsonSerializer<MediaType>() {
            @Override
            public void serialize(MediaType mediaType, JsonGenerator out, SerializerProvider provider)
                    throws IOException {
                out.writeString(mediaType.toString());
            }
        };
        JsonDeserializer<MediaType> mediaTypeReader = new JsonDeserializer<MediaType>() {
            @Override
            public MediaType deserialize(JsonParser parser, DeserializationContext context)
                    throws IOException {
                return MediaType.parse(parser.getValueAsString());
            }
        };
        addSerializer(MediaType.class, mediaTypeWriter);
        addDeserializer(MediaType.class, mediaTypeReader);

        // Metadata uses its dedicated serdes pair.
        addSerializer(Metadata.class, new MetadataSerializer());
        addDeserializer(Metadata.class, new MetadataDeserializer());

        // ParseContext uses its dedicated serdes pair.
        addSerializer(ParseContext.class, new ParseContextSerializer());
        addDeserializer(ParseContext.class, new ParseContextDeserializer());
    }

    /**
     * Stores the ObjectMapper that component deserialization will use.
     * <p>
     * The value is held in a static field, so it applies to every {@code TikaModule}
     * instance. Component deserialization fails with an {@link IOException} while no
     * mapper has been stored.
     *
     * @param mapper the ObjectMapper with TikaModule registered
     */
    public static void setSharedMapper(ObjectMapper mapper) {
        TikaModule.sharedMapper = mapper;
    }

    /**
     * Returns the ObjectMapper previously stored through {@link #setSharedMapper(ObjectMapper)}.
     *
     * @return the stored mapper, or {@code null} when none has been set
     */
    public static ObjectMapper getSharedMapper() {
        return TikaModule.sharedMapper;
    }

    /**
     * Performs the standard {@link SimpleModule} setup and then adds the Tika component
     * deserializer and serializer lookups to the given context.
     *
     * @param context the setup context supplied by Jackson
     */
    @Override
    public void setupModule(SetupContext context) {
        super.setupModule(context);

        Deserializers componentDeserializers = new TikaDeserializers();
        Serializers componentSerializers = new TikaSerializers();
        context.addDeserializers(componentDeserializers);
        context.addSerializers(componentSerializers);
    }

    /**
     * Deserializer lookup for Tika component types.
     * <p>
     * A {@link TikaComponentDeserializer} is supplied only for interfaces and abstract
     * classes that are either listed in {@code ComponentNameResolver.getContextKeyInterfaces()}
     * or reported by {@code ComponentNameResolver.usesCompactFormat}. For every other type
     * the lookup returns {@code null}, leaving the type to Jackson's regular handling.
     */
    private static class TikaDeserializers extends Deserializers.Base {
        @Override
        public JsonDeserializer<?> findBeanDeserializer(JavaType type, DeserializationConfig config,
                                                         BeanDescription beanDesc) {
            Class<?> candidate = type.getRawClass();

            // Concrete implementations (like ExternalParser, HtmlParser) are not handled here;
            // they use normal Jackson bean deserialization for their properties.
            boolean abstractType =
                    candidate.isInterface() || Modifier.isAbstract(candidate.getModifiers());
            if (!abstractType) {
                return null;
            }

            boolean compactFormat =
                    ComponentNameResolver.getContextKeyInterfaces().contains(candidate)
                            || ComponentNameResolver.usesCompactFormat(candidate);
            return compactFormat ? new TikaComponentDeserializer(candidate) : null;
        }
    }

    /**
     * Serializer lookup for Tika component types.
     * <p>
     * {@link DefaultParser} and {@link DefaultDetector} get their dedicated serializers.
     * {@link ParserDecorator.MimeFilteringDecorator} and any type that both uses the compact
     * format and has a friendly name get a {@link TikaComponentSerializer}. All other types
     * yield {@code null}.
     */
    private static class TikaSerializers extends Serializers.Base {
        @Override
        public JsonSerializer<?> findSerializer(SerializationConfig config, JavaType type,
                                                  BeanDescription beanDesc) {
            Class<?> candidate = type.getRawClass();

            // SPI composite types have their own serializers.
            if (candidate == DefaultParser.class) {
                return new DefaultParserSerializer();
            }
            if (candidate == DefaultDetector.class) {
                return new DefaultDetectorSerializer();
            }

            // The mime-filtering decorator is matched by exact class; the component serializer
            // unwraps it and writes the wrapped parser together with the mime filters.
            boolean mimeFiltering = candidate == ParserDecorator.MimeFilteringDecorator.class;
            if (mimeFiltering || hasCompactName(candidate)) {
                return new TikaComponentSerializer();
            }

            return null;
        }

        /**
         * Tells whether the type uses the compact format AND has a registered friendly name.
         */
        private static boolean hasCompactName(Class<?> candidate) {
            return ComponentNameResolver.usesCompactFormat(candidate)
                    && ComponentNameResolver.getFriendlyName(candidate) != null;
        }
    }

    /**
     * Deserializer that handles both string and object formats for Tika components.
     * <p>
     * Accepted inputs:
     * <ul>
     *   <li>a JSON string: the text is the component type name, with no config</li>
     *   <li>a JSON object: the first field name is the component type name and its value
     *       is the config node; any further fields are not read</li>
     *   <li>an empty JSON object: the expected type itself is instantiated through its
     *       no-arg constructor</li>
     * </ul>
     * Named components are created by {@link ComponentInstantiator#instantiateComponent}.
     */
    private static class TikaComponentDeserializer extends JsonDeserializer<Object> {
        private final Class<?> expectedType;

        /**
         * @param expectedType the declared type the deserialized component must satisfy
         */
        TikaComponentDeserializer(Class<?> expectedType) {
            this.expectedType = expectedType;
        }

        /**
         * Reads one component from the parser.
         *
         * @param p    parser positioned at the component value
         * @param ctxt Jackson deserialization context (not used)
         * @return the instantiated component
         * @throws IOException if no shared mapper is configured, the JSON value is neither a
         *                     string nor an object, an empty object cannot be turned into an
         *                     instance of the expected type, or instantiation fails with a
         *                     {@link TikaConfigException}
         */
        @Override
        public Object deserialize(JsonParser p, DeserializationContext ctxt) throws IOException {
            JsonNode node = p.readValueAsTree();

            ObjectMapper mapper = sharedMapper;
            if (mapper == null) {
                throw new IOException("Shared ObjectMapper not configured. " +
                        "Call TikaModule.setSharedMapper() before deserializing.");
            }

            String typeName;
            JsonNode configNode;

            if (node.isTextual()) {
                typeName = node.asText();
                configNode = null;
            } else if (node.isObject()) {
                Iterator<Map.Entry<String, JsonNode>> fields = node.fields();
                if (!fields.hasNext()) {
                    return instantiateExpectedType();
                }
                // Only the first entry is consulted: key = type name, value = config.
                Map.Entry<String, JsonNode> entry = fields.next();
                typeName = entry.getKey();
                configNode = entry.getValue();
            } else {
                throw new IOException("Expected string or object for " +
                        expectedType.getSimpleName() + ", got: " + node.getNodeType());
            }

            try {
                return ComponentInstantiator.instantiateComponent(typeName, configNode,
                        mapper, Thread.currentThread().getContextClassLoader(), expectedType);
            } catch (TikaConfigException e) {
                throw new IOException(e.getMessage(), e);
            }
        }

        /**
         * Instantiates the expected type through its no-arg constructor (empty-object input).
         * The reflective failure is kept as the cause so the real reason stays visible.
         */
        private Object instantiateExpectedType() throws IOException {
            try {
                return expectedType.getDeclaredConstructor().newInstance();
            } catch (ReflectiveOperationException e) {
                throw new IOException("Empty object for abstract type " +
                        expectedType.getSimpleName() + " - specify a concrete type name", e);
            }
        }
    }

    /**
     * Serializer that produces compact output for Tika components.
     * <p>
     * Writes a plain string (the type name) when no property differs from the defaults,
     * otherwise an object whose single key is the type name and whose value holds the
     * differing properties. A {@link ParserDecorator.MimeFilteringDecorator} is unwrapped:
     * the wrapped parser is serialized and non-empty include/exclude type sets are added
     * as {@code _mime-include} / {@code _mime-exclude} arrays.
     */
    private static class TikaComponentSerializer extends JsonSerializer<Object> {

        TikaComponentSerializer() {
        }

        /**
         * Writes the component in compact form.
         *
         * @param value       the component (or mime-filtering decorator) to write
         * @param gen         generator receiving the output
         * @param serializers Jackson serializer provider (not used)
         * @throws IOException if writing fails or the default-comparison reflection fails
         */
        @Override
        public void serialize(Object value, JsonGenerator gen, SerializerProvider serializers)
                throws IOException {
            // Handle MimeFilteringDecorator specially for round-trip
            Set<MediaType> includeTypes = null;
            Set<MediaType> excludeTypes = null;
            Object component = value;
            if (value instanceof ParserDecorator.MimeFilteringDecorator mfd) {
                includeTypes = mfd.getIncludeTypes();
                excludeTypes = mfd.getExcludeTypes();
                component = mfd.getWrappedParser();
            }

            // Friendly name when registered, fully-qualified class name otherwise.
            String typeName = ComponentNameResolver.getFriendlyName(component.getClass());
            if (typeName == null) {
                typeName = component.getClass().getName();
            }

            // The mapper is only used to create tree nodes here. Fall back to the plain mapper
            // rather than failing on a cast when the generator's codec is absent or of another type.
            ObjectMapper mapper = gen.getCodec() instanceof ObjectMapper codecMapper
                    ? codecMapper
                    : TikaObjectMapperFactory.getPlainMapper();

            // Get configured properties (only non-default values)
            ObjectNode configNode = getConfiguredProperties(component, mapper);

            // Add mime filter fields if present
            if (includeTypes != null && !includeTypes.isEmpty()) {
                configNode.set("_mime-include", mimeTypesToArray(includeTypes, mapper));
            }
            if (excludeTypes != null && !excludeTypes.isEmpty()) {
                configNode.set("_mime-exclude", mimeTypesToArray(excludeTypes, mapper));
            }

            if (configNode.isEmpty()) {
                // No config differences - output simple string
                gen.writeString(typeName);
            } else {
                // Has config - output object with type as key
                gen.writeStartObject();
                gen.writeObjectField(typeName, configNode);
                gen.writeEndObject();
            }
        }

        /**
         * Converts the media types to a JSON array of their string forms.
         */
        private JsonNode mimeTypesToArray(Set<MediaType> types, ObjectMapper mapper) {
            var arrayNode = mapper.createArrayNode();
            for (MediaType type : types) {
                arrayNode.add(type.toString());
            }
            return arrayNode;
        }

        /**
         * Collects the properties that differ from a default-constructed instance.
         * <p>
         * When the component exposes a public {@code getConfig()} method, the returned config
         * object is compared against a default instance of its class (a {@code null} config
         * yields an empty node). Otherwise the component itself is compared against a default
         * instance of its own class.
         *
         * @throws IOException if invoking {@code getConfig()} or creating the default
         *                     instance fails reflectively
         */
        private ObjectNode getConfiguredProperties(Object value, ObjectMapper mapper) throws IOException {
            try {
                // Check for getConfig() method (common pattern for config objects)
                Method getConfigMethod = findGetConfigMethod(value.getClass());
                if (getConfigMethod == null) {
                    // No config object - diff the component directly
                    return diffFromDefaults(value, mapper);
                }

                Object config = getConfigMethod.invoke(value);
                if (config == null) {
                    return mapper.createObjectNode();
                }
                return diffFromDefaults(config, mapper);
            } catch (ReflectiveOperationException e) {
                throw new IOException("Failed to serialize config", e);
            }
        }

        /**
         * Serializes {@code actual} and a no-arg-constructed instance of the same class with
         * the plain mapper and returns only the fields whose values differ.
         */
        private ObjectNode diffFromDefaults(Object actual, ObjectMapper mapper)
                throws ReflectiveOperationException {
            Object defaults = actual.getClass().getDeclaredConstructor().newInstance();

            var plainMapper = TikaObjectMapperFactory.getPlainMapper();
            ObjectNode actualNode = plainMapper.valueToTree(actual);
            ObjectNode defaultNode = plainMapper.valueToTree(defaults);

            ObjectNode result = mapper.createObjectNode();
            Iterator<Map.Entry<String, JsonNode>> fields = actualNode.fields();
            while (fields.hasNext()) {
                Map.Entry<String, JsonNode> field = fields.next();
                // A field missing from the defaults (null lookup) counts as different.
                JsonNode defaultValue = defaultNode.get(field.getKey());
                if (!field.getValue().equals(defaultValue)) {
                    result.set(field.getKey(), field.getValue());
                }
            }
            return result;
        }

        /**
         * Looks up a public no-arg {@code getConfig} method with a non-void return type.
         *
         * @return the method, or {@code null} when the class has no such method
         */
        private Method findGetConfigMethod(Class<?> clazz) {
            try {
                Method method = clazz.getMethod("getConfig");
                if (method.getReturnType() != void.class) {
                    return method;
                }
            } catch (NoSuchMethodException e) {
                // No getConfig method
            }
            return null;
        }
    }
}