TikaObjectMapperFactory.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.serialization.ComponentNameResolver;
import org.apache.tika.serialization.TikaModule;

/**
 * Factory for creating ObjectMappers configured for Tika serialization.
 * <p>
 * Configures strict validation settings and loads component registries
 * for friendly name resolution.
 */
public class TikaObjectMapperFactory {

    private static final Logger LOG = LoggerFactory.getLogger(TikaObjectMapperFactory.class);

    /**
     * Index file names for component registries.
     */
    private static final String[] REGISTRY_INDEX_FILES = {
            "parsers",
            "detectors",
            "encoding-detectors",
            "metadata-filters",
            "renderers",
            "translators",
            "digester-factories",
            "content-handler-factories",
            "parse-context"
    };

    private static ObjectMapper MAPPER = null;

    // Shared plain ObjectMapper (no TikaModule) for converting JsonNodes to JSON strings.
    // Needed because the main mapper may use a binary format (e.g., Smile)
    // which doesn't support writeValueAsString().
    private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();

    static {
        // Components with no bean properties (e.g., parsers with no configuration)
        // need to serialize as empty objects rather than throwing.
        PLAIN_MAPPER.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
    }

    /**
     * Returns a shared plain ObjectMapper without TikaModule registration.
     * <p>
     * This mapper is suitable for:
     * <ul>
     *   <li>Converting JsonNodes to JSON strings</li>
     *   <li>Serializing component properties without compact format wrapping</li>
     *   <li>Avoiding infinite recursion when serializing inside TikaModule</li>
     * </ul>
     * <p>
     * Has {@code FAIL_ON_EMPTY_BEANS} disabled to allow serialization of classes with no properties.
     *
     * @return the shared plain ObjectMapper
     */
    public static ObjectMapper getPlainMapper() {
        return PLAIN_MAPPER;
    }

    public static synchronized ObjectMapper getMapper() {
        if (MAPPER == null) {
            MAPPER = createMapper();
        }
        return MAPPER;
    }

    /**
     * Creates an ObjectMapper configured for Tika serialization.
     *
     * @return configured ObjectMapper
     */
    public static ObjectMapper createMapper() {
        return createMapper(null);
    }

    /**
     * Creates an ObjectMapper configured for Tika serialization with a custom JsonFactory.
     * <p>
     * This can be used to create mappers for binary formats like Smile:
     * <pre>
     * ObjectMapper smileMapper = TikaObjectMapperFactory.createMapper(new SmileFactory());
     * </pre>
     *
     * @param factory the JsonFactory to use, or null for default JSON
     * @return configured ObjectMapper
     */
    public static ObjectMapper createMapper(JsonFactory factory) {
        ObjectMapper mapper = (factory != null) ? new ObjectMapper(factory) : new ObjectMapper();

        // Allow comments in JSON config files (// and /* */ style)
        mapper.configure(JsonParser.Feature.ALLOW_COMMENTS, true);

        // Fail on unknown properties to catch configuration errors early
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true);

        // Prevent null values being assigned to primitive fields (int, boolean, etc.)
        mapper.configure(DeserializationFeature.FAIL_ON_NULL_FOR_PRIMITIVES, true);

        // Ensure enums are properly validated (not just numeric values)
        mapper.configure(DeserializationFeature.FAIL_ON_NUMBERS_FOR_ENUMS, true);

        // Catch duplicate keys in JSON objects
        mapper.configure(DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY, true);

        // Need to allow creation of classes without setters/getters -- we may want to revisit this
        mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false);

        // Load component registries for name resolution
        loadComponentRegistries();

        // Register TikaModule for compact serialization/deserialization of registered components
        TikaModule tikaModule = new TikaModule();
        mapper.registerModule(tikaModule);

        // Set the shared mapper for TikaModule's deserializers
        TikaModule.setSharedMapper(mapper);

        return mapper;
    }

    /**
     * Loads component registries for name resolution.
     * Registries are loaded from index files and registered with the ComponentNameResolver.
     * Missing registries are silently ignored (may not be on classpath).
     */
    private static void loadComponentRegistries() {
        ClassLoader classLoader = TikaObjectMapperFactory.class.getClassLoader();

        for (String indexFile : REGISTRY_INDEX_FILES) {
            try {
                ComponentRegistry registry = new ComponentRegistry(indexFile, classLoader);
                ComponentNameResolver.registerRegistry(indexFile, registry);
                LOG.debug("Loaded component registry: {}", indexFile);
            } catch (TikaConfigException e) {
                // Registry not available - this is expected if the module isn't on classpath
                LOG.debug("Component registry not available: {} - {}", indexFile, e.getMessage());
            }
        }
    }
}