ParseContextDeserializer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.serialization.serdes;

import static org.apache.tika.serialization.serdes.ParseContextSerializer.PARSE_CONTEXT;
import static org.apache.tika.serialization.serdes.ParseContextSerializer.TYPED;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.serialization.ComponentNameResolver;

/**
 * Deserializes ParseContext from JSON.
 * <p>
 * Handles two types of entries:
 * <ul>
 *   <li>"typed" section: Deserialized directly to typed objects in the context map</li>
 *   <li>Other entries: Stored as JSON configs for lazy resolution</li>
 * </ul>
 * <p>
 * Example input:
 * <pre>
 * {
 *   "typed": {
 *     "handler-config": {"type": "XML", "parseMode": "RMETA"}
 *   },
 *   "metadata-filters": ["mock-upper-case-filter"]
 * }
 * </pre>
 */
public class ParseContextDeserializer extends JsonDeserializer<ParseContext> {

    private static final Logger LOG = LoggerFactory.getLogger(ParseContextDeserializer.class);

    /**
     * Returns the plain mapper from {@link TikaObjectMapperFactory}, used here to render
     * config nodes as JSON strings.
     */
    private static ObjectMapper plainMapper() {
        return TikaObjectMapperFactory.getPlainMapper();
    }

    /**
     * Reads the current value as a tree and converts it to a {@link ParseContext}.
     *
     * @param jsonParser the parser positioned at the ParseContext value; its codec must be
     *                   an {@link ObjectMapper}
     * @param ctxt       the deserialization context (not used)
     * @return the deserialized ParseContext
     * @throws IOException if the parser's codec is not an ObjectMapper, if deserialization
     *                     fails, or if duplicate context keys are detected
     */
    @Override
    public ParseContext deserialize(JsonParser jsonParser, DeserializationContext ctxt)
            throws IOException {
        // The typed section needs a full ObjectMapper. Check the codec before consuming any
        // input so a misconfigured caller gets a descriptive error instead of a bare
        // ClassCastException.
        Object codec = jsonParser.getCodec();
        if (!(codec instanceof ObjectMapper)) {
            throw new IOException("ParseContext deserialization requires an ObjectMapper codec, " +
                    "but the parser's codec is " +
                    (codec == null ? "null" : codec.getClass().getName()));
        }
        JsonNode root = jsonParser.readValueAsTree();
        return readParseContext(root, (ObjectMapper) codec);
    }

    /**
     * Deserializes a ParseContext from a JsonNode.
     * <p>
     * If the node has a field named by {@code PARSE_CONTEXT}, that field's value is used as
     * the context node; otherwise the node itself is used. A {@code null} node, or a context
     * node that is not a JSON object, yields an empty ParseContext.
     * <p>
     * The "typed" section is deserialized directly to typed objects in the context map.
     * All other fields are stored as JSON config strings for lazy resolution.
     * <p>
     * Duplicate detection is performed within a single document: if multiple entries
     * resolve to the same context key (e.g., both "bouncy-castle-digester" and
     * "commons-digester" resolve to DigesterFactory), an IOException is thrown.
     *
     * @param jsonNode the JSON node containing the ParseContext data; may be {@code null}
     * @param mapper   the ObjectMapper for deserializing typed objects
     * @return the deserialized ParseContext, never {@code null}
     * @throws IOException if deserialization fails or duplicate context keys are detected
     */
    public static ParseContext readParseContext(JsonNode jsonNode, ObjectMapper mapper)
            throws IOException {
        ParseContext parseContext = new ParseContext();

        // Nothing to read: treat a missing node like any other non-object input.
        if (jsonNode == null) {
            return parseContext;
        }

        // Handle optional wrapper: { "parse-context": {...} }
        JsonNode contextNode = jsonNode.get(PARSE_CONTEXT);
        if (contextNode == null) {
            contextNode = jsonNode;
        }

        if (!contextNode.isObject()) {
            return parseContext;
        }

        // Track context keys to detect duplicates within this document.
        // Maps contextKey -> friendlyName for error messages.
        Map<Class<?>, String> seenContextKeys = new HashMap<>();

        Iterator<String> fieldNames = contextNode.fieldNames();
        while (fieldNames.hasNext()) {
            String name = fieldNames.next();
            JsonNode value = contextNode.get(name);

            if (TYPED.equals(name)) {
                // Deserialize typed objects directly to context map
                deserializeTypedObjects(value, parseContext, mapper, seenContextKeys);
                continue;
            }

            // Check for duplicate context key before storing
            checkForDuplicateContextKey(name, seenContextKeys);

            // Store as JSON config for lazy resolution.
            // Use plain JSON mapper since the main mapper may be binary (Smile).
            parseContext.setJsonConfig(name, plainMapper().writeValueAsString(value));
        }

        return parseContext;
    }

    /**
     * Checks if a JSON config entry would create a duplicate context key.
     * <p>
     * Looks up the friendly name in the component registry to determine its context key,
     * then checks if that key has already been seen in this document. Names that are not
     * registered, and components reported as self-configuring, are skipped.
     *
     * @param friendlyName    the friendly name of the config entry
     * @param seenContextKeys map of already-seen context keys to their friendly names
     * @throws IOException if a duplicate context key is detected
     */
    private static void checkForDuplicateContextKey(String friendlyName,
                                                     Map<Class<?>, String> seenContextKeys)
            throws IOException {
        Optional<ComponentInfo> infoOpt = ComponentNameResolver.getComponentInfo(friendlyName);
        if (infoOpt.isEmpty()) {
            // Not a registered component - can't check for duplicates, that's okay
            return;
        }

        ComponentInfo info = infoOpt.get();

        // Self-configuring components (e.g., parsers) stay as JSON configs and are
        // accessed by string key at runtime via ParseContextConfig.getConfig().
        // They never get resolved to typed objects in the context map, so multiple
        // self-configuring components with the same context key are not duplicates.
        if (info.selfConfiguring()) {
            return;
        }

        registerContextKey(ComponentNameResolver.determineContextKey(info), friendlyName,
                seenContextKeys);
    }

    /**
     * Records that {@code friendlyName} claims {@code contextKey}, failing if another entry
     * in the same document already claimed it.
     *
     * @param contextKey      the context key the entry resolves to
     * @param friendlyName    the name of the entry claiming the key
     * @param seenContextKeys map of already-seen context keys to their friendly names;
     *                        updated when the key is new
     * @throws IOException if the key was already claimed by a previous entry
     */
    private static void registerContextKey(Class<?> contextKey, String friendlyName,
                                           Map<Class<?>, String> seenContextKeys)
            throws IOException {
        String existingName = seenContextKeys.putIfAbsent(contextKey, friendlyName);
        if (existingName != null) {
            throw new IOException("Duplicate parse-context entries resolve to the same key " +
                    contextKey.getName() + ": '" + existingName + "' and '" + friendlyName + "'");
        }
    }

    /**
     * Deserializes the "typed" section into typed objects in the context map.
     * <p>
     * Every entry must be a registered component; unknown names are rejected. If converting
     * an entry to its component class fails, the entry is stored as a JSON config under its
     * component name instead and a warning is logged.
     *
     * @param typedNode       the JSON node containing typed objects; ignored unless it is a
     *                        JSON object
     * @param parseContext    the ParseContext to add objects to
     * @param mapper          the ObjectMapper for deserializing
     * @param seenContextKeys map tracking context keys to their friendly names
     *                        (for duplicate detection)
     * @throws IOException if a component is unknown, duplicate context keys are detected,
     *                     or the JSON-config fallback cannot be written
     */
    private static void deserializeTypedObjects(JsonNode typedNode, ParseContext parseContext,
                                                 ObjectMapper mapper,
                                                 Map<Class<?>, String> seenContextKeys)
            throws IOException {
        if (!typedNode.isObject()) {
            return;
        }

        Iterator<String> fieldNames = typedNode.fieldNames();
        while (fieldNames.hasNext()) {
            String componentName = fieldNames.next();
            JsonNode configNode = typedNode.get(componentName);

            Class<?> configClass = null;
            Class<?> contextKeyClass = null;

            // Component registry lookup (for friendly names like "pdf-parser-config")
            Optional<ComponentInfo> infoOpt = ComponentNameResolver.getComponentInfo(componentName);
            if (infoOpt.isPresent()) {
                ComponentInfo info = infoOpt.get();
                configClass = info.componentClass();
                contextKeyClass = info.contextKey();
            }

            // If not found in registry, reject - components must be registered
            if (configClass == null) {
                throw new IOException("Unknown typed component '" + componentName + "'. " +
                        "Components must be registered via @TikaComponent annotation or .idx file.");
            }

            // Determine context key: explicit > interface detection > class itself
            Class<?> parseContextKey = contextKeyClass;
            if (parseContextKey == null) {
                parseContextKey = ComponentNameResolver.findContextKeyInterface(configClass);
            }
            if (parseContextKey == null) {
                parseContextKey = configClass;
            }

            // Claim the context key before deserializing, so a duplicate is reported even if
            // this entry later falls back to a JSON config.
            registerContextKey(parseContextKey, componentName, seenContextKeys);

            // Deserialize and add to context
            try {
                Object config = mapper.treeToValue(configNode, configClass);
                // The key's type parameter is only known at runtime, so the cast cannot be
                // checked statically; the suppression is limited to this one local.
                @SuppressWarnings("unchecked")
                Class<Object> typedKey = (Class<Object>) parseContextKey;
                parseContext.set(typedKey, config);
                LOG.debug("Deserialized typed object '{}' -> {} (contextKey={})",
                        componentName, configClass.getName(), parseContextKey.getName());
            } catch (Exception e) {
                // Deliberate best-effort: keep the raw config rather than failing the whole
                // document when a single typed entry cannot be converted.
                LOG.warn("Failed to deserialize typed component '{}' as {}, storing as JSON config",
                        componentName, configClass.getName(), e);
                // Use plain JSON mapper since main mapper may be binary (Smile)
                parseContext.setJsonConfig(componentName, plainMapper().writeValueAsString(configNode));
            }
        }
    }
}