ParseContextUtils.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.serialization;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.config.loader.ComponentInstantiator;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;

/**
 * Utility methods for working with ParseContext objects in JSON-based configurations.
 * <p>
 * Uses friendly-name format for configuration:
 * <pre>
 * "parse-context": {
 *   "timeout-limits": {
 *     "progressTimeoutMillis": 60000
 *   },
 *   "pdf-parser": {
 *     "extractInlineImages": true
 *   }
 * }
 * </pre>
 * <p>
 * Components that implement {@link org.apache.tika.config.SelfConfiguring} are skipped
 * during resolution - they read their own config from jsonConfigs at runtime.
 */
public class ParseContextUtils {

    private static final Logger LOG = LoggerFactory.getLogger(ParseContextUtils.class);
    private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper();

    /**
     * Known array-valued config keys and how each one is bound into the ParseContext.
     * Key: config name (e.g., "metadata-filters")
     * Value: (contextKey, componentInterface)
     */
    private static final Map<String, ArrayConfigInfo> ARRAY_CONFIGS = Map.of(
            "metadata-filters", new ArrayConfigInfo(MetadataFilter.class, MetadataFilter.class)
    );

    /**
     * Holds information about how to process array configs.
     *
     * @param contextKey the class under which the composite is stored in the ParseContext
     * @param componentInterface the type every element of the array must be an instance of
     */
    private record ArrayConfigInfo(Class<?> contextKey, Class<?> componentInterface) {}

    /**
     * Resolves all JSON configs from ParseContext and adds them to the resolved cache.
     * <p>
     * Resolution runs in two passes over the names present in jsonConfigs:
     * <ol>
     *   <li>Known array configs (e.g., "metadata-filters") are turned into a composite
     *       component via {@code resolveArrayConfig}.</li>
     *   <li>Every remaining name is looked up in ComponentNameResolver, deserialized from
     *       its JSON, cached in resolvedConfigs and stored in the context.</li>
     * </ol>
     * <p>
     * Names that are already resolved, that are not found by ComponentNameResolver, or whose
     * component is flagged as self-configuring are skipped. Array config names are handled
     * only by the first pass, even when the array produced no composite (e.g., it was empty).
     * <p>
     * The ParseContext key for a single component is whatever
     * {@code ComponentNameResolver.determineContextKey} returns for it.
     *
     * @param context the ParseContext to populate; a {@code null} context is a no-op
     * @param classLoader the ClassLoader to use for loading component classes
     * @throws TikaConfigException if a config cannot be parsed, deserialized or instantiated,
     *         or if an array config contains an element of the wrong type
     */
    public static void resolveAll(ParseContext context, ClassLoader classLoader)
            throws TikaConfigException {
        if (context == null) {
            return;
        }

        Map<String, JsonConfig> jsonConfigs = context.getJsonConfigs();
        if (jsonConfigs == null || jsonConfigs.isEmpty()) {
            return;
        }

        // Both passes walk the same snapshot of names, so neither depends on the live
        // key set while the context is being updated.
        List<String> friendlyNames = new ArrayList<>(jsonConfigs.keySet());

        // First, process known array configs (e.g., "metadata-filters")
        // These don't depend on the parse-context registry
        for (String friendlyName : friendlyNames) {
            if (!ARRAY_CONFIGS.containsKey(friendlyName)) {
                continue;
            }
            JsonConfig jsonConfig = jsonConfigs.get(friendlyName);
            if (jsonConfig != null) {
                resolveArrayConfig(friendlyName, jsonConfig, context, classLoader);
            }
        }

        // Then, try to resolve single component configs using ComponentNameResolver
        for (String friendlyName : friendlyNames) {
            // Array configs belong to the first pass only; an unresolved (e.g., empty)
            // array must never be deserialized as if it were a single component.
            if (ARRAY_CONFIGS.containsKey(friendlyName)) {
                continue;
            }

            JsonConfig jsonConfig = jsonConfigs.get(friendlyName);
            if (jsonConfig == null) {
                continue;
            }

            // Skip already resolved configs
            if (context.getResolvedConfig(friendlyName) != null) {
                continue;
            }

            resolveSingleConfig(friendlyName, jsonConfig, context);
        }
    }

    /**
     * Resolves one non-array config entry: looks the name up, deserializes the JSON into the
     * component class and stores the instance in the context.
     *
     * @param friendlyName the config name to resolve
     * @param jsonConfig the JSON configuration for that name
     * @param context the ParseContext to add the resolved component to
     * @throws TikaConfigException if the JSON cannot be deserialized into the component class
     */
    private static void resolveSingleConfig(String friendlyName, JsonConfig jsonConfig,
                                            ParseContext context)
            throws TikaConfigException {
        // Try to find this friendly name in any registered component registry
        var optionalInfo = ComponentNameResolver.getComponentInfo(friendlyName);
        if (optionalInfo.isEmpty()) {
            // Not a registered component - that's okay, might be used for something else
            LOG.debug("'{}' not found in any component registry, skipping", friendlyName);
            return;
        }

        ComponentInfo info = optionalInfo.get();

        // Skip self-configuring components - they handle their own config
        if (info.selfConfiguring()) {
            LOG.debug("'{}' is self-configuring, skipping resolution", friendlyName);
            return;
        }

        // Determine the context key
        Class<?> contextKey = ComponentNameResolver.determineContextKey(info);

        try {
            // Deserialize and cache in resolvedConfigs, also add to context
            Object instance = MAPPER.readValue(jsonConfig.json(), info.componentClass());
            context.setResolvedConfig(friendlyName, instance);
            storeInContext(context, contextKey, instance);

            LOG.debug("Resolved '{}' -> {} with key {}",
                    friendlyName, info.componentClass().getName(), contextKey.getName());
        } catch (IOException e) {
            throw new TikaConfigException("Failed to deserialize component '" +
                    friendlyName + "' of type " + info.componentClass().getName(), e);
        }
    }

    /**
     * Stores {@code value} in the context under {@code key}.
     * <p>
     * The key is only known as {@code Class<?>} at the call sites, so the value is cast
     * unchecked to the key's type; no runtime type check is performed here.
     *
     * @param context the ParseContext to update
     * @param key the class to store the value under
     * @param value the value to store
     * @param <T> the type captured from {@code key}
     */
    @SuppressWarnings("unchecked")
    private static <T> void storeInContext(ParseContext context, Class<T> key, Object value) {
        context.set(key, (T) value);
    }

    /**
     * Resolves an array config entry (e.g., "metadata-filters") to a composite component.
     * <p>
     * The array can contain either strings (friendly names) or objects:
     * <pre>
     * ["filter-name-1", "filter-name-2"]              // String shorthand
     * [{"filter-name-1": {}}, {"filter-name-2": {}}]  // Object format
     * </pre>
     * <p>
     * Every instantiated element must be an instance of the component interface registered
     * for {@code configName}; otherwise the config is rejected.
     *
     * @param configName the config name (e.g., "metadata-filters")
     * @param jsonConfig the JSON configuration (should be an array)
     * @param context the ParseContext to add the resolved component to
     * @param classLoader the ClassLoader to use for loading component classes
     * @return true if a composite was created and stored in the context; false if
     *         {@code configName} is not a known array config, the array is empty, or no
     *         composite could be created
     * @throws TikaConfigException if the JSON is not an array, cannot be parsed, contains an
     *         item in an unexpected format, or yields a component of the wrong type
     */
    private static boolean resolveArrayConfig(String configName, JsonConfig jsonConfig,
                                              ParseContext context, ClassLoader classLoader)
            throws TikaConfigException {
        ArrayConfigInfo configInfo = ARRAY_CONFIGS.get(configName);
        if (configInfo == null) {
            return false;
        }

        try {
            JsonNode arrayNode = MAPPER.readTree(jsonConfig.json());
            if (!arrayNode.isArray()) {
                throw new TikaConfigException("Expected array for '" + configName +
                        "', got: " + arrayNode.getNodeType());
            }

            List<Object> components = new ArrayList<>();

            for (JsonNode item : arrayNode) {
                String typeName;
                JsonNode configNode;

                if (item.isTextual()) {
                    // String shorthand: "component-name"
                    typeName = item.asText();
                    configNode = MAPPER.createObjectNode();
                } else if (item.isObject() && item.size() == 1) {
                    // Object format: {"component-name": {...}}
                    typeName = item.fieldNames().next();
                    configNode = item.get(typeName);
                } else {
                    throw new TikaConfigException("Unexpected item format in '" +
                            configName + "': " + item);
                }

                Object component = ComponentInstantiator.instantiate(
                        typeName, configNode, MAPPER, classLoader);

                // Reject wrong-typed (or null) components here, while the offending name is
                // still known, instead of letting them into the unchecked list cast used to
                // build the composite.
                if (!configInfo.componentInterface().isInstance(component)) {
                    String actualType =
                            component == null ? "null" : component.getClass().getName();
                    throw new TikaConfigException("Component '" + typeName + "' in '" +
                            configName + "' must be an instance of " +
                            configInfo.componentInterface().getName() + ", got: " + actualType);
                }

                components.add(component);
                LOG.debug("Instantiated '{}' for '{}'", typeName, configName);
            }

            // Create the composite and add to ParseContext
            if (!components.isEmpty()) {
                Object composite = createComposite(configName, components, configInfo);
                if (composite != null) {
                    context.setResolvedConfig(configName, composite);
                    storeInContext(context, configInfo.contextKey(), composite);
                    LOG.debug("Resolved '{}' -> {} with {} components",
                            configName, composite.getClass().getSimpleName(), components.size());
                    return true;
                }
            }
        } catch (IOException e) {
            throw new TikaConfigException("Failed to parse array config '" +
                    configName + "'", e);
        }

        return false;
    }

    /**
     * Creates a composite component from a list of individual components.
     * <p>
     * Only {@link MetadataFilter} arrays are supported; they are wrapped in a
     * {@link CompositeMetadataFilter}. The list cast below is unchecked, so callers are
     * expected to pass only elements that are instances of
     * {@code configInfo.componentInterface()}.
     *
     * @param configName the config name (for log messages)
     * @param components the list of components
     * @param configInfo the array config info
     * @return the composite component, or null if no composite type is known for the
     *         component interface
     */
    @SuppressWarnings("unchecked")
    private static Object createComposite(String configName, List<Object> components,
                                          ArrayConfigInfo configInfo) {
        // Handle known composite types
        if (configInfo.componentInterface() == MetadataFilter.class) {
            List<MetadataFilter> filters = (List<MetadataFilter>) (List<?>) components;
            return new CompositeMetadataFilter(filters);
        }

        // Add more composite types as needed
        LOG.warn("No composite factory for '{}'", configName);
        return null;
    }
}