ComponentNameResolver.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.serialization;

import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.config.loader.ComponentRegistry;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.extractor.UnpackSelector;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
import org.apache.tika.parser.Parser;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.sax.ContentHandlerDecoratorFactory;
import org.apache.tika.sax.ContentHandlerFactory;

/**
 * Utility class that resolves friendly component names to classes using ComponentRegistry.
 * <p>
 * Supports friendly names like "pdf-parser". Names are resolved exclusively through the
 * registries added via {@link #registerRegistry(String, ComponentRegistry)}; a name that no
 * registered registry recognizes is rejected rather than loaded reflectively, so registries
 * must be registered before use.
 * <p>
 * Also stores {@link ComponentConfig} registrations for top-level component loading.
 * <p>
 * All state is static. Registries and component configs are kept in
 * {@link ConcurrentHashMap} instances; the context key interface list is fixed at class
 * initialization.
 */
public final class ComponentNameResolver {

    /**
     * Context key interfaces in lookup priority order.
     * <p>
     * {@link #findContextKeyInterface(Class)} scans this array front to back and returns the
     * first match, so a type implementing several of these interfaces always resolves to the
     * one listed first. Iterating a hash-based set of {@code Class} objects instead would make
     * that choice depend on identity hash codes, which are not stable across JVM runs.
     */
    private static final Class<?>[] CONTEXT_KEY_PRIORITY = {
            Parser.class,
            Detector.class,
            EncodingDetector.class,
            MetadataFilter.class,
            Translator.class,
            Renderer.class,
            DigesterFactory.class,
            EmbeddedDocumentExtractorFactory.class,
            MetadataWriteLimiterFactory.class,
            ContentHandlerDecoratorFactory.class,
            ContentHandlerFactory.class,
            UnpackSelector.class
    };

    /**
     * Interfaces that use compact format serialization and serve as ParseContext keys.
     * Types implementing these interfaces will be serialized as:
     * - "type-name" for defaults
     * - {"type-name": {...}} for configured instances
     * <p>
     * Unmodifiable; holds exactly the entries of {@link #CONTEXT_KEY_PRIORITY}.
     */
    private static final Set<Class<?>> CONTEXT_KEY_INTERFACES;

    static {
        Set<Class<?>> keys = new HashSet<>();
        for (Class<?> key : CONTEXT_KEY_PRIORITY) {
            keys.add(key);
        }
        CONTEXT_KEY_INTERFACES = Collections.unmodifiableSet(keys);
    }

    // Registered component registries, keyed by index name
    private static final Map<String, ComponentRegistry> REGISTRIES = new ConcurrentHashMap<>();

    // Component configuration storage (keyed by JSON field name and by component class)
    private static final Map<String, ComponentConfig<?>> FIELD_TO_CONFIG = new ConcurrentHashMap<>();
    private static final Map<Class<?>, ComponentConfig<?>> CLASS_TO_CONFIG = new ConcurrentHashMap<>();

    private ComponentNameResolver() {
        // Utility class
    }

    /**
     * Registers a ComponentRegistry for name resolution.
     * <p>
     * Registering a second registry under the same index name replaces the first.
     *
     * @param indexName the index file name (e.g., "parsers", "detectors")
     * @param registry the registry to register
     */
    public static void registerRegistry(String indexName, ComponentRegistry registry) {
        REGISTRIES.put(indexName, registry);
    }

    /**
     * Resolves a component name to a Class.
     * <p>
     * Searches all registered component registries and returns the class from the first
     * registry that both reports the name and can supply its class. There is deliberately
     * no {@code Class.forName} fallback: a name that is not resolved through a registry is
     * rejected.
     *
     * @param name the component name to resolve
     * @param classLoader not used by the current implementation; kept in the signature so
     *                    existing callers continue to compile
     * @return the resolved class
     * @throws ClassNotFoundException if no registry resolves the name; when a registry
     *         reported the name but failed to supply its class, the most recent such
     *         failure is attached as the cause
     */
    public static Class<?> resolveClass(String name, ClassLoader classLoader)
            throws ClassNotFoundException {
        TikaConfigException lastFailure = null;
        for (ComponentRegistry registry : REGISTRIES.values()) {
            if (!registry.hasComponent(name)) {
                continue;
            }
            try {
                return registry.getComponentClass(name);
            } catch (TikaConfigException e) {
                // Keep looking in the remaining registries, but remember why this one
                // failed so the caller can diagnose it if nothing else resolves the name.
                lastFailure = e;
            }
        }
        throw new ClassNotFoundException(
                "Component '" + name + "' is not registered. " +
                "Components must be registered via @TikaComponent annotation or .idx file. " +
                "Arbitrary class names are not allowed for security reasons.",
                lastFailure);
    }

    /**
     * Gets the friendly name for a class, or null if not registered.
     *
     * @param clazz the class to look up
     * @return the friendly name from the first registry that knows the class,
     *         or null if not found
     */
    public static String getFriendlyName(Class<?> clazz) {
        for (ComponentRegistry registry : REGISTRIES.values()) {
            String friendlyName = registry.getFriendlyName(clazz);
            if (friendlyName != null) {
                return friendlyName;
            }
        }
        return null;
    }

    /**
     * Checks if a component with the given name is registered in any registry.
     *
     * @param name the component name to check
     * @return true if the component is registered
     */
    public static boolean hasComponent(String name) {
        for (ComponentRegistry registry : REGISTRIES.values()) {
            if (registry.hasComponent(name)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Gets the component info for a given friendly name.
     * <p>
     * Best-effort lookup: a registry that reports the name but fails to supply its info is
     * skipped, and the search continues with the remaining registries.
     *
     * @param name the friendly name to look up
     * @return Optional containing the ComponentInfo, or empty if not found
     */
    public static Optional<ComponentInfo> getComponentInfo(String name) {
        for (ComponentRegistry registry : REGISTRIES.values()) {
            if (!registry.hasComponent(name)) {
                continue;
            }
            try {
                return Optional.of(registry.getComponentInfo(name));
            } catch (TikaConfigException ignored) {
                // Intentionally ignored: absence is reported via Optional.empty(),
                // so try the next registry instead of failing the lookup.
            }
        }
        return Optional.empty();
    }

    /**
     * Checks if any registered component implements or extends the given abstract type.
     * <p>
     * This is used by TikaModule to determine if an abstract type (interface or abstract class)
     * should use compact component serialization.
     *
     * @param abstractType the abstract type to check
     * @return true if at least one registered component is assignable to this type
     */
    public static boolean hasImplementationsOf(Class<?> abstractType) {
        for (ComponentRegistry registry : REGISTRIES.values()) {
            for (ComponentInfo info : registry.getAllComponents().values()) {
                if (abstractType.isAssignableFrom(info.componentClass())) {
                    return true;
                }
            }
        }
        return false;
    }

    // ==================== Component Config Methods ====================

    /**
     * Registers a ComponentConfig for top-level component loading.
     * <p>
     * The config is indexed both by its JSON field name and by its component class;
     * an existing entry under either key is replaced.
     *
     * @param config the component configuration
     */
    public static <T> void registerComponentConfig(ComponentConfig<T> config) {
        FIELD_TO_CONFIG.put(config.getJsonField(), config);
        CLASS_TO_CONFIG.put(config.getComponentClass(), config);
    }

    /**
     * Gets component configuration by JSON field name.
     *
     * @param jsonField the JSON field name (e.g., "parsers")
     * @return the component config, or null if not registered
     */
    public static ComponentConfig<?> getComponentConfig(String jsonField) {
        return FIELD_TO_CONFIG.get(jsonField);
    }

    /**
     * Gets component configuration by component class.
     *
     * @param componentClass the component class (e.g., Parser.class)
     * @return the component config, or null if not registered
     */
    // The cast relies on registerComponentConfig storing each config under its own
    // getComponentClass() value.
    @SuppressWarnings("unchecked")
    public static <T> ComponentConfig<T> getComponentConfig(Class<T> componentClass) {
        return (ComponentConfig<T>) CLASS_TO_CONFIG.get(componentClass);
    }

    /**
     * Checks if a component config is registered for the given JSON field.
     *
     * @param jsonField the JSON field name to check
     * @return true if a config is registered under that field name
     */
    public static boolean hasComponentConfig(String jsonField) {
        return FIELD_TO_CONFIG.containsKey(jsonField);
    }

    /**
     * Checks if a component config is registered for the given class.
     *
     * @param componentClass the component class to check
     * @return true if a config is registered under that class
     */
    public static boolean hasComponentConfig(Class<?> componentClass) {
        return CLASS_TO_CONFIG.containsKey(componentClass);
    }

    /**
     * Gets all registered component JSON field names.
     *
     * @return unmodifiable view of the registered field names; being a view, it reflects
     *         registrations made after this call
     */
    public static Set<String> getComponentFields() {
        return Collections.unmodifiableSet(FIELD_TO_CONFIG.keySet());
    }

    // ==================== Context Key Resolution Methods ====================

    /**
     * Returns the set of interfaces that use compact format serialization.
     *
     * @return unmodifiable set of context key interfaces
     */
    public static Set<Class<?>> getContextKeyInterfaces() {
        return CONTEXT_KEY_INTERFACES;
    }

    /**
     * Finds the appropriate context key interface for a given type.
     * This is used to determine which interface should be used as the ParseContext key
     * when storing instances of this type.
     * <p>
     * Candidates are checked in a fixed order, so a type that implements more than one
     * context key interface always resolves to the same one.
     *
     * @param type the type to find the context key for
     * @return the interface to use as context key, or null if none found
     */
    public static Class<?> findContextKeyInterface(Class<?> type) {
        for (Class<?> iface : CONTEXT_KEY_PRIORITY) {
            if (iface.isAssignableFrom(type)) {
                return iface;
            }
        }
        return null;
    }

    /**
     * Checks if a type should use compact format serialization.
     * Returns true if the type implements any of the registered context key interfaces.
     *
     * @param type the type to check
     * @return true if the type uses compact format
     */
    public static boolean usesCompactFormat(Class<?> type) {
        return findContextKeyInterface(type) != null;
    }

    /**
     * Determines the ParseContext key for a component.
     * <p>
     * Resolution order:
     * <ol>
     *   <li>Explicit contextKey from .idx file (via @TikaComponent annotation)</li>
     *   <li>Auto-detect from implemented interfaces (using CONTEXT_KEY_INTERFACES)</li>
     *   <li>Fall back to the component class itself</li>
     * </ol>
     *
     * @param info the component info
     * @return the class to use as ParseContext key
     */
    public static Class<?> determineContextKey(ComponentInfo info) {
        if (info.contextKey() != null) {
            return info.contextKey();
        }
        Class<?> interfaceKey = findContextKeyInterface(info.componentClass());
        if (interfaceKey != null) {
            return interfaceKey;
        }
        return info.componentClass();
    }

    /**
     * Gets the contextKey for a class from the component registry.
     * The contextKey is recorded in the .idx file by the annotation processor.
     * <p>
     * The answer comes from the first registry that knows the class and can supply its
     * info, even if that info carries no contextKey.
     *
     * @param clazz the class to check
     * @return the contextKey class if specified, or null if not registered or no contextKey
     */
    public static Class<?> getContextKey(Class<?> clazz) {
        for (ComponentRegistry registry : REGISTRIES.values()) {
            String friendlyName = registry.getFriendlyName(clazz);
            if (friendlyName == null) {
                continue;
            }
            try {
                return registry.getComponentInfo(friendlyName).contextKey();
            } catch (TikaConfigException ignored) {
                // Intentionally ignored: "no context key" is reported as null,
                // so try the next registry instead of failing the lookup.
            }
        }
        return null;
    }
}