// ComponentRegistry.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.tika.config.SelfConfiguring;
import org.apache.tika.exception.TikaConfigException;

/**
 * Registry for looking up Tika component classes by name.
 * Loads component name-to-class mappings from META-INF/tika/*.idx files
 * generated by the {@code @TikaComponent} annotation processor.
 * <p>
 * The registry tracks:
 * <ul>
 *   <li>Component class</li>
 *   <li>Whether the component is self-configuring (implements {@link SelfConfiguring})</li>
 *   <li>Optional explicit context key for ParseContext</li>
 *   <li>Whether the component is marked as a default implementation</li>
 * </ul>
 * <p>
 * Each non-blank, non-comment ({@code #}) line of an index file has the form
 * {@code component-name=fully.qualified.ClassName[:key=contextKeyClass][:default]}.
 * Every matching index file visible to the class loader is read; when the same
 * component name occurs more than once, the entry read last replaces earlier ones.
 * <p>
 * Modules that can't use @TikaComponent (due to dependency constraints) can provide
 * their own META-INF/tika/*.idx files to register components.
 */
public class ComponentRegistry {

    /**
     * Built-in aliases for external dependencies.
     * Maps component names to fully qualified class names.
     * <p>
     * NOTE(review): nothing in this class reads this map; lookups only consult the
     * entries loaded from the index files. Confirm whether it should be wired into
     * the lookup path or removed.
     */
    private static final Map<String, String> BUILTIN_ALIASES = createBuiltinAliases();

    /**
     * Builds the immutable alias table held in {@link #BUILTIN_ALIASES}.
     *
     * @return unmodifiable map of alias name to fully qualified class name
     */
    private static Map<String, String> createBuiltinAliases() {
        Map<String, String> aliases = new HashMap<>();
        // UnpackConfig is in tika-pipes-core which can't depend on tika-core for @TikaComponent
        aliases.put("unpack-config",
                "org.apache.tika.pipes.core.extractor.UnpackConfig");
        return Collections.unmodifiableMap(aliases);
    }

    /** Component name to component info, in the order the index entries were read. */
    private final Map<String, ComponentInfo> components;
    /** Reverse lookup: fully qualified class name to component name. */
    private final Map<String, String> classNameToFriendlyName;
    /** Class loader used both to locate index files and to load the listed classes. */
    private final ClassLoader classLoader;

    /**
     * Creates a component registry by loading the specified index file.
     *
     * @param indexFileName the index file name (e.g., "parsers", "detectors")
     *                      without the .idx extension
     * @param classLoader the class loader to use for loading classes; must not be null
     * @throws TikaConfigException if the index file cannot be found, read or parsed,
     *                             or if a class it names cannot be loaded
     * @throws NullPointerException if {@code classLoader} is null
     */
    public ComponentRegistry(String indexFileName, ClassLoader classLoader)
            throws TikaConfigException {
        // Fail fast with a clear message instead of a bare NPE from deep inside loading.
        if (classLoader == null) {
            throw new NullPointerException("classLoader must not be null");
        }
        this.classLoader = classLoader;
        this.components = loadComponents(indexFileName);
        // Build reverse lookup by class name (not Class object) to handle classloader differences
        this.classNameToFriendlyName = new HashMap<>();
        for (Map.Entry<String, ComponentInfo> entry : components.entrySet()) {
            classNameToFriendlyName.put(entry.getValue().componentClass().getName(), entry.getKey());
        }
    }

    /**
     * Looks up a component class by name.
     *
     * @param name the component name (e.g., "pdf-parser")
     * @return the component class
     * @throws TikaConfigException if the component name is not found
     */
    public Class<?> getComponentClass(String name) throws TikaConfigException {
        return getComponentInfo(name).componentClass();
    }

    /**
     * Looks up full component information by name.
     *
     * @param name the component name (e.g., "pdf-parser")
     * @return the component info including class, selfConfiguring flag, and contextKey
     * @throws TikaConfigException if the component name is not found
     */
    public ComponentInfo getComponentInfo(String name) throws TikaConfigException {
        ComponentInfo info = components.get(name);
        if (info == null) {
            throw new TikaConfigException("Unknown component name: '" + name + "'. " +
                    "Available components: " + components.keySet());
        }
        return info;
    }

    /**
     * Returns all registered components.
     *
     * @return unmodifiable map of component names to component info
     */
    public Map<String, ComponentInfo> getAllComponents() {
        return Collections.unmodifiableMap(components);
    }

    /**
     * Returns all components marked as defaults.
     *
     * @return unmodifiable map of component names to component info for default implementations
     */
    public Map<String, ComponentInfo> getDefaultComponents() {
        Map<String, ComponentInfo> defaults = new LinkedHashMap<>();
        for (Map.Entry<String, ComponentInfo> entry : components.entrySet()) {
            if (entry.getValue().isDefault()) {
                defaults.put(entry.getKey(), entry.getValue());
            }
        }
        return Collections.unmodifiableMap(defaults);
    }

    /**
     * Checks if a component with the given name is registered.
     *
     * @param name the component name
     * @return true if the component is registered
     */
    public boolean hasComponent(String name) {
        return components.containsKey(name);
    }

    /**
     * Looks up a component's friendly name by its class.
     * Uses class name (not Class object) for lookup to handle classloader differences.
     *
     * @param clazz the component class; may be null
     * @return the friendly name, or null if {@code clazz} is null or not registered
     */
    public String getFriendlyName(Class<?> clazz) {
        if (clazz == null) {
            return null;
        }
        return classNameToFriendlyName.get(clazz.getName());
    }

    /**
     * Reads every {@code META-INF/tika/<indexFileName>.idx} resource visible to the
     * class loader and merges their entries into a single map.
     *
     * @param indexFileName the index file name without the .idx extension
     * @return component name to component info, in the order entries were read
     * @throws TikaConfigException if no such resource exists, or a resource cannot be
     *                             enumerated, read or parsed
     */
    private Map<String, ComponentInfo> loadComponents(String indexFileName)
            throws TikaConfigException {
        Map<String, ComponentInfo> result = new LinkedHashMap<>();
        String resourcePath = "META-INF/tika/" + indexFileName + ".idx";

        try {
            Enumeration<URL> resources = classLoader.getResources(resourcePath);

            if (!resources.hasMoreElements()) {
                throw new TikaConfigException("Component index file not found: " + resourcePath);
            }

            while (resources.hasMoreElements()) {
                loadFromUrl(resources.nextElement(), result);
            }

        } catch (IOException e) {
            throw new TikaConfigException("Failed to load component index: " + resourcePath, e);
        }

        return result;
    }

    /**
     * Reads one index file as UTF-8 and adds each entry to {@code result}.
     * Blank lines and lines starting with {@code #} are skipped.
     *
     * @param url location of the index file
     * @param result map that receives the parsed entries
     * @throws TikaConfigException if the file cannot be read, a line is malformed,
     *                             or a named class cannot be loaded
     */
    private void loadFromUrl(URL url, Map<String, ComponentInfo> result) throws TikaConfigException {
        try (InputStream in = url.openStream();
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {

            String line;
            int lineNumber = 0;

            while ((line = reader.readLine()) != null) {
                lineNumber++;
                line = line.trim();

                // Skip comments and empty lines
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }

                parseEntry(line, url, lineNumber, result);
            }

        } catch (IOException e) {
            throw new TikaConfigException("Failed to read component index from: " + url, e);
        }
    }

    /**
     * Parses one trimmed index line of the form
     * {@code component-name=fully.qualified.ClassName[:key=contextKeyClass][:default]}
     * and stores the resulting entry in {@code result}.
     *
     * @param line the trimmed, non-empty, non-comment line
     * @param url location of the index file, used in error messages
     * @param lineNumber 1-based line number, used in error messages
     * @param result map that receives the parsed entry
     * @throws TikaConfigException if the line is malformed or a named class cannot be loaded
     */
    private void parseEntry(String line, URL url, int lineNumber,
                            Map<String, ComponentInfo> result) throws TikaConfigException {
        // The first '=' separates the name from the value; later ones belong to 'key=...'.
        int equalsIndex = line.indexOf('=');
        if (equalsIndex == -1) {
            throw invalidFormat(url, lineNumber, "expected 'name=class', got: " + line);
        }

        String name = line.substring(0, equalsIndex).trim();
        String value = line.substring(equalsIndex + 1).trim();
        if (name.isEmpty() || value.isEmpty()) {
            throw invalidFormat(url, lineNumber, "name or class is empty");
        }

        String className = value;
        String contextKeyClassName = null;
        boolean isDefault = false;

        // Parse suffixes (e.g., :key=SomeClass:default)
        int colonIndex = value.indexOf(':');
        if (colonIndex != -1) {
            // Trim so that whitespace around ':' is not mistaken for part of a class name.
            className = value.substring(0, colonIndex).trim();
            if (className.isEmpty()) {
                // e.g. "name=:default" - report as a format error, not a missing class
                throw invalidFormat(url, lineNumber, "name or class is empty");
            }

            for (String rawSuffix : value.substring(colonIndex + 1).split(":")) {
                String suffix = rawSuffix.trim();
                if (suffix.startsWith("key=")) {
                    contextKeyClassName = suffix.substring(4).trim();
                    if (contextKeyClassName.isEmpty()) {
                        throw invalidFormat(url, lineNumber,
                                "'key=' suffix is missing the context key class name");
                    }
                } else if (suffix.equals("default")) {
                    isDefault = true;
                } else if (!suffix.isEmpty()) {
                    throw invalidFormat(url, lineNumber,
                            "unknown suffix '" + suffix + "', expected 'key=...' or 'default'");
                }
            }
        }

        result.put(name, createComponentInfo(className, contextKeyClassName, isDefault, url));
    }

    /**
     * Loads the component class (and the context key class, if any) and wraps them
     * in a {@link ComponentInfo}. The self-configuring flag is derived from whether
     * the component class implements {@link SelfConfiguring}.
     *
     * @param className fully qualified component class name
     * @param contextKeyClassName fully qualified context key class name, or null if none
     * @param isDefault whether the entry carried the {@code default} suffix
     * @param url location of the index file, used in error messages
     * @return the populated component info
     * @throws TikaConfigException if either class cannot be found
     */
    private ComponentInfo createComponentInfo(String className, String contextKeyClassName,
                                              boolean isDefault, URL url)
            throws TikaConfigException {
        Class<?> clazz;
        try {
            clazz = classLoader.loadClass(className);
        } catch (ClassNotFoundException e) {
            throw new TikaConfigException(
                    "Component class not found: " + className + " (from " + url + ")", e);
        }
        boolean selfConfiguring = SelfConfiguring.class.isAssignableFrom(clazz);

        // Load the context key class if specified
        Class<?> contextKey = null;
        if (contextKeyClassName != null) {
            try {
                contextKey = classLoader.loadClass(contextKeyClassName);
            } catch (ClassNotFoundException e) {
                throw new TikaConfigException(
                        "Context key class not found: " + contextKeyClassName +
                        " (from " + url + ")", e);
            }
        }

        return new ComponentInfo(clazz, selfConfiguring, contextKey, isDefault);
    }

    /**
     * Builds the exception used for every malformed-line error so that all of them
     * share the same location prefix.
     *
     * @param url location of the index file
     * @param lineNumber 1-based line number of the offending line
     * @param detail description of what is wrong with the line
     * @return the exception to throw
     */
    private static TikaConfigException invalidFormat(URL url, int lineNumber, String detail) {
        return new TikaConfigException(
                "Invalid index file format at " + url + " line " + lineNumber + ": " + detail);
    }
}