// ParseContextConfig.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config;

import java.io.IOException;
import java.lang.reflect.Method;
import java.util.Locale;

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.parser.ParseContext;

/**
 * Facade for accessing runtime configuration from ParseContext's jsonConfigs.
 * <p>
 * This wrapper provides a safe way for parsers to access runtime configuration
 * without directly depending on tika-serialization. It performs these critical checks:
 * <ul>
 *   <li>If ParseContext has JSON config for the requested key but ConfigDeserializer
 *       is not on the classpath, throws TikaConfigException with a clear error message</li>
 *   <li>If ConfigDeserializer is available, delegates to it for deserialization</li>
 *   <li>If no config is present, returns the default config</li>
 * </ul>
 * <p>
 * Usage in parsers:
 * <pre>
 * PDFParserConfig localConfig = ParseContextConfig.getConfig(
 *     context, "pdf-parser", PDFParserConfig.class, defaultConfig);
 * </pre>
 *
 * @since Apache Tika 4.0
 */
public class ParseContextConfig {

    /**
     * The {@code org.apache.tika.serialization.ConfigDeserializer} class, or
     * {@code null} when it could not be found on the classpath.
     */
    private static final Class<?> CONFIG_DESERIALIZER_CLASS;

    /**
     * The public {@code getConfig(ParseContext, String, Class, Object)} method of
     * ConfigDeserializer, or {@code null} when either the class is missing or it
     * does not expose a method with that exact signature.
     */
    private static final Method GET_CONFIG_METHOD;

    static {
        // Resolve the optional dependency reflectively so that this class does not
        // need tika-serialization at compile time.
        Class<?> clazz = null;
        Method getMethod = null;
        try {
            clazz = Class.forName("org.apache.tika.serialization.ConfigDeserializer");
            getMethod = clazz.getMethod("getConfig",
                ParseContext.class, String.class, Class.class, Object.class);
        } catch (ClassNotFoundException | NoSuchMethodException ignored) {
            // Deliberately ignored: the deserializer is optional. Note that when the
            // class is found but the method lookup fails, clazz stays non-null while
            // getMethod stays null; getConfig() checks both fields before invoking.
        }
        CONFIG_DESERIALIZER_CLASS = clazz;
        GET_CONFIG_METHOD = getMethod;
    }

    /**
     * Retrieves runtime configuration from ParseContext.
     * <p>
     * Resolution order:
     * <ol>
     *   <li>If {@code context} is null, {@code defaultConfig} is returned.</li>
     *   <li>If {@code context.get(configClass)} already holds an instance, that
     *       instance is returned without any deserialization.</li>
     *   <li>If the context has no JSON config for {@code configKey},
     *       {@code defaultConfig} is returned.</li>
     *   <li>Otherwise the call is delegated reflectively to
     *       {@code ConfigDeserializer.getConfig(context, configKey, configClass, defaultConfig)}.
     *       (That method is expected to also store the result in the ParseContext for
     *       future lookups; this class does not do so itself.)</li>
     * </ol>
     * <p>
     * This method performs defensive checking: if the ParseContext has JSON configuration
     * for the requested key but ConfigDeserializer is not on the classpath, or is present
     * without the expected {@code getConfig} method, it throws TikaConfigException rather
     * than silently ignoring the configuration.
     *
     * @param context the parse context (may be null)
     * @param configKey the configuration key (e.g., "pdf-parser", "html-parser")
     * @param configClass the configuration class
     * @param defaultConfig the default configuration to use if no runtime config exists
     * @param <T> the configuration type
     * @return the config found in or deserialized for the context, or the default config
     *         if no runtime config exists
     * @throws TikaConfigException if ParseContext has JSON config but ConfigDeserializer is
     *         not on the classpath or does not provide the expected {@code getConfig} method
     * @throws IOException if deserialization fails
     */
    public static <T> T getConfig(ParseContext context, String configKey,
                                   Class<T> configClass, T defaultConfig)
            throws TikaConfigException, IOException {
        if (context == null) {
            return defaultConfig;
        }

        // First check if config is already resolved in ParseContext
        // (may have been set by a previous call or by user code)
        T existingConfig = context.get(configClass);
        if (existingConfig != null) {
            return existingConfig;
        }

        // Check for JSON config
        if (!context.hasJsonConfig(configKey)) {
            return defaultConfig;
        }

        // JSON config exists for this key - ConfigDeserializer MUST be available
        if (CONFIG_DESERIALIZER_CLASS == null) {
            throw new TikaConfigException(String.format(Locale.ROOT,
                "ParseContext contains JSON configuration for '%s' " +
                "but org.apache.tika.serialization.ConfigDeserializer is not on the classpath. " +
                "This means your runtime configuration will be ignored. " +
                "To fix: add tika-serialization as a dependency.",
                configKey));
        }

        // The class was found but the reflective method lookup failed. Without this
        // guard the invoke() below would dereference null and the resulting
        // NullPointerException would be misreported as a deserialization IOException.
        if (GET_CONFIG_METHOD == null) {
            throw new TikaConfigException(String.format(Locale.ROOT,
                "ParseContext contains JSON configuration for '%s' " +
                "but org.apache.tika.serialization.ConfigDeserializer does not provide " +
                "getConfig(ParseContext, String, Class, Object). " +
                "This usually means the tika-core and tika-serialization versions do not match. " +
                "To fix: use matching versions of tika-core and tika-serialization.",
                configKey));
        }

        // ConfigDeserializer is usable - delegate to it (null receiver: static call)
        try {
            @SuppressWarnings("unchecked")
            T result = (T) GET_CONFIG_METHOD.invoke(null, context, configKey, configClass, defaultConfig);
            return result;
        } catch (Exception e) {
            // Reflection wraps exceptions thrown by the target method; unwrap so that
            // callers see the underlying failure rather than the reflective wrapper.
            Throwable cause = e.getCause() != null ? e.getCause() : e;
            if (cause instanceof IOException) {
                throw (IOException) cause;
            }
            throw new IOException("Failed to deserialize config for '" + configKey + "': " +
                cause.getMessage(), cause);
        }
    }

    /**
     * Checks if runtime configuration exists for the given key.
     * <p>
     * Unlike {@link #getConfig}, this method does NOT throw if ConfigDeserializer
     * is missing - it only checks for the presence of config.
     *
     * @param context the parse context (may be null, in which case false is returned)
     * @param configKey the configuration key
     * @return true if JSON config exists for this key
     */
    public static boolean hasConfig(ParseContext context, String configKey) {
        if (context == null) {
            return false;
        }
        return context.hasJsonConfig(configKey);
    }

    /**
     * Checks if ConfigDeserializer is available on the classpath.
     * <p>
     * This reports only whether the class could be loaded; it does not verify that
     * the class exposes the method {@link #getConfig} delegates to.
     *
     * @return true if the ConfigDeserializer class was found
     */
    public static boolean isConfigDeserializerAvailable() {
        return CONFIG_DESERIALIZER_CLASS != null;
    }
}