// ConfigLoader.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config.loader;
import java.io.IOException;
import java.util.Set;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
/**
* Loader for configuration objects from the "parse-context" section.
* <p>
* This class handles ParseContext components and configuration POJOs that are loaded
* into a ParseContext for use during parsing. All configurations loaded via ConfigLoader
* must be placed under the "parse-context" top-level node in the JSON.
* <p>
* For official Tika components and configurations (parsers, detectors, async, server, etc.),
* use the specific methods on {@link TikaLoader} or load directly from {@link TikaJsonConfig}.
*
* <p>Usage:
* <pre>
* TikaLoader loader = TikaLoader.load(configPath);
*
* // Load by explicit key
* MyConfig config = loader.configs().load("my-config", MyConfig.class);
*
* // Load by class name (auto-converts to kebab-case)
* MyConfig config = loader.configs().load(MyConfig.class);
* </pre>
*
* <p>JSON configuration example:
* <pre>
* {
* // Official Tika configs at root level (NOT loaded via configs())
* "parsers": [...],
* "detectors": [...],
* "pipes": {...},
* "server": {...},
*
* // ParseContext configs in "parse-context" (loaded via configs())
* "parse-context": {
* "embedded-limits": {
* "maxDepth": 10,
* "maxCount": 1000
* },
* "output-limits": {
* "writeLimit": 100000
* },
* "commons-digester-factory": {
* "algorithms": ["MD5", "SHA-256"]
* }
* }
* }
* </pre>
*/
public class ConfigLoader {

    /** Name of the top-level JSON object that every lookup made by this loader is scoped to. */
    private static final String PARSE_CONTEXT_SECTION = "parse-context";

    /**
     * Reserved keys for complex components that require special handling.
     * These cannot be loaded via ConfigLoader - use TikaLoader methods instead.
     * Both the kebab-case and the camelCase spelling are listed where the two differ.
     */
    private static final Set<String> PROHIBITED_KEYS = Set.of(
            "parsers",
            "detectors",
            "encoding-detectors",
            "encodingDetectors",
            "metadata-filters",
            "metadataFilters",
            "renderers",
            "translators");

    /**
     * Fully qualified names of component interfaces that must not be requested as the
     * target type. Matching is by exact class name only (see {@link #validateClass(Class)}).
     */
    private static final Set<String> PROHIBITED_CLASSES = Set.of(
            "org.apache.tika.parser.Parser",
            "org.apache.tika.detect.Detector",
            "org.apache.tika.renderer.Renderer",
            "org.apache.tika.detect.EncodingDetector",
            "org.apache.tika.metadata.filter.MetadataFilter");

    private final TikaJsonConfig config;
    private final ObjectMapper objectMapper;

    /**
     * Creates a loader over the given parsed configuration.
     *
     * @param config       the configuration whose root node contains the "parse-context" section
     * @param objectMapper the mapper used to turn JSON nodes into objects
     */
    ConfigLoader(TikaJsonConfig config, ObjectMapper objectMapper) {
        this.config = config;
        this.objectMapper = objectMapper;
    }

    /**
     * Loads a configuration object using the simple class name converted to kebab-case.
     * <p>
     * For example, {@code MyAppConfig.class} will look for key "my-app-config".
     * The simple name is converted as-is; no suffix (such as "Config") is stripped.
     * <p>
     * For interfaces, the JSON must specify the implementation (see {@link #load(String, Class)}).
     *
     * @param clazz The class to deserialize into (can be interface, abstract, or concrete)
     * @param <T> The type to load
     * @return the deserialized object, or null if key not found in config
     * @throws TikaConfigException if {@code clazz} is null or prohibited, or if loading fails
     */
    public <T> T load(Class<T> clazz) throws TikaConfigException {
        return load(deriveKeyFromClass(clazz), clazz);
    }

    /**
     * Loads a configuration object using the class name, with a default value.
     *
     * @param clazz The class to deserialize into
     * @param defaultValue The value to return if key not found in config
     * @param <T> The type to load
     * @return the deserialized object, or defaultValue if not present
     * @throws TikaConfigException if {@code clazz} is null or prohibited, or if loading fails
     */
    public <T> T load(Class<T> clazz, T defaultValue) throws TikaConfigException {
        T loaded = load(clazz);
        return loaded != null ? loaded : defaultValue;
    }

    /**
     * Loads a configuration object from the specified key of the "parse-context" section.
     * <p>
     * Supports two formats:
     * <ul>
     * <li>String value: treated as fully qualified class name, instantiated through its
     * no-argument constructor</li>
     * <li>Any other non-null value: deserialized into the target class by the ObjectMapper</li>
     * </ul>
     * <p>
     * The component interfaces Parser, Detector, EncodingDetector, MetadataFilter and Renderer
     * are rejected as target types, and the reserved top-level component keys are rejected as
     * keys; use the dedicated {@link TikaLoader} methods for those.
     *
     * @param key The JSON key to load from
     * @param clazz The class to deserialize into (can be interface, abstract, or concrete)
     * @param <T> The type to load
     * @return the deserialized object, or null if key not found (or its value is JSON null)
     * @throws TikaConfigException if key or clazz is null or prohibited, if deserialization
     *         fails, or if the named class cannot be instantiated
     */
    public <T> T load(String key, Class<T> clazz) throws TikaConfigException {
        validateKey(key);
        validateClass(clazz);

        JsonNode node = getNode(key);
        if (node == null || node.isNull()) {
            return null;
        }

        try {
            // Strategy 1: a plain string is a class name to instantiate (useful for interfaces).
            if (node.isTextual()) {
                return loadFromClassName(node.asText(), clazz);
            }
            // Strategy 2: let Jackson deserialize the node. Any polymorphic handling comes
            // from how the supplied ObjectMapper was configured.
            return objectMapper.treeToValue(node, clazz);
        } catch (JsonProcessingException e) {
            throw new TikaConfigException(
                    "Failed to deserialize '" + key + "' into " + clazz.getName(), e);
        }
    }

    /**
     * Instantiates the class named by {@code className} via its no-argument constructor.
     *
     * @param className    fully qualified name of the class to instantiate
     * @param expectedType type the new instance must be assignable to
     * @return a new instance of the named class
     * @throws TikaConfigException if the class cannot be found, is not assignable to
     *         {@code expectedType}, or cannot be instantiated
     */
    private <T> T loadFromClassName(String className, Class<T> expectedType)
            throws TikaConfigException {
        try {
            // Load WITHOUT initializing: the static initializer of a class that fails the
            // assignability check below must never run. The class loader is the same one
            // the single-argument Class.forName would have used from this class.
            Class<?> candidate =
                    Class.forName(className, false, ConfigLoader.class.getClassLoader());
            if (!expectedType.isAssignableFrom(candidate)) {
                throw new TikaConfigException(
                        "Class " + className + " is not assignable to " + expectedType.getName());
            }
            // Safe after the isAssignableFrom check; avoids an unchecked (T) cast.
            return expectedType.cast(candidate.getDeclaredConstructor().newInstance());
        } catch (ClassNotFoundException e) {
            throw new TikaConfigException("Class not found: " + className, e);
        } catch (ReflectiveOperationException e) {
            throw new TikaConfigException(
                    "Failed to instantiate " + className +
                            ". Ensure it has a public no-argument constructor.", e);
        }
    }

    /**
     * Loads a configuration object from the specified JSON key, with a default value.
     *
     * @param key The JSON key to load from
     * @param clazz The class to deserialize into
     * @param defaultValue The value to return if key not found in config
     * @param <T> The type to load
     * @return the deserialized object, or defaultValue if not present
     * @throws TikaConfigException if key or clazz is null or prohibited, or if loading fails
     */
    public <T> T load(String key, Class<T> clazz, T defaultValue) throws TikaConfigException {
        T loaded = load(key, clazz);
        return loaded != null ? loaded : defaultValue;
    }

    /**
     * Loads a configuration object by merging JSON properties with a default instance.
     * <p>
     * This allows partial configuration where only some properties are specified in JSON,
     * and the rest retain their default values. The merge itself is delegated to
     * {@code JsonMergeUtils.mergeWithDefaults}, which is expected to return a new object
     * and leave {@code defaultValue} unmodified.
     *
     * <p>Example:
     * <pre>
     * MyConfig defaults = new MyConfig();
     * defaults.setTimeout(30000);
     * defaults.setRetries(2);
     * defaults.setEnabled(false);
     *
     * // JSON: { "enabled": true }
     * // Result: timeout=30000, retries=2, enabled=true (merged!)
     * MyConfig config = loader.configs().loadWithDefaults("my-config",
     *                                                     MyConfig.class,
     *                                                     defaults);
     * </pre>
     *
     * @param key The JSON key to load from
     * @param clazz The target type; validated here and passed through to the merge utility
     * @param defaultValue The object holding the default values
     * @param <T> The type to load
     * @return the merged object, or {@code defaultValue} itself (same instance) if the key
     *         is not found or its value is JSON null
     * @throws TikaConfigException if key or clazz is null or prohibited, or if merging fails
     */
    public <T> T loadWithDefaults(String key, Class<T> clazz, T defaultValue)
            throws TikaConfigException {
        validateKey(key);
        validateClass(clazz);

        JsonNode node = getNode(key);
        if (node == null || node.isNull()) {
            return defaultValue;
        }

        try {
            return JsonMergeUtils.mergeWithDefaults(objectMapper, node, clazz, defaultValue);
        } catch (IOException e) {
            throw new TikaConfigException(
                    "Failed to merge '" + key + "' into " + clazz.getName(), e);
        }
    }

    /**
     * Loads a configuration object by class name with defaults, merging JSON properties.
     * The key is derived from the simple class name exactly as in {@link #load(Class)}.
     *
     * @param clazz The class to deserialize into
     * @param defaultValue The object holding the default values
     * @param <T> The type to load
     * @return the merged object, or {@code defaultValue} itself if the key is not found
     * @throws TikaConfigException if {@code clazz} is null or prohibited, or if merging fails
     * @see #loadWithDefaults(String, Class, Object)
     */
    public <T> T loadWithDefaults(Class<T> clazz, T defaultValue) throws TikaConfigException {
        return loadWithDefaults(deriveKeyFromClass(clazz), clazz, defaultValue);
    }

    /**
     * Checks if a configuration key exists in the "parse-context" section of the JSON config.
     * Unlike the load methods, this performs no reserved-key validation.
     *
     * @param key The JSON key to check
     * @return true if the key exists and is not null
     */
    public boolean hasKey(String key) {
        JsonNode node = getNode(key);
        return node != null && !node.isNull();
    }

    /**
     * Gets a node by key from the "parse-context" section.
     *
     * @param key The JSON key to look for
     * @return the node, or null if the section is missing, is not a JSON object,
     *         or does not contain the key
     */
    private JsonNode getNode(String key) {
        JsonNode parseContext = config.getRootNode().get(PARSE_CONTEXT_SECTION);
        if (parseContext == null || !parseContext.isObject()) {
            return null;
        }
        return parseContext.get(key);
    }

    /**
     * Derives a kebab-case key from a class.
     * <p>
     * Uses the simple (unqualified) class name, unmodified, converted to kebab-case for
     * consistency with the annotation processor's component naming.
     *
     * @param clazz the class to derive the key from
     * @return kebab-case version of the simple class name
     * @throws TikaConfigException if {@code clazz} is null
     */
    private String deriveKeyFromClass(Class<?> clazz) throws TikaConfigException {
        if (clazz == null) {
            throw new TikaConfigException("Configuration class must not be null");
        }
        return toKebabCase(clazz.getSimpleName());
    }

    /**
     * Converts a camelCase or PascalCase string to kebab-case.
     * Delegates to {@link KebabCaseConverter} for consistent behavior
     * with the annotation processor.
     */
    private String toKebabCase(String name) {
        return KebabCaseConverter.toKebabCase(name);
    }

    /**
     * Validates that the key is present and not reserved for complex components.
     *
     * @throws TikaConfigException if the key is null or one of the reserved keys
     */
    private void validateKey(String key) throws TikaConfigException {
        // Sets built with Set.of throw a message-less NullPointerException from
        // contains(null), so reject null explicitly with a useful error instead.
        if (key == null) {
            throw new TikaConfigException("Configuration key must not be null");
        }
        if (PROHIBITED_KEYS.contains(key)) {
            throw new TikaConfigException(
                    "Cannot load '" + key + "' via ConfigLoader. " +
                            "This is a complex component that requires special handling. " +
                            "Use TikaLoader.load" + toPascalCase(key) + "() instead.");
        }
    }

    /**
     * Validates that complex Tika components aren't loaded via this loader.
     * Only an exact match on one of the prohibited interface names is rejected; other
     * interfaces, abstract classes and concrete classes are allowed.
     *
     * @throws TikaConfigException if the class is null or one of the prohibited interfaces
     */
    private void validateClass(Class<?> clazz) throws TikaConfigException {
        if (clazz == null) {
            throw new TikaConfigException("Configuration class must not be null");
        }
        // Check for known complex component types (defense in depth)
        if (PROHIBITED_CLASSES.contains(clazz.getName())) {
            throw new TikaConfigException(
                    clazz.getSimpleName() + " is a Tika component interface. " +
                            "Use the appropriate TikaLoader method (e.g., loadParsers(), loadDetectors()).");
        }
    }

    /**
     * Converts kebab-case to PascalCase for error messages: every '-' is dropped and the
     * character following it (as well as the first character) is upper-cased.
     */
    private String toPascalCase(String kebabCase) {
        StringBuilder result = new StringBuilder(kebabCase.length());
        boolean capitalizeNext = true;
        for (char c : kebabCase.toCharArray()) {
            if (c == '-') {
                capitalizeNext = true;
            } else if (capitalizeNext) {
                result.append(Character.toUpperCase(c));
                capitalizeNext = false;
            } else {
                result.append(c);
            }
        }
        return result.toString();
    }
}