TikaLoader.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config.loader;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import com.fasterxml.jackson.core.StreamReadConstraints;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.tika.config.GlobalSettings;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.AutoDetectParserConfig;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.renderer.CompositeRenderer;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.ComponentConfig;
import org.apache.tika.serialization.ComponentNameResolver;
import org.apache.tika.serialization.JsonMetadata;
import org.apache.tika.serialization.JsonMetadataList;
import org.apache.tika.serialization.ParseContextUtils;
import org.apache.tika.serialization.serdes.ParseContextDeserializer;
/**
* Main entry point for loading Tika components from JSON configuration.
* Provides lazy loading of component types - only loads classes when requested.
*
* <p>Usage:
* <pre>
* TikaLoader loader = TikaLoader.load(Path.of("tika-config.json"));
* Parser parser = loader.loadParsers();
* Detector detector = loader.loadDetectors();
* </pre>
*
* <p>JSON configuration format:
* <pre>
* {
* "parsers": [
* {
* "pdf-parser": {
* "_mime-include": ["application/pdf"],
* "_mime-exclude": ["application/pdf+fdf"],
* "ocrStrategy": "AUTO",
* "extractInlineImages": true
* }
* }
* ],
* "detectors": [
* { "mime-magic-detector": { ... } }
* ]
* }
* </pre>
*/
public class TikaLoader {
// Static registration of component configurations
static {
registerComponentConfigs();
}
private static void registerComponentConfigs() {
// Complex components with custom loaders
ComponentConfig.builder("parsers", Parser.class)
.customLoader(new ParserLoader())
.register();
ComponentConfig.builder("detectors", Detector.class)
.customLoader(new DetectorLoader())
.register();
ComponentConfig.builder("encoding-detectors", EncodingDetector.class)
.customLoader(new EncodingDetectorLoader())
.register();
// Simple components with default list-based loading
ComponentConfig.builder("metadata-filters", MetadataFilter.class)
.loadAsList()
.wrapWith(list -> list.isEmpty()
? NoOpFilter.NOOP_FILTER
: new CompositeMetadataFilter((List<MetadataFilter>) list))
.defaultProvider(() -> NoOpFilter.NOOP_FILTER)
.register();
ComponentConfig.builder("renderers", Renderer.class)
.loadAsList()
.wrapWith(list -> new CompositeRenderer((List<Renderer>) list))
.register();
ComponentConfig.builder("translator", Translator.class)
.loadAsList()
.wrapWith(list -> list.isEmpty() ? null : (Translator) list.get(0))
.defaultProvider(DefaultTranslator::new)
.register();
}
private final TikaJsonConfig config;
private final ClassLoader classLoader;
private final ObjectMapper objectMapper;
// Cache of loaded components (keyed by component class)
private final Map<Class<?>, Object> componentCache = new ConcurrentHashMap<>();
// Static instances
private static MimeTypes mimeTypes;
private static TikaLoader defaultLoader;
// Lazy-initialized loader context
private LoaderContext loaderContext;
// Special cached instances that aren't standard components
private Parser autoDetectParser;
private Detector detectors;
private EncodingDetector encodingDetectors;
private MetadataFilter metadataFilter;
private ContentHandlerFactory contentHandlerFactory;
private Renderer renderers;
private Translator translator;
private ConfigLoader configLoader;
private GlobalSettings globalSettings;
private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) {
this.config = config;
this.classLoader = classLoader;
this.objectMapper = TikaObjectMapperFactory.getMapper();
}
/**
* Initializes the loader by loading global settings.
* Should be called by all factory methods after construction.
*
* @throws TikaConfigException if loading global settings fails
*/
private void init() throws TikaConfigException, IOException {
loadGlobalSettings();
}
/**
* Loads a Tika configuration from a file.
* Global settings are automatically loaded and applied during initialization.
*
* @param configPath the path to the JSON configuration file
* @return the Tika loader
* @throws TikaConfigException if loading or parsing fails
*/
public static TikaLoader load(Path configPath) throws TikaConfigException, IOException {
return load(configPath, Thread.currentThread().getContextClassLoader());
}
/**
* Loads a Tika configuration from a file with a specific class loader.
* Global settings are automatically loaded and applied during initialization.
*
* @param configPath the path to the JSON configuration file
* @param classLoader the class loader to use for loading components
* @return the Tika loader
* @throws TikaConfigException if loading or parsing fails
*/
public static TikaLoader load(Path configPath, ClassLoader classLoader)
throws TikaConfigException, IOException {
TikaJsonConfig config = TikaJsonConfig.load(configPath);
TikaLoader loader = new TikaLoader(config, classLoader);
loader.init();
return loader;
}
/**
* Creates a default Tika loader with no configuration file.
* All components (parsers, detectors, etc.) will be loaded from SPI.
* Returns a cached instance if already created.
*
* @return the Tika loader
*/
public static synchronized TikaLoader loadDefault() {
if (defaultLoader == null) {
defaultLoader = loadDefault(Thread.currentThread().getContextClassLoader());
}
return defaultLoader;
}
/**
* Creates a default Tika loader with no configuration file and a specific class loader.
* All components (parsers, detectors, etc.) will be loaded from SPI.
*
* @param classLoader the class loader to use for loading components
* @return the Tika loader
*/
public static TikaLoader loadDefault(ClassLoader classLoader) {
TikaJsonConfig config = TikaJsonConfig.loadDefault();
TikaLoader loader = new TikaLoader(config, classLoader);
try {
loader.init();
} catch (IOException | TikaConfigException e) {
// Default config should never throw, but wrap in RuntimeException if it does
throw new RuntimeException("Failed to initialize default TikaLoader", e);
}
return loader;
}
/**
* Loads and returns all parsers.
* Syntactic sugar for {@code get(Parser.class)}.
* Results are cached - subsequent calls return the same instance.
*
* @return the parser (typically a CompositeParser internally)
* @throws TikaConfigException if loading fails
*/
public Parser loadParsers() throws TikaConfigException {
return get(Parser.class);
}
/**
* Loads and returns all detectors.
* Syntactic sugar for {@code get(Detector.class)}.
* Results are cached - subsequent calls return the same instance.
*
* @return the detector (typically a CompositeDetector internally)
* @throws TikaConfigException if loading fails
*/
public Detector loadDetectors() throws TikaConfigException {
return get(Detector.class);
}
/**
* Loads and returns all encoding detectors.
* Syntactic sugar for {@code get(EncodingDetector.class)}.
* Results are cached - subsequent calls return the same instance.
*
* @return the encoding detector (typically a CompositeEncodingDetector internally)
* @throws TikaConfigException if loading fails
*/
public EncodingDetector loadEncodingDetectors() throws TikaConfigException {
return get(EncodingDetector.class);
}
/**
* Loads and returns all metadata filters.
* Syntactic sugar for {@code get(MetadataFilter.class)}.
* Results are cached - subsequent calls return the same instance.
*
* @return the metadata filter (typically a CompositeMetadataFilter internally)
* @throws TikaConfigException if loading fails
*/
public MetadataFilter loadMetadataFilters() throws TikaConfigException {
return get(MetadataFilter.class);
}
/**
* Loads and returns the content handler factory.
* If "content-handler-factory" section exists in config, uses that factory.
* If section missing, returns a default BasicContentHandlerFactory with MARKDOWN handler.
* Results are cached - subsequent calls return the same instance.
*
* <p>Example JSON:
* <pre>
* {
* "content-handler-factory": {
* "basic-content-handler-factory": {
* "type": "HTML",
* "writeLimit": 100000
* }
* }
* }
* </pre>
*
* @return the content handler factory
* @throws TikaConfigException if loading fails
*/
public synchronized ContentHandlerFactory loadContentHandlerFactory() throws TikaConfigException {
if (contentHandlerFactory == null) {
// Check if content-handler-factory section exists in config
if (config.hasComponentSection("content-handler-factory")) {
try {
contentHandlerFactory = config.deserialize("content-handler-factory",
ContentHandlerFactory.class);
} catch (IOException e) {
throw new TikaConfigException("Failed to load content-handler-factory", e);
}
}
// Default to BasicContentHandlerFactory with MARKDOWN handler if not configured.
// Markdown preserves structural boundaries (headings, lists, code blocks)
// which enables higher-quality chunking in the inference pipeline.
if (contentHandlerFactory == null) {
contentHandlerFactory = new BasicContentHandlerFactory(
BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, -1);
}
}
return contentHandlerFactory;
}
/**
* Loads and returns all renderers.
* Syntactic sugar for {@code get(Renderer.class)}.
* Results are cached - subsequent calls return the same instance.
*
* @return the renderer (typically a CompositeRenderer internally)
* @throws TikaConfigException if loading fails
*/
public Renderer loadRenderers() throws TikaConfigException {
return get(Renderer.class);
}
/**
* Loads and returns the translator.
* Syntactic sugar for {@code get(Translator.class)}.
* Results are cached - subsequent calls return the same instance.
*
* @return the translator
* @throws TikaConfigException if loading fails
*/
public Translator loadTranslator() throws TikaConfigException {
return get(Translator.class);
}
/**
* Loads and returns the AutoDetectParserConfig from the "auto-detect-parser" section.
* Returns null if the section is not present in the config.
*
* @return the AutoDetectParserConfig, or null if not configured
* @throws IOException if loading fails
*/
private AutoDetectParserConfig loadAutoDetectParserConfig() throws IOException {
return config.deserialize("auto-detect-parser", AutoDetectParserConfig.class);
}
/**
* Loads and returns an AutoDetectParser configured with this loader's parsers and detectors.
* Results are cached - subsequent calls return the same instance.
*
* @return the auto-detect parser
* @throws TikaConfigException if loading fails
* @throws IOException if loading AutoDetectParserConfig fails
*/
public synchronized Parser loadAutoDetectParser() throws TikaConfigException, IOException {
if (autoDetectParser == null) {
// Load directly from root-level config (not via configs() which only looks in "parse-context")
AutoDetectParserConfig adpConfig = loadAutoDetectParserConfig();
if (adpConfig == null) {
adpConfig = new AutoDetectParserConfig();
}
autoDetectParser = AutoDetectParser.build((CompositeParser)loadParsers(), loadDetectors(), adpConfig);
}
return autoDetectParser;
}
/**
* Loads and returns a ParseContext populated with components from the "parse-context" section.
* <p>
* This method deserializes the parse-context JSON and resolves all component references
* using the component registry. Components are looked up by their friendly names
* (e.g., "embedded-limits", "pdf-parser-config") and deserialized to their appropriate types.
* <p>
* Use this method when you need a pre-configured ParseContext for parsing operations.
*
* <p>Example usage:
* <pre>
* TikaLoader loader = TikaLoader.load(configPath);
* Parser parser = loader.loadAutoDetectParser();
* ParseContext context = loader.loadParseContext();
* Metadata metadata = Metadata.newInstance(context);
* parser.parse(stream, handler, metadata, context);
* </pre>
*
* @return a ParseContext populated with configured components
* @throws TikaConfigException if loading fails
*/
public ParseContext loadParseContext() throws TikaConfigException {
JsonNode parseContextNode = config.getRootNode().get("parse-context");
if (parseContextNode == null) {
return new ParseContext();
}
try {
ParseContext context =
ParseContextDeserializer.readParseContext(parseContextNode, objectMapper);
ParseContextUtils.resolveAll(context, classLoader);
return context;
} catch (IOException e) {
throw new TikaConfigException("Failed to load parse-context", e);
}
}
/**
* Loads a configuration object from the "parse-context" section, merging with defaults.
* <p>
* This method is useful when you have a base configuration (e.g., from code defaults or
* a previous load) and want to overlay values from the JSON config. Properties not
* specified in the JSON retain their default values.
* <p>
* The original defaults object is NOT modified - a new instance is returned.
*
* <p>Example usage for PDFParserConfig:
* <pre>
* // Load base config from tika-config.json at init time
* TikaLoader loader = TikaLoader.load(configPath);
* PDFParserConfig baseConfig = loader.loadConfig(PDFParserConfig.class, new PDFParserConfig());
*
* // At runtime, create per-request overrides
* PDFParserConfig requestConfig = new PDFParserConfig();
* requestConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
*
* // Merge: base config values + request overrides
* // (Note: for runtime merging, use JsonMergeUtils directly or loadConfig on a runtime loader)
* </pre>
*
* @param clazz the class to deserialize into
* @param defaults the default values to use for properties not in the JSON config
* @param <T> the configuration type
* @return a new instance with defaults merged with JSON config, or the original defaults if not configured
* @throws TikaConfigException if loading fails
*/
public <T> T loadConfig(Class<T> clazz, T defaults) throws TikaConfigException {
return configs().loadWithDefaults(clazz, defaults);
}
/**
* Loads a configuration object from the "parse-context" section by explicit key, merging with defaults.
* <p>
* This method is useful when the JSON key doesn't match the class name's kebab-case conversion,
* or when you want to load from a specific key.
*
* @param key the JSON key in the "parse-context" section
* @param clazz the class to deserialize into
* @param defaults the default values to use for properties not in the JSON config
* @param <T> the configuration type
* @return a new instance with defaults merged with JSON config, or the original defaults if not configured
* @throws TikaConfigException if loading fails
*/
public <T> T loadConfig(String key, Class<T> clazz, T defaults) throws TikaConfigException {
return configs().loadWithDefaults(key, clazz, defaults);
}
/**
* Returns a ConfigLoader for loading simple configuration objects.
* <p>
* This is internal - external code should use {@link #loadParseContext()} or
* {@link #loadConfig(Class, Object)} instead.
*
* @return the ConfigLoader instance
*/
private synchronized ConfigLoader configs() {
if (configLoader == null) {
configLoader = new ConfigLoader(config, objectMapper);
}
return configLoader;
}
/**
* Gets the underlying JSON configuration.
*
* @return the JSON configuration
*/
public TikaJsonConfig getConfig() {
return config;
}
/**
* Gets the class loader used for loading components.
*
* @return the class loader
*/
public ClassLoader getClassLoader() {
return classLoader;
}
/**
* Gets the media type registry.
* Lazily loads the default registry if not already set.
* This is a static singleton shared across all TikaLoader instances.
*
* @return the media type registry
*/
public static synchronized MediaTypeRegistry getMediaTypeRegistry() {
return getMimeTypes().getMediaTypeRegistry();
}
public static synchronized MimeTypes getMimeTypes() {
if (mimeTypes == null) {
mimeTypes = MimeTypes.getDefaultMimeTypes();
}
return mimeTypes;
}
/**
* Loads global configuration settings from the JSON config.
* These settings are applied to Tika's static configuration when loaded.
*
* <p>Settings include:
* <ul>
* <li>metadata-list - Jackson StreamReadConstraints for JsonMetadata/JsonMetadataList serialization</li>
* <li>service-loader - Service loader configuration</li>
* <li>xml-reader-utils - XML parser security settings</li>
* </ul>
*
* <p>Example JSON:
* <pre>
* {
* "metadata-list": {
* "maxStringLength": 50000000,
* "maxNestingDepth": 10,
* "maxNumberLength": 500
* },
* "xml-reader-utils": {
* "maxEntityExpansions": 1000,
* "maxNumReuses": 100,
* "poolSize": 10
* }
* }
* </pre>
*
* @return the global settings, or an empty object if no settings are configured
* @throws TikaConfigException if loading fails
*/
public synchronized GlobalSettings loadGlobalSettings() throws IOException, TikaConfigException {
if (globalSettings == null) {
globalSettings = new GlobalSettings();
// Load metadata-list config for JsonMetadata/JsonMetadataList serialization
loadMetadataListConfig();
// Load service-loader config (official Tika config at root level)
GlobalSettings.ServiceLoaderConfig serviceLoaderConfig =
config.deserialize("service-loader", GlobalSettings.ServiceLoaderConfig.class);
if (serviceLoaderConfig != null) {
globalSettings.setServiceLoader(serviceLoaderConfig);
}
// Load xml-reader-utils config (official Tika config at root level)
GlobalSettings.XmlReaderUtilsConfig xmlReaderUtilsConfig =
config.deserialize("xml-reader-utils", GlobalSettings.XmlReaderUtilsConfig.class);
if (xmlReaderUtilsConfig != null) {
globalSettings.setXmlReaderUtils(xmlReaderUtilsConfig);
}
}
return globalSettings;
}
/**
* Loads the metadata-list configuration section and applies it to
* JsonMetadata and JsonMetadataList serializers.
* <p>
* Configuration uses Jackson's StreamReadConstraints property names:
* <pre>
* {
* "metadata-list": {
* "maxStringLength": 20000000,
* "maxNestingDepth": 10,
* "maxNumberLength": 500
* }
* }
* </pre>
*/
private void loadMetadataListConfig() {
JsonNode metadataListNode = config.getRootNode().get("metadata-list");
if (metadataListNode == null) {
return;
}
StreamReadConstraints.Builder builder = StreamReadConstraints.builder();
if (metadataListNode.has("maxStringLength")) {
builder.maxStringLength(metadataListNode.get("maxStringLength").asInt());
}
if (metadataListNode.has("maxNestingDepth")) {
builder.maxNestingDepth(metadataListNode.get("maxNestingDepth").asInt());
}
if (metadataListNode.has("maxNumberLength")) {
builder.maxNumberLength(metadataListNode.get("maxNumberLength").asInt());
}
StreamReadConstraints constraints = builder.build();
JsonMetadata.setStreamReadConstraints(constraints);
JsonMetadataList.setStreamReadConstraints(constraints);
}
/**
* Gets the global settings if they have been loaded.
*
* @return the global settings, or null if not yet loaded
*/
public GlobalSettings getGlobalSettings() {
return globalSettings;
}
// ==================== Generic Component Access ====================
/**
* Gets a component by its class type.
* Components are loaded lazily and cached.
*
* @param componentClass the component class (e.g., Parser.class, Detector.class)
* @return the loaded component
* @throws TikaConfigException if loading fails
*/
@SuppressWarnings("unchecked")
public <T> T get(Class<T> componentClass) throws TikaConfigException {
// Check cache first
if (componentCache.containsKey(componentClass)) {
return (T) componentCache.get(componentClass);
}
// Get component config from registry
ComponentConfig<T> componentConfig = ComponentNameResolver.getComponentConfig(componentClass);
if (componentConfig == null) {
throw new IllegalArgumentException(
"No component registered for class: " + componentClass.getName());
}
// Load the component
T component = loadComponent(componentConfig);
// Cache and return
if (component != null) {
componentCache.put(componentClass, component);
}
return component;
}
/**
* Gets a component by its JSON field name.
* Components are loaded lazily and cached.
*
* @param jsonField the JSON field name (e.g., "parsers", "detectors")
* @return the loaded component
* @throws TikaConfigException if loading fails
*/
@SuppressWarnings("unchecked")
public <T> T get(String jsonField) throws TikaConfigException {
// Get component config from registry by field name
ComponentConfig<?> componentConfig = ComponentNameResolver.getComponentConfig(jsonField);
if (componentConfig == null) {
throw new IllegalArgumentException("No component registered for field: " + jsonField);
}
// Delegate to get by class (which handles caching)
return (T) get(componentConfig.getComponentClass());
}
/**
* Load a component using its configuration.
* Delegates to custom loader if available, otherwise uses default list-based loading.
*/
private <T> T loadComponent(ComponentConfig<T> componentConfig) throws TikaConfigException {
if (componentConfig.hasCustomLoader()) {
// Use custom loader for complex components
return componentConfig.getCustomLoader().load(config, getLoaderContext());
} else {
// Use default list-based loading for simple components
return loadDefaultComponent(componentConfig);
}
}
/**
* Default loading for simple components.
* No special handling - just deserialize, wrap, done.
*/
private <T> T loadDefaultComponent(ComponentConfig<T> componentConfig) throws TikaConfigException {
List<T> components = loadComponentList(
componentConfig.getJsonField(),
componentConfig.getComponentClass());
if (components.isEmpty()) {
return componentConfig.hasDefault()
? componentConfig.getDefault()
: null;
}
return componentConfig.hasListWrapper()
? componentConfig.wrapList(components)
: components.get(0);
}
/**
* Get the loader context, creating it lazily.
*/
private synchronized LoaderContext getLoaderContext() {
if (loaderContext == null) {
loaderContext = new LoaderContext(classLoader, objectMapper, this::get);
}
return loaderContext;
}
// ==================== Component List Loading ====================
/**
* Loads a list of components from the JSON configuration.
*
* @param jsonField the JSON field name (e.g., "parsers", "detectors")
* @param componentClass the component class
* @return list of loaded components (may be empty, never null)
*/
private <T> List<T> loadComponentList(String jsonField, Class<T> componentClass)
throws TikaConfigException {
List<Map.Entry<String, JsonNode>> entries = config.getArrayComponents(jsonField);
if (entries.isEmpty()) {
return new ArrayList<>();
}
List<T> components = new ArrayList<>();
for (Map.Entry<String, JsonNode> entry : entries) {
String typeName = entry.getKey();
JsonNode configNode = entry.getValue();
try {
// Create wrapper node: { "type-name": {...config...} }
ObjectNode wrapperNode = objectMapper.createObjectNode();
wrapperNode.set(typeName, configNode);
// Deserialize using Jackson (TikaModule handles type resolution)
T component = objectMapper.treeToValue(wrapperNode, componentClass);
components.add(component);
} catch (Exception e) {
throw new TikaConfigException(
"Failed to load " + componentClass.getSimpleName() + ": " + typeName, e);
}
}
return components;
}
// ==================== Serialization ====================
/**
* Saves the current configuration to a JSON file (pretty-printed).
*/
public void save(File file) throws IOException {
objectMapper.writerWithDefaultPrettyPrinter().writeValue(file, buildOutputNode());
}
/**
* Saves the current configuration to an output stream (pretty-printed).
*/
public void save(OutputStream outputStream) throws IOException {
objectMapper.writerWithDefaultPrettyPrinter().writeValue(outputStream, buildOutputNode());
}
/**
* Converts the current configuration to a JSON string (pretty-printed).
*/
public String toJson() throws IOException {
return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(buildOutputNode());
}
private ObjectNode buildOutputNode() throws IOException {
ObjectNode output = objectMapper.createObjectNode();
// Serialize loaded components from cache
if (componentCache.containsKey(Parser.class)) {
output.set("parsers", serializeComponent(componentCache.get(Parser.class), "parsers"));
} else if (config.hasArrayComponents("parsers")) {
output.set("parsers", config.getRootNode().get("parsers"));
}
if (componentCache.containsKey(Detector.class)) {
output.set("detectors", serializeComponent(componentCache.get(Detector.class), "detectors"));
} else if (config.hasArrayComponents("detectors")) {
output.set("detectors", config.getRootNode().get("detectors"));
}
if (componentCache.containsKey(EncodingDetector.class)) {
output.set("encoding-detectors", serializeComponent(componentCache.get(EncodingDetector.class), "encoding-detectors"));
} else if (config.hasArrayComponents("encoding-detectors")) {
output.set("encoding-detectors", config.getRootNode().get("encoding-detectors"));
}
Object metadataFilter = componentCache.get(MetadataFilter.class);
if (metadataFilter != null && metadataFilter != NoOpFilter.NOOP_FILTER) {
output.set("metadata-filters", serializeComponent(metadataFilter, "metadata-filters"));
} else if (config.hasArrayComponents("metadata-filters")) {
output.set("metadata-filters", config.getRootNode().get("metadata-filters"));
}
if (componentCache.containsKey(Renderer.class)) {
output.set("renderers", serializeComponent(componentCache.get(Renderer.class), "renderers"));
} else if (config.hasArrayComponents("renderers")) {
output.set("renderers", config.getRootNode().get("renderers"));
}
// Preserve auto-detect-parser config if present
JsonNode adpNode = config.getRootNode().get("auto-detect-parser");
if (adpNode != null && !adpNode.isNull()) {
output.set("auto-detect-parser", adpNode);
}
return output;
}
private JsonNode serializeComponent(Object component, String jsonField) throws IOException {
Object toSerialize = unwrapForSerialization(component);
if (toSerialize == null) {
return objectMapper.createArrayNode();
}
return objectMapper.valueToTree(toSerialize);
}
@SuppressWarnings("unchecked")
private Object unwrapForSerialization(Object component) {
if (component instanceof CompositeParser cp) {
Map<org.apache.tika.mime.MediaType, Parser> parserMap = cp.getParsers();
// Get unique parsers from the map
return new ArrayList<>(new HashSet<>(parserMap.values()));
} else if (component instanceof CompositeDetector cd) {
return cd.getDetectors();
} else if (component instanceof CompositeMetadataFilter cmf) {
return cmf.getFilters();
} else if (component instanceof CompositeEncodingDetector ced) {
return ced.getDetectors();
}
// For types without accessor methods (like CompositeRenderer), return as-is
return component;
}
}