TikaJsonConfig.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config.loader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
/**
* Parsed representation of a Tika JSON configuration file.
* Provides access to component configurations by type (parsers, detectors, etc.).
*
* <p>This class serves as the single source of truth for JSON parsing across
* core Tika (parsers, detectors) and tika-pipes (fetchers, emitters) components.
* It performs no validation - consumers validate only their own keys.
*
* <p><b>Unified Configuration Usage:</b>
* <pre>
* // Parse config once
* TikaJsonConfig jsonConfig = TikaJsonConfig.load(Paths.get("config.json"));
*
* // Load core Tika components (same classloader)
* TikaLoader tikaLoader = TikaLoader.load(jsonConfig);
* Parser parser = tikaLoader.loadParsers();
* Detector detector = tikaLoader.loadDetectors();
*
* // Load pipes/plugin components (different classloader)
* TikaPluginManager pluginManager = TikaPluginManager.load(jsonConfig);
* pluginManager.loadPlugins();
* pluginManager.startPlugins();
*
* // Extract config for plugins (crosses classloader boundary as string)
* JsonNode fetchersNode = jsonConfig.getRootNode().get("fetchers");
* if (fetchersNode != null) {
* String fetcherConfigJson = fetchersNode.toString();
* // Pass string to plugin - safe across classloader boundary
* }
* </pre>
*
* <p><b>JSON structure:</b>
* <pre>
* {
* // Core Tika components (validated by TikaLoader)
* "parsers": [
* { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO", ... } },
* { "html-parser": { ... } },
* { "default-parser": { "exclude": ["some-parser"] } }
* { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO" } },
* "html-parser", // String shorthand for no-config components
* { "default-parser": { "exclude": ["ocr-parser"] } }
* ],
* "detectors": [
* "poifs-container-detector", // String shorthand
* { "default-detector": { "spoolTypes": ["application/zip", "application/pdf"] } }
* ],
*
* // Pipes components (validated by validateKeys())
* "plugin-roots": ["/path/to/plugins"],
* "fetchers": [...],
* "emitters": [...]
* }
* </pre>
*
* <p>All components use array format for explicit ordering.
* Parsers support decoration via "_mime-include" and "_mime-exclude" fields.
* Components without configuration can use string shorthand: "component-name"
* instead of { "component-name": {} }.
* Parsers support mime filtering via "_mime-include" and "_mime-exclude" fields.
* Special "default-parser" entry enables SPI fallback for unlisted parsers.
*/
public class TikaJsonConfig {
/**
* Known top-level configuration keys across core Tika and pipes/plugins.
* Only kebab-case names are allowed.
*/
private static final Set<String> KNOWN_KEYS = Set.of(
// Globals
"metadata-list",
"service-loader",
"xml-reader-utils",
// Core Tika component keys
"parsers",
"detectors",
"encoding-detectors",
"metadata-filters",
"content-handler-factory",
"renderers",
"translator",
"auto-detect-parser",
"parse-context",
"server",
// Pipes/plugin keys
"fetchers",
"emitters",
"pipes-iterator",
"pipes-reporters",
"pipes",
"plugin-roots"
);
private static final ObjectMapper OBJECT_MAPPER =
TikaObjectMapperFactory.getMapper();
private final JsonNode rootNode;
private final Map<String, Map<String, JsonNode>> componentsByType;
private final Map<String, List<Map.Entry<String, JsonNode>>> arrayComponentsByType;
private TikaJsonConfig(JsonNode rootNode) {
this.rootNode = rootNode;
this.componentsByType = parseObjectComponents(rootNode);
this.arrayComponentsByType = parseArrayComponents(rootNode);
}
/**
* Loads configuration from a file.
*
* @param configPath the path to the JSON configuration file
* @return the parsed configuration
* @throws TikaConfigException if loading or parsing fails
*/
public static TikaJsonConfig load(Path configPath) throws TikaConfigException {
try (InputStream in = Files.newInputStream(configPath)) {
return load(in);
} catch (IOException e) {
throw new TikaConfigException("Failed to load config from: " + configPath, e);
}
}
/**
* Loads configuration from an input stream.
*
* @param inputStream the input stream containing JSON configuration
* @return the parsed configuration
* @throws TikaConfigException if loading or parsing fails
*/
public static TikaJsonConfig load(InputStream inputStream) throws TikaConfigException {
try {
JsonNode rootNode = OBJECT_MAPPER.readTree(inputStream);
TikaJsonConfig tikaJsonConfig = new TikaJsonConfig(rootNode);
tikaJsonConfig.validateKeys();
return tikaJsonConfig;
} catch (IOException e) {
throw new TikaConfigException("Failed to parse JSON configuration", e);
}
}
/**
* Creates an empty configuration (no config file).
* All components will be loaded from SPI.
*
* @return an empty configuration
*/
public static TikaJsonConfig loadDefault() {
JsonNode emptyNode = OBJECT_MAPPER.createObjectNode();
return new TikaJsonConfig(emptyNode);
}
/**
* Gets component configurations for a specific type (object format - used for parsers).
*
* @param componentType the component type (e.g., "parsers")
* @return map of component name to configuration JSON, or empty map if type not found
*/
public Map<String, JsonNode> getComponents(String componentType) {
return componentsByType.getOrDefault(componentType, Collections.emptyMap());
}
/**
* Gets component configurations for a specific type (array format - used for detectors, etc.).
*
* @param componentType the component type (e.g., "detectors")
* @return ordered list of (name, config) entries, or empty list if type not found
*/
public List<Map.Entry<String, JsonNode>> getArrayComponents(String componentType) {
return arrayComponentsByType.getOrDefault(componentType, Collections.emptyList());
}
/**
* Checks if a component type has any configured components (object format).
*
* @param componentType the component type
* @return true if the type has configurations
*/
public boolean hasComponents(String componentType) {
Map<String, JsonNode> components = componentsByType.get(componentType);
return components != null && !components.isEmpty();
}
/**
* Checks if a component type has any configured components (array format).
*
* @param componentType the component type
* @return true if the type has configurations
*/
public boolean hasArrayComponents(String componentType) {
List<Map.Entry<String, JsonNode>> components = arrayComponentsByType.get(componentType);
return components != null && !components.isEmpty();
}
/**
* Checks if a component type section exists in the config (even if empty).
*
* @param componentType the component type
* @return true if the section exists
*/
public boolean hasComponentSection(String componentType) {
return rootNode.has(componentType);
}
/**
* Gets the raw root JSON node.
*
* @return the root node
*/
public JsonNode getRootNode() {
return rootNode;
}
private Map<String, Map<String, JsonNode>> parseObjectComponents(JsonNode root) {
Map<String, Map<String, JsonNode>> result = new LinkedHashMap<>();
if (root == null || !root.isObject()) {
return result;
}
for (Map.Entry<String, JsonNode> entry : root.properties()) {
String componentType = entry.getKey();
JsonNode typeNode = entry.getValue();
// Only process object nodes (used for parsers)
if (!typeNode.isObject()) {
continue;
}
Map<String, JsonNode> components = new LinkedHashMap<>();
for (Map.Entry<String, JsonNode> componentEntry : typeNode.properties()) {
components.put(componentEntry.getKey(), componentEntry.getValue());
}
if (!components.isEmpty()) {
result.put(componentType, components);
}
}
return result;
}
private Map<String, List<Map.Entry<String, JsonNode>>> parseArrayComponents(JsonNode root) {
Map<String, List<Map.Entry<String, JsonNode>>> result = new LinkedHashMap<>();
if (root == null || !root.isObject()) {
return result;
}
for (Map.Entry<String, JsonNode> entry : root.properties()) {
String componentType = entry.getKey();
JsonNode typeNode = entry.getValue();
// Only process array nodes (used for detectors, filters, etc.)
if (!typeNode.isArray()) {
continue;
}
List<Map.Entry<String, JsonNode>> components = new ArrayList<>();
for (JsonNode arrayItem : typeNode) {
if (arrayItem.isTextual()) {
// String shorthand: "component-name" -> treat as { "component-name": {} }
String componentName = arrayItem.asText();
components.add(Map.entry(componentName, OBJECT_MAPPER.createObjectNode()));
} else if (arrayItem.isObject()) {
// Object syntax: { "component-name": {...config...} }
for (Map.Entry<String, JsonNode> componentEntry : arrayItem.properties()) {
components.add(Map.entry(componentEntry.getKey(), componentEntry.getValue()));
break; // Only take the first field
}
}
// Skip other types (null, numbers, arrays, etc.)
}
if (!components.isEmpty()) {
result.put(componentType, components);
}
}
return result;
}
/**
* Deserializes a configuration value for the given key.
*
* @param key the configuration key
* @param clazz the target class
* @param <T> the type to deserialize to
* @return the deserialized value, or null if key doesn't exist
* @throws IOException if deserialization fails
*/
public <T> T deserialize(String key, Class<T> clazz) throws IOException {
JsonNode node = rootNode.get(key);
if (node == null || node.isNull()) {
return null;
}
return OBJECT_MAPPER.treeToValue(node, clazz);
}
/**
* Checks if a configuration key exists.
*
* @param key the configuration key
* @return true if the key exists and is not null
*/
public boolean hasKey(String key) {
return rootNode.has(key) && !rootNode.get(key).isNull();
}
/**
* Validates that all top-level configuration keys are known.
* <p>
* This catches typos like "parser" instead of "parsers" or "pipes-reporter"
* instead of "pipes-reporters".
*
* @throws TikaConfigException if unknown keys are found
*/
private void validateKeys() throws TikaConfigException {
if (rootNode == null || !rootNode.isObject()) {
return;
}
Iterator<String> fieldNames = rootNode.fieldNames();
List<String> unknownKeys = new ArrayList<>();
while (fieldNames.hasNext()) {
String key = fieldNames.next();
// Must be a known key
if (!KNOWN_KEYS.contains(key)) {
unknownKeys.add(key);
}
}
if (!unknownKeys.isEmpty()) {
throw new TikaConfigException(
"Unknown configuration key(s): " + unknownKeys + ". " +
"Valid keys: " + KNOWN_KEYS);
}
}
@Override
public String toString() {
return "TikaJsonConfig{" + "rootNode=" + rootNode + '}';
}
}