// ComponentRegistryIntegrationTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.junit.jupiter.api.Test;

import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.Detector;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
/**
* Integration tests to verify that the annotation processor correctly generated
* component index files mapping human-readable names to class names, and that
* JSON configuration can load components using these names.
*/
public class ComponentRegistryIntegrationTest {
@Test
public void testLoadDetectorByName() throws Exception {
// Load config that uses "poifs-container-detector" by name
TikaLoader loader = TikaLoaderHelper.getLoader("test-detectors.json");
Detector detector = loader.loadDetectors();
assertNotNull(detector, "Detector should be loaded");
// The detector will be wrapped in a CompositeDetector, so we need to check differently
// For now, just verify it loaded successfully
}
@Test
public void testLoadDefaultParser() throws Exception {
// Load config that uses "default-parser" by name
TikaLoader loader = TikaLoaderHelper.getLoader("test-default-parser.json");
Parser parser = loader.loadAutoDetectParser();
assertNotNull(parser, "Parser should be loaded");
assertTrue(parser instanceof AutoDetectParser,
"Should have loaded AutoDetectParser (default-parser)");
}
@Test
public void testLoadDefaultParserWithExclusions() throws Exception {
// Load config that excludes "pdf-parser" and "html-parser" by name
// This verifies that the component names can be used in exclusion lists
TikaLoader loader = TikaLoaderHelper.getLoader("test-default-with-exclusions.json");
Parser parser = loader.loadAutoDetectParser();
assertNotNull(parser, "Parser should be loaded");
assertTrue(parser instanceof AutoDetectParser,
"Should have loaded AutoDetectParser with exclusions");
// The config loaded successfully, which means it was able to resolve
// the component names "pdf-parser" and "html-parser" for exclusion
}
@Test
public void testLoadDcXmlParserByName() throws Exception {
// Load config that uses "dc-xml-parser" by name
// XMLParser has spi=false, but DcXMLParser should be available
TikaLoader loader = TikaLoaderHelper.getLoader("test-dc-xml-parser.json");
Parser parser = loader.loadParsers();
assertNotNull(parser, "Parser should be loaded");
// The parser will be wrapped in a CompositeParser, check that it loaded successfully
}
@Test
public void testSpiFalseParserInIndexButNotInSpi() throws Exception {
// XMLParser has spi=false:
// - SHOULD be in index (for name-based configuration)
// - should NOT be in SPI (for auto-discovery via ServiceLoader)
Map<String, String> parserIndex = readAllIndexFiles("META-INF/tika/parsers.idx");
// Verify xml-parser IS in index for name-based config
assertTrue(parserIndex.containsKey("xml-parser"),
"xml-parser should be in index (spi=false still allows name-based config)");
assertEquals("org.apache.tika.parser.xml.XMLParser",
parserIndex.get("xml-parser"),
"xml-parser should map to XMLParser class");
// Verify dc-xml-parser is also in index
assertTrue(parserIndex.containsKey("dc-xml-parser"),
"dc-xml-parser should be in index");
assertEquals("org.apache.tika.parser.xml.DcXMLParser",
parserIndex.get("dc-xml-parser"),
"dc-xml-parser should map to DcXMLParser class");
// Verify xml-parser is NOT in SPI file
Map<String, Boolean> spiParsers = readAllSpiFiles("META-INF/services/org.apache.tika.parser.Parser");
assertFalse(spiParsers.containsKey("org.apache.tika.parser.xml.XMLParser"),
"XMLParser should NOT be in SPI (spi=false prevents auto-discovery)");
}
@Test
public void testIndexFilesFollowKebabCaseConvention() throws Exception {
// Test that all component names follow kebab-case convention
Map<String, String> parserIndex = readAllIndexFiles("META-INF/tika/parsers.idx");
Map<String, String> detectorIndex = readAllIndexFiles("META-INF/tika/detectors.idx");
assertFalse(parserIndex.isEmpty(), "Parser index should not be empty");
assertFalse(detectorIndex.isEmpty(), "Detector index should not be empty");
// Verify all names follow kebab-case convention
for (String name : parserIndex.keySet()) {
assertTrue(name.matches("[a-z0-9-]+"),
"Parser name should be kebab-case: " + name);
}
for (String name : detectorIndex.keySet()) {
assertTrue(name.matches("[a-z0-9-]+"),
"Detector name should be kebab-case: " + name);
}
}
@Test
public void testIndexFilesHaveCorrectFormat() throws Exception {
// Verify index files have Apache license and correct format
Enumeration<URL> resources = getClass().getClassLoader()
.getResources("META-INF/tika/parsers.idx");
assertTrue(resources.hasMoreElements(), "At least one parsers.idx should exist");
URL url = resources.nextElement();
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
String firstLine = reader.readLine();
assertTrue(firstLine.contains("Licensed to the Apache Software Foundation"),
"First line should contain Apache license");
String line;
boolean foundFormatComment = false;
boolean foundEntry = false;
while ((line = reader.readLine()) != null) {
if (line.startsWith("# Format:")) {
foundFormatComment = true;
}
if (!line.startsWith("#") && !line.trim().isEmpty()) {
foundEntry = true;
// Verify format: name=fully.qualified.ClassName
assertTrue(line.matches("[a-z0-9-]+=\\S+"),
"Line should match format 'name=ClassName': " + line);
}
}
assertTrue(foundFormatComment, "Should have format comment");
assertTrue(foundEntry, "Should have at least one entry");
}
}
@Test
public void testNoDuplicateComponentNames() throws Exception {
// Verify no duplicate names across all index files
Map<String, String> parserIndex = readAllIndexFiles("META-INF/tika/parsers.idx");
Map<String, String> detectorIndex = readAllIndexFiles("META-INF/tika/detectors.idx");
// Check for duplicates within each index (readAllIndexFiles already handles this)
// The fact that we can build the maps without issues means no duplicates within files
// Verify all class names are fully qualified
for (Map.Entry<String, String> entry : parserIndex.entrySet()) {
String className = entry.getValue();
assertTrue(className.contains("."),
"Parser class name should be fully qualified: " + className);
assertTrue(className.startsWith("org.apache.tika."),
"Parser class should be in org.apache.tika package: " + className);
}
for (Map.Entry<String, String> entry : detectorIndex.entrySet()) {
String className = entry.getValue();
assertTrue(className.contains("."),
"Detector class name should be fully qualified: " + className);
assertTrue(className.startsWith("org.apache.tika."),
"Detector class should be in org.apache.tika package: " + className);
}
}
/**
* Reads all index files with the given resource path from all JARs on the classpath
* and merges them into a single map.
*/
private Map<String, String> readAllIndexFiles(String resourcePath) throws Exception {
Map<String, String> mergedIndex = new HashMap<>();
Enumeration<URL> resources = getClass().getClassLoader().getResources(resourcePath);
assertTrue(resources.hasMoreElements(),
"At least one " + resourcePath + " file should exist");
while (resources.hasMoreElements()) {
URL url = resources.nextElement();
try (InputStream stream = url.openStream()) {
Map<String, String> index = readIndexFile(stream);
mergedIndex.putAll(index);
}
}
return mergedIndex;
}
/**
* Reads an index file in the format: name=fully.qualified.ClassName[:key=contextKeyClass]
* Returns a map of component name -> class name (without the :key= suffix).
*/
private Map<String, String> readIndexFile(InputStream stream) throws Exception {
Map<String, String> index = new HashMap<>();
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(stream, StandardCharsets.UTF_8))) {
String line;
while ((line = reader.readLine()) != null) {
// Skip comments and empty lines
if (line.startsWith("#") || line.trim().isEmpty()) {
continue;
}
String[] parts = line.split("=", 2);
if (parts.length == 2) {
String name = parts[0].trim();
String value = parts[1].trim();
// Strip optional :key=contextKeyClass suffix
int colonIndex = value.indexOf(':');
if (colonIndex > 0) {
value = value.substring(0, colonIndex);
}
index.put(name, value);
}
}
}
return index;
}
/**
* Reads all SPI service files from all JARs on the classpath.
* Returns a map of class names (as keys) to true for easy lookup.
*/
private Map<String, Boolean> readAllSpiFiles(String resourcePath) throws Exception {
Map<String, Boolean> spiClasses = new HashMap<>();
Enumeration<URL> resources = getClass().getClassLoader().getResources(resourcePath);
assertTrue(resources.hasMoreElements(),
"At least one " + resourcePath + " file should exist");
while (resources.hasMoreElements()) {
URL url = resources.nextElement();
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
String line;
while ((line = reader.readLine()) != null) {
// Skip comments and empty lines
line = line.trim();
if (line.isEmpty() || line.startsWith("#")) {
continue;
}
spiClasses.put(line, true);
}
}
}
return spiClasses;
}
}