// TikaJsonConfigTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config.loader;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.databind.JsonNode;
import org.junit.jupiter.api.Test;
/**
* Unit tests for TikaJsonConfig parsing functionality.
*/
public class TikaJsonConfigTest {
@Test
public void testStringShorthandForParsers() throws Exception {
String json = """
{
"parsers": [
"html-parser",
{ "pdf-parser": { "ocrStrategy": "AUTO" } },
"txt-parser"
]
}
""";
TikaJsonConfig config = TikaJsonConfig.load(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");
assertEquals(3, parsers.size(), "Should have 3 parsers");
// First parser: string shorthand
assertEquals("html-parser", parsers.get(0).getKey());
assertTrue(parsers.get(0).getValue().isEmpty(), "Should have empty config for shorthand");
// Second parser: full object syntax
assertEquals("pdf-parser", parsers.get(1).getKey());
assertEquals("AUTO", parsers.get(1).getValue().get("ocrStrategy").asText());
// Third parser: string shorthand
assertEquals("txt-parser", parsers.get(2).getKey());
assertTrue(parsers.get(2).getValue().isEmpty(), "Should have empty config for shorthand");
}
@Test
public void testStringShorthandForDetectors() throws Exception {
String json = """
{
"detectors": [
"poifs-container-detector",
{ "default-detector": { "spoolTypes": ["application/zip", "application/pdf"] } },
"zip-container-detector"
]
}
""";
TikaJsonConfig config = TikaJsonConfig.load(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors");
assertEquals(3, detectors.size(), "Should have 3 detectors");
assertEquals("poifs-container-detector", detectors.get(0).getKey());
assertTrue(detectors.get(0).getValue().isEmpty());
assertEquals("default-detector", detectors.get(1).getKey());
assertTrue(detectors.get(1).getValue().get("spoolTypes").isArray());
assertEquals("zip-container-detector", detectors.get(2).getKey());
assertTrue(detectors.get(2).getValue().isEmpty());
}
@Test
public void testStringShorthandForMetadataFilters() throws Exception {
String json = """
{
"metadata-filters": [
"date-normalizing-metadata-filter",
{ "field-name-mapping-filter": { "excludeUnmapped": true } }
]
}
""";
TikaJsonConfig config = TikaJsonConfig.load(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
List<Map.Entry<String, JsonNode>> filters = config.getArrayComponents("metadata-filters");
assertEquals(2, filters.size(), "Should have 2 filters");
assertEquals("date-normalizing-metadata-filter", filters.get(0).getKey());
assertTrue(filters.get(0).getValue().isEmpty());
assertEquals("field-name-mapping-filter", filters.get(1).getKey());
assertTrue(filters.get(1).getValue().get("excludeUnmapped").asBoolean());
}
@Test
public void testMixedShorthandAndObjectSyntax() throws Exception {
String json = """
{
"parsers": [
"first-parser",
{ "second-parser": { "option": "value" } },
"third-parser",
{ "fourth-parser": {} },
"fifth-parser"
]
}
""";
TikaJsonConfig config = TikaJsonConfig.load(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");
assertEquals(5, parsers.size(), "Should have 5 parsers");
// Verify ordering is preserved
assertEquals("first-parser", parsers.get(0).getKey());
assertEquals("second-parser", parsers.get(1).getKey());
assertEquals("third-parser", parsers.get(2).getKey());
assertEquals("fourth-parser", parsers.get(3).getKey());
assertEquals("fifth-parser", parsers.get(4).getKey());
// Verify configs
assertTrue(parsers.get(0).getValue().isEmpty());
assertEquals("value", parsers.get(1).getValue().get("option").asText());
assertTrue(parsers.get(2).getValue().isEmpty());
assertTrue(parsers.get(3).getValue().isEmpty());
assertTrue(parsers.get(4).getValue().isEmpty());
}
@Test
public void testAllStringsShorthand() throws Exception {
String json = """
{
"detectors": [
"detector-a",
"detector-b",
"detector-c"
]
}
""";
TikaJsonConfig config = TikaJsonConfig.load(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors");
assertEquals(3, detectors.size());
for (Map.Entry<String, JsonNode> entry : detectors) {
assertNotNull(entry.getKey());
assertTrue(entry.getValue().isEmpty(),
"All shorthand entries should have empty config");
}
}
@Test
public void testEmptyArrayWithShorthand() throws Exception {
String json = """
{
"parsers": []
}
""";
TikaJsonConfig config = TikaJsonConfig.load(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");
assertTrue(parsers.isEmpty(), "Empty array should return empty list");
}
}