// XmlToJsonConfigConverterTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.cli;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.pdf.PDFParser;

/**
 * Tests for {@link XmlToJsonConfigConverter}.
 * <p>
 * Each test converts an XML configuration to JSON and then inspects the generated
 * JSON text and/or loads it through {@link TikaLoader} to confirm that parsers
 * can be built from it.
 */
public class XmlToJsonConfigConverterTest {

    private static final MediaType PDF_TYPE = MediaType.parse("application/pdf");
    private static final MediaType HTML_TYPE = MediaType.parse("text/html");

    /**
     * Resolves a classpath test resource to a filesystem path.
     * Fails the test with a descriptive message (rather than a bare
     * NullPointerException) when the resource is missing.
     *
     * @param resourceName absolute classpath resource name, e.g. {@code /xml-configs/foo.xml}
     * @return the path of the resource
     * @throws Exception if the resource URL cannot be turned into a path
     */
    private Path resourcePath(String resourceName) throws Exception {
        java.net.URL url = getClass().getResource(resourceName);
        assertNotNull(url, "Test resource not found on classpath: " + resourceName);
        return Paths.get(url.toURI());
    }

    /**
     * Converts the XML classpath resource to JSON at the given output path.
     *
     * @param resourceName absolute classpath resource name of the XML config
     * @param jsonPath     where the converter should write the JSON
     * @throws Exception if the resource cannot be resolved or the conversion fails
     */
    private void convertResource(String resourceName, Path jsonPath) throws Exception {
        XmlToJsonConfigConverter.convert(resourcePath(resourceName), jsonPath);
    }

    /**
     * Reads the file at the given path as a UTF-8 string.
     *
     * @param jsonPath file to read
     * @return the file content
     * @throws Exception if the file cannot be read
     */
    private static String readJson(Path jsonPath) throws Exception {
        return new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8);
    }

    /**
     * Loads the JSON config with {@link TikaLoader} and asserts that a parser is returned.
     *
     * @param jsonPath JSON config to load
     * @return the non-null parser returned by {@code TikaLoader.loadParsers()}
     * @throws Exception if loading fails
     */
    private static Parser loadParsers(Path jsonPath) throws Exception {
        Parser parser = TikaLoader.load(jsonPath).loadParsers();
        assertNotNull(parser);
        return parser;
    }

    /**
     * Same as {@link #loadParsers(Path)}, additionally asserting that the result
     * is a {@link CompositeParser}.
     *
     * @param jsonPath JSON config to load
     * @return the loaded parser, cast to {@link CompositeParser}
     * @throws Exception if loading fails
     */
    private static CompositeParser loadCompositeParser(Path jsonPath) throws Exception {
        Parser parser = loadParsers(jsonPath);
        assertTrue(parser instanceof CompositeParser);
        return (CompositeParser) parser;
    }

    @Test
    public void testSimpleParserConfig(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("simple-config.json");
        convertResource("/xml-configs/tika-config-simple.xml", jsonPath);

        // The converter must have written the output file
        assertTrue(Files.exists(jsonPath));

        // Load the JSON config and look up the parser registered for PDF
        CompositeParser compositeParser = loadCompositeParser(jsonPath);
        Map<MediaType, Parser> parsers = compositeParser.getParsers(new ParseContext());

        assertTrue(parsers.containsKey(PDF_TYPE), "PDF parser should be configured");
        assertTrue(parsers.get(PDF_TYPE) instanceof PDFParser, "Parser for PDF should be PDFParser");

        // The actual parser configuration (sortByPosition, extractInlineImages, etc.)
        // is tested by the parser's behavior, not directly accessible here
    }

    @Test
    public void testParserWithExcludes(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("excludes-config.json");
        convertResource("/xml-configs/tika-config-with-excludes.xml", jsonPath);

        String json = readJson(jsonPath);

        // Verify exclude is at the correct level (with underscore prefix).
        // Note: these are substring checks; they only show that the names
        // appear somewhere in the generated JSON.
        assertTrue(json.contains("\"_exclude\""), "Should have _exclude array");
        assertFalse(json.contains("\"_decorate\""), "_decorate should not be used for parser excludes");
        assertTrue(json.contains("\"jsoup-parser\""), "Should exclude jsoup-parser");
        assertTrue(json.contains("\"pdf-parser\""), "Should exclude pdf-parser");

        CompositeParser compositeParser = loadCompositeParser(jsonPath);

        // No PDFParser may be present among the component parsers
        for (Parser p : compositeParser.getAllComponentParsers()) {
            if (p instanceof PDFParser) {
                fail("pdf parser should have been excluded");
            }
        }

        Map<MediaType, Parser> parsers = compositeParser.getParsers(new ParseContext());

        // The parser registered for text/html must be a JSoupParser
        assertTrue(parsers.containsKey(HTML_TYPE), "HTML parser should be configured");
        assertTrue(parsers.get(HTML_TYPE) instanceof JSoupParser, "Parser for HTML should be JSoupParser");
    }

    @Test
    public void testNumericTypes(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("numeric-config.json");
        convertResource("/xml-configs/tika-config-numeric-types.xml", jsonPath);

        assertTrue(Files.exists(jsonPath));

        String json = readJson(jsonPath);

        // Numbers must not be quoted (expected: "density" : 300, not "density" : "300")
        assertTrue(json.contains("\"density\" : 300"), "density should be numeric, not string");
        assertFalse(json.contains("\"timeout\" : \"300\""), "timeout should not be a quoted string");

        // The generated JSON must also be loadable
        loadParsers(jsonPath);
    }

    @Test
    public void testFileConversion(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("output.json");

        // Test the Path-based conversion method
        convertResource("/xml-configs/tika-config-simple.xml", jsonPath);

        assertTrue(Files.exists(jsonPath));

        // Verify it can be loaded by TikaLoader
        loadCompositeParser(jsonPath);
    }

    @Test
    public void testClassNameConversion(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("classname-config.json");
        convertResource("/xml-configs/tika-config-simple.xml", jsonPath);

        String json = readJson(jsonPath);

        // Verify that PDFParser was converted to pdf-parser (kebab-case)
        assertTrue(json.contains("\"pdf-parser\""), "PDFParser should be converted to pdf-parser");

        // Verify the config loads successfully
        loadParsers(jsonPath);
    }

    @Test
    public void testAutoDetectParserLoading(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("autodetect-config.json");
        convertResource("/xml-configs/tika-config-simple.xml", jsonPath);

        // Load via TikaLoader and get AutoDetectParser
        Parser autoDetectParser = TikaLoader.load(jsonPath).loadAutoDetectParser();
        assertNotNull(autoDetectParser);

        // Verify it supports PDF type
        assertTrue(autoDetectParser.getSupportedTypes(new ParseContext()).contains(PDF_TYPE),
                "AutoDetectParser should support PDF");
    }

    @Test
    public void testRedundantExclusionWarning(@TempDir Path tempDir) throws Exception {
        // This test demonstrates the old pattern where users excluded parsers from default-parser
        // and then configured those same parsers separately. The converter will log an INFO message
        // informing users that the exclusion is redundant.
        Path jsonPath = tempDir.resolve("redundant-config.json");

        // Convert XML to JSON (this will log the INFO message about redundant exclusions)
        convertResource("/xml-configs/tika-config-redundant-exclusion.xml", jsonPath);

        String json = readJson(jsonPath);

        // Verify the JSON still contains the exclusions (we don't remove them, just inform)
        assertTrue(json.contains("\"_exclude\""), "Should still have _exclude array");
        assertTrue(json.contains("\"pdf-parser\""), "Should have pdf-parser configured");
        assertTrue(json.contains("\"jsoup-parser\""), "Should have jsoup-parser configured");

        // Verify it loads correctly and both media types have a parser registered
        CompositeParser compositeParser = loadCompositeParser(jsonPath);
        Map<MediaType, Parser> parsers = compositeParser.getParsers(new ParseContext());

        assertTrue(parsers.containsKey(PDF_TYPE), "PDF parser should be configured");
        assertTrue(parsers.containsKey(HTML_TYPE), "HTML parser should be configured");
    }

    @Test
    public void testTesseractArbitrarySettings(@TempDir Path tempDir) throws Exception {
        // Test the special case conversion of TesseractOCR's otherTesseractSettings
        String xmlConfig = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
                "<properties>\n" +
                "    <parsers>\n" +
                "        <parser class=\"org.apache.tika.parser.ocr.TesseractOCRParser\">\n" +
                "            <params>\n" +
                "                <param name=\"otherTesseractSettings\" type=\"list\">\n" +
                "                    <string>textord_initialx_ile 0.75</string>\n" +
                "                    <string>textord_noise_hfract 0.15625</string>\n" +
                "                </param>\n" +
                "            </params>\n" +
                "        </parser>\n" +
                "    </parsers>\n" +
                "</properties>";

        Path xmlPath = tempDir.resolve("tesseract-arbitrary.xml");
        Path jsonPath = tempDir.resolve("tesseract-arbitrary.json");
        Files.write(xmlPath, xmlConfig.getBytes(StandardCharsets.UTF_8));

        XmlToJsonConfigConverter.convert(xmlPath, jsonPath);

        String json = readJson(jsonPath);

        // Verify conversion: list of space-delimited pairs -> map
        assertTrue(json.contains("\"otherTesseractConfig\""),
                "Should convert to otherTesseractConfig");
        assertFalse(json.contains("\"otherTesseractSettings\""),
                "Should not keep old parameter name");
        assertTrue(json.contains("\"textord_initialx_ile\" : \"0.75\""),
                "Should parse key-value pairs correctly");
        assertTrue(json.contains("\"textord_noise_hfract\" : \"0.15625\""),
                "Should parse second pair");

        // Verify it loads via TikaLoader without errors
        loadParsers(jsonPath);
    }

    @Test
    public void testListAndMapParameterTypes(@TempDir Path tempDir) throws Exception {
        Path jsonPath = tempDir.resolve("list-map-config.json");
        convertResource("/xml-configs/tika-config-list-map-types.xml", jsonPath);

        String json = readJson(jsonPath);

        // Verify otherTesseractSettings (list) is converted to otherTesseractConfig (map)
        // This is a special case where space-delimited key-value pairs are parsed
        assertTrue(json.contains("\"otherTesseractConfig\" : {"),
                "Should convert otherTesseractSettings list to otherTesseractConfig map");
        assertFalse(json.contains("\"otherTesseractSettings\""),
                "Should not have old otherTesseractSettings name");
        assertTrue(json.contains("\"textord_initialx_ile\" : \"0.75\""),
                "Should parse first key-value pair");
        assertTrue(json.contains("\"textord_noise_hfract\" : \"0.15625\""),
                "Should parse second key-value pair");
        assertTrue(json.contains("\"preserve_interword_spaces\" : \"1\""),
                "Should parse third key-value pair");

        // Verify regular parameters still work
        assertTrue(json.contains("\"timeoutSeconds\" : 300"), "Should have integer parameter");
        assertTrue(json.contains("\"enableImagePreprocessing\" : true"), "Should have boolean parameter");
        assertTrue(json.contains("\"language\" : \"eng\""), "Should have string parameter");

        // Verify it loads correctly via TikaLoader
        loadCompositeParser(jsonPath);
    }
}