TikaLoaderTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.config.EmbeddedLimits;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Unit tests for TikaLoader JSON configuration loading.
 */
public class TikaLoaderTest {

    @Test
    public void testBasicParserLoading() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-loader-config.json");
        assertNotNull(configUrl, "Test config not found");

        Path configPath = Path.of(configUrl.toURI());
        TikaLoader loader = TikaLoader.load(configPath);

        Parser parser = loader.get(Parser.class);
        assertNotNull(parser, "Parser should not be null");
    }

    @Test
    public void testConfigurableParserConfiguration() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-loader-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        // Parse with the composite parser to verify config was applied
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "application/test+configurable");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
        }

        // Verify the configured values were used
        assertEquals("configured-parser", metadata.get("parser-name"));
        assertEquals("2048", metadata.get("buffer-size"));
        assertEquals("true", metadata.get("enabled"));
        assertEquals("advanced", metadata.get("mode"));
    }

    @Test
    public void testMimeTypeDecoration() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-decoration-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser parser = loader.get(Parser.class);

        ParseContext context = new ParseContext();

        // Test that included types are supported
        assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf")),
                "Should support application/pdf");
        assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("text/plain")),
                "Should support text/plain");

        // Test that excluded types are not supported
        assertFalse(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf+fdf")),
                "Should NOT support application/pdf+fdf (excluded)");
    }

    @Test
    public void testLazyLoading() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-loader-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);

        // Verify loader created but parsers not yet loaded
        assertNotNull(loader, "Loader should be created");

        // Load parsers
        Parser parser1 = loader.get(Parser.class);
        assertNotNull(parser1, "First load should return parser");

        // Load again - should return cached instance
        Parser parser2 = loader.get(Parser.class);
        assertTrue(parser1 == parser2, "Should return same cached instance");
    }

    @Test
    public void testMinimalParser() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-loader-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        // Parse with minimal parser type
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "application/test+minimal");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
        }

        // Verify minimal parser was invoked
        assertEquals("minimal", metadata.get("parser-type"));
    }

    @Test
    public void testFallbackConfiguration() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-loader-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        // Parse with fallback parser type
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "application/test+fallback");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
        }

        // Verify fallback parser was invoked with correct config
        assertEquals("success", metadata.get("fallback-parser"));
        assertEquals("primary parser", metadata.get("message"));
    }

    @Test
    public void testNoDuplicateParsersFromSpi() throws Exception {
        // Config explicitly configures ConfigurableTestParser but not the others
        URL configUrl = getClass().getResource("/configs/test-no-duplicate-parsers.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        // Parse with ConfigurableTestParser - should use the explicitly configured instance
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "application/test+configurable");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
        }

        // Verify it used the configured instance (with "explicitly-configured" name)
        // NOT the SPI instance (which would have "default" name from zero-arg constructor)
        assertEquals("explicitly-configured", metadata.get("parser-name"));
        assertEquals("4096", metadata.get("buffer-size"));

        // Verify other parsers (FallbackTestParser, MinimalTestParser) are still available via SPI
        Metadata fallbackMetadata = new Metadata();
        fallbackMetadata.set(Metadata.CONTENT_TYPE, "application/test+fallback");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), fallbackMetadata, new ParseContext());
        }

        // FallbackTestParser should be loaded from SPI with default config
        assertEquals("success", fallbackMetadata.get("fallback-parser"));
        assertEquals("default message", fallbackMetadata.get("message"));
    }

    @Test
    public void testWithDefaultParserLoadsSpiParsers() throws Exception {
        // Config has "default-parser" so should load SPI parsers
        URL configUrl = getClass().getResource("/configs/test-with-default-parser.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        // Verify ConfigurableTestParser uses the configured instance
        Metadata configurableMetadata = new Metadata();
        configurableMetadata.set(Metadata.CONTENT_TYPE, "application/test+configurable");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), configurableMetadata, new ParseContext());
        }

        assertEquals("with-default-config", configurableMetadata.get("parser-name"));
        assertEquals("1024", configurableMetadata.get("buffer-size"));

        // Verify FallbackTestParser was loaded from SPI
        Metadata fallbackMetadata = new Metadata();
        fallbackMetadata.set(Metadata.CONTENT_TYPE, "application/test+fallback");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), fallbackMetadata, new ParseContext());
        }

        // FallbackTestParser should be loaded from SPI with default config
        assertEquals("success", fallbackMetadata.get("fallback-parser"));
    }

    @Test
    public void testWithoutDefaultParserSkipsSpiParsers() throws Exception {
        // Config does NOT have "default-parser" so should only load configured parsers
        URL configUrl = getClass().getResource("/configs/test-no-spi-fallback.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        ParseContext context = new ParseContext();

        // Verify ConfigurableTestParser is supported (explicitly configured)
        assertTrue(compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+configurable")),
                "Should support application/test+configurable");

        // Verify FallbackTestParser is NOT supported (not configured, SPI skipped)
        assertTrue(!compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+fallback")),
                "Should NOT support application/test+fallback");

        // Verify MinimalTestParser is NOT supported (not configured, SPI skipped)
        assertTrue(!compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+minimal")),
                "Should NOT support application/test+minimal");
    }

    @Test
    public void testDefaultParserWithExclusions() throws Exception {
        // Config has "default-parser" with exclude list
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        ParseContext context = new ParseContext();

        // Verify ConfigurableTestParser is supported (explicitly configured)
        assertTrue(compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+configurable")),
                "Should support application/test+configurable");

        // Verify MinimalTestParser is NOT supported (excluded via default-parser config)
        assertTrue(!compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+minimal")),
                "Should NOT support application/test+minimal (excluded)");

        // Verify FallbackTestParser is NOT supported (excluded via default-parser config)
        assertTrue(!compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+fallback")),
                "Should NOT support application/test+fallback (excluded)");
    }

    @Test
    public void testOptInParserExplicitLoad() throws Exception {
        // Config explicitly loads opt-in parser (spi=false)
        URL configUrl = getClass().getResource("/configs/test-opt-in-parser-explicit.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        // Parse with the opt-in parser
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "application/test+optin");

        try (TikaInputStream tis = TikaInputStream.get("test".getBytes(StandardCharsets.UTF_8))) {
            compositeParser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
        }

        // Verify opt-in parser was loaded
        assertEquals("opt-in", metadata.get("parser-type"));
        assertEquals("success", metadata.get("opt-in-parser"));
    }

    @Test
    public void testOptInParserNotLoadedBySpi() throws Exception {
        // Config uses default-parser - should NOT load opt-in parser (spi=false)
        URL configUrl = getClass().getResource("/configs/test-opt-in-parser-with-default.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        Parser compositeParser = loader.get(Parser.class);

        ParseContext context = new ParseContext();

        // Verify regular SPI parsers are supported
        assertTrue(compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+configurable")),
                "Should support application/test+configurable (SPI)");

        // Verify opt-in parser is NOT supported (spi=false, not explicitly configured)
        assertTrue(!compositeParser.getSupportedTypes(context)
                        .contains(MediaType.parse("application/test+optin")),
                "Should NOT support application/test+optin (opt-in only, not in SPI)");
    }

    @Test
    public void testLoadConfigWithDefaults() throws Exception {
        // Test the loadConfig method that merges JSON config with defaults
        URL configUrl = getClass().getResource("/configs/embedded-limits-test.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);

        // Create defaults - some values will be overridden by JSON, others kept
        EmbeddedLimits defaults = new EmbeddedLimits();
        // Default values from EmbeddedLimits: maxDepth=UNLIMITED, maxCount=UNLIMITED, throwOnMax*=false

        // Load with defaults - JSON has: maxDepth=5, throwOnMaxDepth=true, maxCount=100, throwOnMaxCount=false
        EmbeddedLimits config = loader.loadConfig(EmbeddedLimits.class, defaults);

        assertNotNull(config, "Config should not be null");
        assertEquals(5, config.getMaxDepth(), "maxDepth should be from JSON");
        assertTrue(config.isThrowOnMaxDepth(), "throwOnMaxDepth should be from JSON");
        assertEquals(100, config.getMaxCount(), "maxCount should be from JSON");
        assertFalse(config.isThrowOnMaxCount(), "throwOnMaxCount should be from JSON");

        // Verify original defaults object was NOT modified
        assertEquals(EmbeddedLimits.UNLIMITED, defaults.getMaxDepth(), "Original defaults should be unchanged");
    }

    @Test
    public void testLoadConfigMissingKeyReturnsDefaults() throws Exception {
        // Test that loadConfig returns defaults when key is not in config
        URL configUrl = getClass().getResource("/configs/test-loader-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);

        // Create defaults
        EmbeddedLimits defaults = new EmbeddedLimits(10, true, 500, false);

        // Load with defaults - this config doesn't have embedded-limits
        EmbeddedLimits config = loader.loadConfig(EmbeddedLimits.class, defaults);

        // Should return the defaults since key is missing
        assertEquals(10, config.getMaxDepth(), "Should return defaults when key missing");
        assertTrue(config.isThrowOnMaxDepth(), "Should return defaults when key missing");
        assertEquals(500, config.getMaxCount(), "Should return defaults when key missing");
        assertFalse(config.isThrowOnMaxCount(), "Should return defaults when key missing");
    }

    // TODO: TIKA-SERIALIZATION-FOLLOWUP - Jackson may need configuration to fail on unknown properties
    @Disabled("TIKA-SERIALIZATION-FOLLOWUP")
    @Test
    public void testInvalidBeanPropertyThrowsException() throws Exception {
        // Config with a property that doesn't exist on DefaultDetector
        String invalidConfig = """
                {
                  "detectors": [
                    {
                      "default-detector": {
                        "nonExistentProperty": 12345
                      }
                    }
                  ]
                }
                """;

        Path tempFile = Files.createTempFile("test-invalid-property", ".json");
        try {
            Files.write(tempFile, invalidConfig.getBytes(StandardCharsets.UTF_8));

            TikaLoader loader = TikaLoader.load(tempFile);
            try {
                loader.loadDetectors();
                throw new AssertionError("Expected TikaConfigException for invalid property");
            } catch (org.apache.tika.exception.TikaConfigException e) {
                // Expected - Jackson should fail on unknown property
                assertTrue(e.getMessage().contains("nonExistentProperty") ||
                                e.getCause().getMessage().contains("nonExistentProperty"),
                        "Error should mention the invalid property name");
            }
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

}