TikaLoaderRoundTripTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.Test;

import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Tests for TikaLoader round-trip serialization (load -> toJson -> reload).
 */
public class TikaLoaderRoundTripTest {

    @Test
    void testBasicRoundTrip() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        // Load config
        TikaLoader loader = TikaLoader.load(configPath);

        // Force component loading
        Parser parser = loader.get(Parser.class);
        assertNotNull(parser);

        // Serialize
        String json = loader.toJson();
        assertNotNull(json);
        assertTrue(json.contains("parsers"));

        // Reload from serialized JSON
        Path tempFile = Files.createTempFile("tika-roundtrip-", ".json");
        try {
            Files.writeString(tempFile, json);
            TikaLoader reloaded = TikaLoader.load(tempFile);

            Parser reloadedParser = reloaded.get(Parser.class);
            assertNotNull(reloadedParser);
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

    @Test
    void testExclusionsPreservedInRoundTrip() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        // Load config with exclusions
        TikaLoader loader = TikaLoader.load(configPath);
        CompositeParser parser = (CompositeParser) loader.get(Parser.class);

        // Verify exclusions are applied
        ParseContext context = new ParseContext();
        assertFalse(parser.getSupportedTypes(context).contains(MediaType.parse("application/test+minimal")),
                "minimal-test-parser should be excluded");

        // Round-trip
        String json = loader.toJson();
        Path tempFile = Files.createTempFile("tika-exclusions-", ".json");
        try {
            Files.writeString(tempFile, json);
            TikaLoader reloaded = TikaLoader.load(tempFile);

            CompositeParser reloadedParser = (CompositeParser) reloaded.get(Parser.class);

            // Exclusions should still be applied
            assertFalse(reloadedParser.getSupportedTypes(context)
                            .contains(MediaType.parse("application/test+minimal")),
                    "Exclusions should survive round-trip");
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

    @Test
    void testMultipleRoundTripsStability() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        loader.get(Parser.class); // Force loading

        // Perform 3 round-trips
        for (int i = 0; i < 3; i++) {
            String json = loader.toJson();
            Path tempFile = Files.createTempFile("tika-multi-", ".json");
            try {
                Files.writeString(tempFile, json);
                loader = TikaLoader.load(tempFile);
                loader.get(Parser.class); // Force loading
            } finally {
                Files.deleteIfExists(tempFile);
            }
        }

        // Should still work
        Parser parser = loader.get(Parser.class);
        assertNotNull(parser);
    }

    @Test
    void testUnloadedComponentsPreserved() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        // Load but DON'T access Parser (lazy loading)
        TikaLoader loader = TikaLoader.load(configPath);

        // Serialize without loading Parser
        String json = loader.toJson();

        // Should still contain parsers section from original JSON
        ObjectMapper mapper = new ObjectMapper();
        JsonNode node = mapper.readTree(json);
        assertTrue(node.has("parsers"), "Unloaded component should be preserved");
    }

    @Test
    void testToJsonFormat() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        loader.get(Parser.class);

        String json = loader.toJson();

        // Should be pretty-printed
        assertTrue(json.contains("\n"), "Should be pretty-printed");

        // Should be valid JSON
        ObjectMapper mapper = new ObjectMapper();
        JsonNode node = mapper.readTree(json);
        assertNotNull(node);
    }

    @Test
    void testSaveToFile() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        loader.get(Parser.class);

        // Save to file
        Path tempFile = Files.createTempFile("tika-save-", ".json");
        try {
            loader.save(tempFile.toFile());

            // Verify file was written and is valid JSON
            String content = Files.readString(tempFile);
            assertTrue(content.contains("parsers"));

            // Should be loadable
            TikaLoader reloaded = TikaLoader.load(tempFile);
            assertNotNull(reloaded.get(Parser.class));
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

    @Test
    void testDefaultLoaderToJson() throws Exception {
        // Default loader with no config file
        TikaLoader loader = TikaLoader.loadDefault();
        loader.get(Parser.class);

        String json = loader.toJson();
        assertNotNull(json);

        // Should produce valid JSON with parsers
        ObjectMapper mapper = new ObjectMapper();
        JsonNode node = mapper.readTree(json);
        assertTrue(node.has("parsers"), "Default loader should serialize parsers");
    }

    @Test
    void testExcludeFieldInOutput() throws Exception {
        URL configUrl = getClass().getResource("/configs/test-default-parser-with-exclusions.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        loader.get(Parser.class);

        String json = loader.toJson();

        // Should contain exclude in output (for default-parser exclusions)
        assertTrue(json.contains("exclude"),
                "Exclusions should be in output");
    }

    @Test
    void testMimeFilteringRoundTrip() throws Exception {
        // Test that _mime-include/_mime-exclude survives round-trip
        URL configUrl = getClass().getResource("/configs/test-decoration-config.json");
        Path configPath = Path.of(configUrl.toURI());

        // Load config with mime filtering
        TikaLoader loader = TikaLoader.load(configPath);
        Parser parser = loader.get(Parser.class);
        ParseContext context = new ParseContext();

        // Verify initial filtering works
        assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf")),
                "Should support application/pdf");
        assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("text/plain")),
                "Should support text/plain");
        assertFalse(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf+fdf")),
                "Should NOT support application/pdf+fdf (excluded)");

        // Round-trip
        String json = loader.toJson();

        // Verify JSON contains the mime filter fields
        assertTrue(json.contains("_mime-include") || json.contains("mime-include"),
                "JSON should contain mime-include");

        // Reload and verify filtering still works
        Path tempFile = Files.createTempFile("tika-mime-filter-", ".json");
        try {
            Files.writeString(tempFile, json);
            TikaLoader reloaded = TikaLoader.load(tempFile);
            Parser reloadedParser = reloaded.get(Parser.class);

            assertTrue(reloadedParser.getSupportedTypes(context).contains(MediaType.parse("application/pdf")),
                    "After round-trip: Should support application/pdf");
            assertTrue(reloadedParser.getSupportedTypes(context).contains(MediaType.parse("text/plain")),
                    "After round-trip: Should support text/plain");
            assertFalse(reloadedParser.getSupportedTypes(context).contains(MediaType.parse("application/pdf+fdf")),
                    "After round-trip: Should NOT support application/pdf+fdf");
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

    @Test
    void testMimeFilteringMultipleRoundTrips() throws Exception {
        // Test stability across multiple round-trips
        URL configUrl = getClass().getResource("/configs/test-decoration-config.json");
        Path configPath = Path.of(configUrl.toURI());

        TikaLoader loader = TikaLoader.load(configPath);
        ParseContext context = new ParseContext();

        // Perform 3 round-trips
        for (int i = 0; i < 3; i++) {
            Parser parser = loader.get(Parser.class);

            // Verify filtering works after each round-trip
            assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf")),
                    "Round " + i + ": Should support application/pdf");
            assertFalse(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf+fdf")),
                    "Round " + i + ": Should NOT support application/pdf+fdf");

            String json = loader.toJson();
            Path tempFile = Files.createTempFile("tika-mime-multi-", ".json");
            try {
                Files.writeString(tempFile, json);
                loader = TikaLoader.load(tempFile);
            } finally {
                Files.deleteIfExists(tempFile);
            }
        }
    }
}