// DigestConfigTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.digest;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.List;

import org.junit.jupiter.api.Test;

import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Tests for digest configuration via JSON config files.
 * <p>
 * Every test loads a JSON config through {@link TikaLoaderHelper}, builds the auto-detect
 * parser and the parse context from that loader, parses {@link #TEST_FILE} and then inspects
 * the digest entries written to the resulting {@link Metadata}. Configurations for both the
 * CommonsDigester and the BouncyCastleDigester are exercised.
 */
public class DigestConfigTest extends TikaTest {

    /** Document parsed by every test in this class. */
    private static final String TEST_FILE = "test_recursive_embedded.docx";

    /** Prefix shared by all digest metadata keys; the algorithm name is appended to it. */
    private static final String DIGEST_PREFIX = TikaCoreProperties.TIKA_META_PREFIX + "digest" +
            TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    // Expected digest values for test_recursive_embedded.docx (lowercase hex)
    private static final String EXPECTED_MD2 = "d768c8e27b0b52c6eaabfaa7122d1d4f";
    private static final String EXPECTED_MD5 = "59f626e09a8c16ab6dbc2800c685f772";
    private static final String EXPECTED_SHA1 = "7a1f001d163ac90d8ea54c050faf5a38079788a6";
    private static final String EXPECTED_SHA256 =
            "c4b7fab030a8b6a9d6691f6699ac8e6f" + "82bc53764a0f1430d134ae3b70c32654";
    private static final String EXPECTED_SHA384 =
            "ebe368b9326fef44408290724d187553" + "8b8a6923fdf251ddab72c6e4b5d54160" +
                    "9db917ba4260d1767995a844d8d654df";
    private static final String EXPECTED_SHA512 =
            "ee46d973ee1852c018580c242955974d" + "da4c21f36b54d7acd06fcf68e974663b" +
                    "fed1d256875be58d22beacf178154cc3" + "a1178cb73443deaa53aa0840324708bb";
    private static final String EXPECTED_SHA3_512 =
            "04337f667a250348a1acb992863b3ddc" + "eab38365c206c18d356d2b31675ad669" +
                    "5fb5497f4e79b11640aefbb8042a5dbb" + "7ec6c2c6c1b6e19210453591c52cb6eb";
    // SHA1 of the same document, BASE32 encoded
    private static final String EXPECTED_SHA1_BASE32 = "PIPQAHIWHLEQ3DVFJQCQ7L22HADZPCFG";

    // ================= CommonsDigester Tests =================

    @Test
    public void testCommonsDigesterBasic() throws Exception {
        assertBasicDigests(parseContainer("tika-config-commons-digests-basic.json"));
    }

    @Test
    public void testCommonsDigesterWithBase32() throws Exception {
        Metadata m = parseContainer("tika-config-digests.json");

        // SHA256 with BASE32 encoding - just verify it exists with non-default key
        assertNotNull(m.get(DIGEST_PREFIX + "SHA256:BASE32"),
                "SHA256:BASE32 digest should be present");
        // MD5 with default HEX encoding
        assertEquals(EXPECTED_MD5, m.get(DIGEST_PREFIX + "MD5"), "MD5 digest should match");
    }

    @Test
    public void testCommonsDigesterLengthsCalculated() throws Exception {
        // This tests that TIKA-4016 added lengths
        assertContentLengthOnEveryDocument(
                parseRecursively("tika-config-commons-digests-basic.json"));
    }

    @Test
    public void testCommonsDigesterSkipContainer() throws Exception {
        // Tests skipContainerDocumentDigest on the factory (configured in parse-context)
        List<Metadata> metadataList = parseRecursively("tika-config-digests-skip-container.json");

        // Without at least one embedded document the loop below would assert nothing and
        // the test would pass without checking the embedded-document half of the contract.
        assertTrue(metadataList.size() > 1,
                "Expected the container plus at least one embedded document in " + TEST_FILE);

        // Container should NOT have digest
        assertNull(metadataList.get(0).get(DIGEST_PREFIX + "MD5"),
                "Container document should NOT have digest when skipContainerDocumentDigest=true");

        // Embedded documents should have digest
        for (int i = 1; i < metadataList.size(); i++) {
            assertNotNull(metadataList.get(i).get(DIGEST_PREFIX + "MD5"),
                    "Embedded document " + i + " should have digest");
        }
    }

    // ================= BouncyCastleDigester Tests =================

    @Test
    public void testBouncyCastleDigesterBasic() throws Exception {
        assertBasicDigests(parseContainer("tika-config-bc-digests-basic.json"));
    }

    @Test
    public void testBouncyCastleDigesterMultipleAlgorithms() throws Exception {
        Metadata m = parseContainer("tika-config-bc-digests-multiple.json");

        assertEquals(EXPECTED_MD5, m.get(DIGEST_PREFIX + "MD5"), "MD5 digest should match");
        assertEquals(EXPECTED_SHA256, m.get(DIGEST_PREFIX + "SHA256"),
                "SHA256 digest should match");
        assertEquals(EXPECTED_SHA384, m.get(DIGEST_PREFIX + "SHA384"),
                "SHA384 digest should match");
        assertEquals(EXPECTED_SHA512, m.get(DIGEST_PREFIX + "SHA512"),
                "SHA512 digest should match");
        assertEquals(EXPECTED_SHA3_512, m.get(DIGEST_PREFIX + "SHA3_512"),
                "SHA3_512 digest should match");

        // MD2 was not configured
        assertNull(m.get(DIGEST_PREFIX + "MD2"), "MD2 should not be present");
    }

    @Test
    public void testBouncyCastleDigesterBase32Encoding() throws Exception {
        Metadata m = parseContainer("tika-config-bc-digests-base32.json");

        // Non-default encoding includes encoding in the key
        assertEquals(EXPECTED_SHA1_BASE32, m.get(DIGEST_PREFIX + "SHA1:BASE32"),
                "SHA1 BASE32 digest should match");
    }

    @Test
    public void testBouncyCastleDigesterLengthsCalculated() throws Exception {
        assertContentLengthOnEveryDocument(parseRecursively("tika-config-bc-digests-basic.json"));
    }

    // ================= Helpers =================

    /**
     * Parses {@link #TEST_FILE} with the parser and parse context loaded from the given config
     * and returns the metadata object that was handed to the parse.
     *
     * @param configName name of the JSON config passed to {@link TikaLoaderHelper#getLoader}
     * @return the metadata populated while parsing the test file
     * @throws Exception if loading the config or parsing the file fails
     */
    private Metadata parseContainer(String configName) throws Exception {
        TikaLoader loader = TikaLoaderHelper.getLoader(configName);
        Parser parser = loader.loadAutoDetectParser();
        ParseContext context = loader.loadParseContext();
        Metadata metadata = new Metadata();
        getXML(TEST_FILE, parser, metadata, context);
        return metadata;
    }

    /**
     * Parses {@link #TEST_FILE} recursively with the parser and parse context loaded from the
     * given config.
     *
     * @param configName name of the JSON config passed to {@link TikaLoaderHelper#getLoader}
     * @return the metadata list returned by {@code getRecursiveMetadata}; the tests treat
     *         index 0 as the container document and the remaining entries as embedded documents
     * @throws Exception if loading the config or parsing the file fails
     */
    private List<Metadata> parseRecursively(String configName) throws Exception {
        TikaLoader loader = TikaLoaderHelper.getLoader(configName);
        Parser parser = loader.loadAutoDetectParser();
        ParseContext context = loader.loadParseContext();
        return getRecursiveMetadata(TEST_FILE, parser, context);
    }

    /**
     * Asserts the six hex-encoded digests (MD2, MD5, SHA1, SHA256, SHA384, SHA512) that both
     * "basic" configurations are expected to produce for {@link #TEST_FILE}.
     *
     * @param m metadata of the parsed container document
     */
    private static void assertBasicDigests(Metadata m) {
        assertEquals(EXPECTED_MD2, m.get(DIGEST_PREFIX + "MD2"), "MD2 digest should match");
        assertEquals(EXPECTED_MD5, m.get(DIGEST_PREFIX + "MD5"), "MD5 digest should match");
        assertEquals(EXPECTED_SHA1, m.get(DIGEST_PREFIX + "SHA1"), "SHA1 digest should match");
        assertEquals(EXPECTED_SHA256, m.get(DIGEST_PREFIX + "SHA256"),
                "SHA256 digest should match");
        assertEquals(EXPECTED_SHA384, m.get(DIGEST_PREFIX + "SHA384"),
                "SHA384 digest should match");
        assertEquals(EXPECTED_SHA512, m.get(DIGEST_PREFIX + "SHA512"),
                "SHA512 digest should match");
    }

    /**
     * Asserts that the list is non-empty and that every entry carries a
     * {@link Metadata#CONTENT_LENGTH} value.
     *
     * @param metadataList metadata of the container and its embedded documents
     */
    private static void assertContentLengthOnEveryDocument(List<Metadata> metadataList) {
        // Guard against a vacuous pass: an empty list would skip every assertion below.
        assertFalse(metadataList.isEmpty(),
                "Recursive parse of " + TEST_FILE + " returned no metadata");
        for (int i = 0; i < metadataList.size(); i++) {
            assertNotNull(metadataList.get(i).get(Metadata.CONTENT_LENGTH),
                    "Document " + i + " should have a content length");
        }
    }
}