SkipContainerDocumentDigestTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.digest;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.digest.SkipContainerDocumentDigest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.digestutils.CommonsDigesterFactory;
/**
 * Verifies when the MD5 digest key is written to the metadata of a container document and
 * of its embedded document, using the MockParser fixture {@code mock_embedded_for_digest.xml}.
 * <p>
 * Digesting is driven entirely by the {@link ParseContext}: a {@link DigesterFactory} entry
 * enables it, and either the factory's {@code skipContainerDocumentDigest} flag or a
 * {@link SkipContainerDocumentDigest} marker entry is expected to suppress the container digest.
 */
public class SkipContainerDocumentDigestTest extends TikaTest {

    private static final String DIGEST_KEY = TikaCoreProperties.TIKA_META_PREFIX + "digest" +
            TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MD5";

    // Fixture with one container document and one embedded document.
    private static final String MOCK_FILE = "mock_embedded_for_digest.xml";

    /**
     * Builds a ParseContext carrying a CommonsDigesterFactory whose
     * skipContainerDocumentDigest flag is set to the given value.
     */
    private static ParseContext newDigestContext(boolean skipContainer) {
        CommonsDigesterFactory digesterFactory = new CommonsDigesterFactory();
        digesterFactory.setSkipContainerDocumentDigest(skipContainer);
        ParseContext parseContext = new ParseContext();
        parseContext.set(DigesterFactory.class, digesterFactory);
        return parseContext;
    }

    /**
     * Parses the mock fixture recursively with a fresh AutoDetectParser and the given context.
     */
    private List<Metadata> parseMockWithContext(ParseContext parseContext) throws Exception {
        AutoDetectParser autoDetect = new AutoDetectParser();
        return getRecursiveMetadata(MOCK_FILE, autoDetect, new Metadata(), parseContext, false);
    }

    /**
     * Returns the MD5 digest value recorded for the document at the given position,
     * or null when none was recorded (index 0 = container, index 1 = embedded).
     */
    private static String md5DigestAt(List<Metadata> docs, int index) {
        return docs.get(index).get(DIGEST_KEY);
    }

    @Test
    public void testDigestContainerAndEmbedded() throws Exception {
        // Flag off on the factory: every document gets digested.
        List<Metadata> docs = parseMockWithContext(newDigestContext(false));

        // container + embedded
        assertEquals(2, docs.size());
        assertNotNull(md5DigestAt(docs, 0),
                "Container document should have digest");
        assertNotNull(md5DigestAt(docs, 1),
                "Embedded document should have digest");
    }

    @Test
    public void testSkipContainerDigestOnly() throws Exception {
        // Flag on: the container is skipped, the embedded document is still digested.
        List<Metadata> docs = parseMockWithContext(newDigestContext(true));

        // container + embedded
        assertEquals(2, docs.size());
        assertNull(md5DigestAt(docs, 0),
                "Container document should NOT have digest when skipContainerDocumentDigest=true");
        assertNotNull(md5DigestAt(docs, 1),
                "Embedded document should have digest");
    }

    @Test
    public void testSkipContainerDocumentDigestMarkerInParseContext() throws Exception {
        // The factory asks for everything to be digested, but the marker placed in the
        // ParseContext takes precedence for the container document.
        ParseContext parseContext = newDigestContext(false);
        parseContext.set(SkipContainerDocumentDigest.class, SkipContainerDocumentDigest.INSTANCE);

        List<Metadata> docs = parseMockWithContext(parseContext);

        // container + embedded
        assertEquals(2, docs.size());
        assertNull(md5DigestAt(docs, 0),
                "Container document should NOT have digest when ParseContext marker is set");
        assertNotNull(md5DigestAt(docs, 1),
                "Embedded document should have digest");
    }

    @Test
    public void testNoDigesterConfigured() throws Exception {
        // Nothing digest-related is supplied here, so no digest should appear anywhere.
        List<Metadata> docs = getRecursiveMetadata(MOCK_FILE, new AutoDetectParser());

        // container + embedded
        assertEquals(2, docs.size());
        assertNull(md5DigestAt(docs, 0),
                "Container should not have digest when no digester configured");
        assertNull(md5DigestAt(docs, 1),
                "Embedded should not have digest when no digester configured");
    }

    @Test
    public void testDigestWithFactoryInParseContext() throws Exception {
        // A DigesterFactory supplied through the ParseContext must actually be picked up.
        List<Metadata> docs = parseMockWithContext(newDigestContext(false));

        // container + embedded
        assertEquals(2, docs.size());
        assertNotNull(md5DigestAt(docs, 0),
                "Container document should have digest when ParseContext provides factory");
        assertNotNull(md5DigestAt(docs, 1),
                "Embedded document should have digest when ParseContext provides factory");
    }

    @Test
    public void testSkipContainerOnFactory() throws Exception {
        // skipContainerDocumentDigest is configured directly on the factory.
        List<Metadata> docs = parseMockWithContext(newDigestContext(true));

        // container + embedded
        assertEquals(2, docs.size());
        assertNull(md5DigestAt(docs, 0),
                "Container document should NOT have digest when factory.skipContainerDocumentDigest=true");
        assertNotNull(md5DigestAt(docs, 1),
                "Embedded document should have digest");
    }
}