// ZipParserTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pkg;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;

import java.io.ByteArrayOutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.codec.binary.Base64;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.Zip;
import org.apache.tika.parser.ParseContext;
/**
* Test case for parsing zip files.
*/
public class ZipParserTest extends AbstractPkgTest {

    /**
     * Base64-encoded ZIP containing a single text entry whose file name is
     * encoded in Shift_JIS ("\u65E5\u672C\u8A9E\u30E1\u30E2.txt"). Shared by
     * the entry-name charset tests (TIKA-936).
     */
    private static final String SJIS_NAME_ZIP_BASE64 =
            "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" +
            "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" +
            "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" +
            "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA==";

    /**
     * Parses the SJIS-named test ZIP ({@link #SJIS_NAME_ZIP_BASE64}) with the
     * given parse context and returns the recursive metadata list.
     */
    private List<Metadata> parseSjisZip(ParseContext context) throws Exception {
        try (TikaInputStream tis =
                TikaInputStream.get(Base64.decodeBase64(SJIS_NAME_ZIP_BASE64))) {
            return getRecursiveMetadata(tis, new Metadata(), context, false);
        }
    }

    /**
     * Tests that the ParseContext parser is correctly
     * fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip");
        // First metadata is the container, rest are embedded documents.
        // With recursive parsing, we get more than 10 entries due to nested documents
        // (e.g., ODT, PPT, DOC contain embedded resources).
        assertTrue(metadataList.size() >= 10, "Expected at least 10 metadata entries");
        // Collect all resource names for verification
        List<String> resourceNames = new ArrayList<>();
        for (Metadata m : metadataList) {
            String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY);
            if (name != null) {
                resourceNames.add(name);
            }
        }
        // Should contain all 9 direct embedded files from the ZIP
        assertContains("testEXCEL.xls", resourceNames);
        assertContains("testHTML.html", resourceNames);
        assertContains("testOpenOffice2.odt", resourceNames);
        assertContains("testPDF.pdf", resourceNames);
        assertContains("testPPT.ppt", resourceNames);
        assertContains("testRTF.rtf", resourceNames);
        assertContains("testTXT.txt", resourceNames);
        assertContains("testWORD.doc", resourceNames);
        assertContains("testXML.xml", resourceNames);
    }

    /**
     * Test case for the ability of the ZIP parser to extract the name of
     * a ZIP entry even if the content of the entry is unreadable due to an
     * unsupported compression method.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
     */
    @Test
    public void testUnsupportedZipCompressionMethod() throws Exception {
        String content = new Tika().parseToString(getResourceAsStream("/test-documents/moby.zip"));
        assertContains("README", content);
    }

    /**
     * Verifies that an explicitly configured entry encoding (SJIS here) is
     * used to decode non-UTF-8 entry names.
     */
    @Test // TIKA-936
    public void testCustomEncoding() throws Exception {
        ZipParserConfig config = new ZipParserConfig();
        config.setEntryEncoding(Charset.forName("SJIS"));
        ParseContext context = new ParseContext();
        context.set(ZipParserConfig.class, config);
        List<Metadata> metadataList = parseSjisZip(context);
        // Container + 1 embedded document
        assertEquals(2, metadataList.size());
        // With the correct SJIS encoding configured, the entry name decodes to
        // the Japanese "\u65E5\u672C\u8A9E\u30E1\u30E2.txt".
        assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
                metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
    }

    /**
     * Verifies that disabling charset detection leaves non-UTF-8 entry names
     * as-is (i.e. garbled) instead of auto-detecting SJIS.
     */
    @Test
    public void testCharsetAutoDetectionDisabled() throws Exception {
        // Test that disabling charset detection leaves non-UTF8 names as-is (garbled)
        ZipParserConfig config = new ZipParserConfig();
        config.setDetectCharsetsInEntryNames(false);
        ParseContext context = new ParseContext();
        context.set(ZipParserConfig.class, config);
        List<Metadata> metadataList = parseSjisZip(context);
        // Container + 1 embedded document
        assertEquals(2, metadataList.size());
        String name = metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY);
        // With detection disabled, the SJIS bytes are interpreted as the default
        // charset (garbled). The correct Japanese name is
        // "\u65E5\u672C\u8A9E\u30E1\u30E2.txt" - verify we DON'T get that.
        assertNotEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", name,
                "With detection disabled, SJIS name should NOT be correctly decoded");
    }

    /**
     * The recursive parser wrapper should survive a ZIP quine without
     * throwing (in contrast to {@link #testQuine()}).
     */
    @Test
    public void testQuineRecursiveParserWrapper() throws Exception {
        //Anti-virus can surreptitiously remove this file
        assumeTrue(
                ZipParserTest.class.getResourceAsStream("/test-documents/droste.zip") != null);
        //received permission from author via dm
        //2019-07-25 to include
        //http://alf.nu/s/droste.zip in unit tests
        //Out of respect to the author, please maintain
        //the original file name
        getRecursiveMetadata("droste.zip");
    }

    /**
     * Plain parsing of a ZIP quine must fail with a TikaException rather than
     * recursing forever.
     */
    @Test
    public void testQuine() {
        //Anti-virus can surreptitiously remove this file
        assumeTrue(
                ZipParserTest.class.getResourceAsStream("/test-documents/droste.zip") != null);
        assertThrows(TikaException.class, () -> {
            getXML("droste.zip");
        });
    }

    /**
     * A ZIP whose entries are STORED with data descriptors should still be
     * salvaged, parsed, and pass the integrity check.
     */
    @Test
    public void testZipUsingStoredWithDataDescriptor() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testZip_with_DataDescriptor.zip");
        // Container + 5 embedded documents
        assertEquals(6, metadataList.size());
        assertEquals("en0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertEquals("en1", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertEquals("en2", metadataList.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertEquals("en3", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertEquals("en4", metadataList.get(5).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        // This ZIP with DATA_DESCRIPTOR is salvaged and parsed with file-based access
        // Integrity check can compare central directory vs local headers
        Metadata containerMetadata = metadataList.get(0);
        assertEquals("PASS", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT));
    }

    /** A well-formed ZIP should pass the integrity check with no anomalies recorded. */
    @Test
    public void testIntegrityCheckPass() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip");
        // Normal ZIP with file-based access should pass integrity check
        Metadata containerMetadata = metadataList.get(0);
        assertEquals("PASS", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT));
        assertNull(containerMetadata.get(Zip.DUPLICATE_ENTRY_NAMES));
        assertNull(containerMetadata.get(Zip.CENTRAL_DIRECTORY_ONLY_ENTRIES));
        assertNull(containerMetadata.get(Zip.LOCAL_HEADER_ONLY_ENTRIES));
    }

    /** When the integrity check is disabled, no result metadata should be set. */
    @Test
    public void testIntegrityCheckDisabled() throws Exception {
        ZipParserConfig config = new ZipParserConfig();
        config.setIntegrityCheck(false);
        ParseContext context = new ParseContext();
        context.set(ZipParserConfig.class, config);
        List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip", context);
        // Integrity check disabled - no result should be set
        Metadata containerMetadata = metadataList.get(0);
        assertNull(containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT));
    }

    /**
     * An entry present only in local headers (a hidden/smuggled entry) must
     * fail the integrity check and be reported by name.
     */
    @Test
    public void testIntegrityCheckHiddenEntry(@TempDir Path tempDir) throws Exception {
        // Create a ZIP with a hidden entry (in local headers but not central directory)
        Path zipPath = tempDir.resolve("hidden-entry.zip");
        byte[] zipBytes = createZipWithHiddenEntry();
        Files.write(zipPath, zipBytes);
        List<Metadata> metadataList = getRecursiveMetadata(zipPath, false);
        Metadata containerMetadata = metadataList.get(0);
        assertEquals("FAIL", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT));
        String[] localOnly = containerMetadata.getValues(Zip.LOCAL_HEADER_ONLY_ENTRIES);
        assertEquals(1, localOnly.length);
        assertEquals("hidden.txt", localOnly[0]);
    }

    /**
     * Creates a ZIP file with an entry that exists in local headers but not in the
     * central directory. This simulates a hidden/smuggled entry attack.
     */
    private byte[] createZipWithHiddenEntry() throws Exception {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Entry 1: visible.txt (will be in both local header and central directory)
        byte[] visible = "visible content".getBytes(StandardCharsets.UTF_8);
        // Entry 2: hidden.txt (will be in local header ONLY - not in central directory)
        byte[] hidden = "hidden content".getBytes(StandardCharsets.UTF_8);
        // Local file header for visible.txt
        int visibleLocalOffset = baos.size();
        writeLocalFileHeader(baos, "visible.txt", visible);
        // Local file header for hidden.txt (this won't have a central directory entry)
        writeLocalFileHeader(baos, "hidden.txt", hidden);
        // Central directory - only includes visible.txt
        int centralDirOffset = baos.size();
        writeCentralDirectoryEntry(baos, "visible.txt", visible, visibleLocalOffset);
        // End of central directory
        int centralDirSize = baos.size() - centralDirOffset;
        writeEndOfCentralDirectory(baos, 1, centralDirSize, centralDirOffset);
        return baos.toByteArray();
    }

    /**
     * Writes a ZIP local file header (signature 0x04034b50) followed by the
     * uncompressed (STORED) entry data.
     */
    private void writeLocalFileHeader(ByteArrayOutputStream baos, String name, byte[] content)
            throws Exception {
        byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8);
        // Local file header signature
        writeInt(baos, 0x04034b50);
        // Version needed
        writeShort(baos, 10);
        // General purpose bit flag
        writeShort(baos, 0);
        // Compression method (0 = stored)
        writeShort(baos, 0);
        // Last mod time/date
        writeShort(baos, 0);
        writeShort(baos, 0);
        // CRC-32
        writeInt(baos, (int) computeCrc32(content));
        // Compressed size
        writeInt(baos, content.length);
        // Uncompressed size
        writeInt(baos, content.length);
        // File name length
        writeShort(baos, nameBytes.length);
        // Extra field length
        writeShort(baos, 0);
        // File name
        baos.write(nameBytes);
        // File data
        baos.write(content);
    }

    /**
     * Writes a central directory file header (signature 0x02014b50) pointing
     * back at the local header at {@code localHeaderOffset}.
     */
    private void writeCentralDirectoryEntry(ByteArrayOutputStream baos, String name,
            byte[] content, int localHeaderOffset) throws Exception {
        byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8);
        // Central directory file header signature
        writeInt(baos, 0x02014b50);
        // Version made by
        writeShort(baos, 20);
        // Version needed
        writeShort(baos, 10);
        // General purpose bit flag
        writeShort(baos, 0);
        // Compression method
        writeShort(baos, 0);
        // Last mod time/date
        writeShort(baos, 0);
        writeShort(baos, 0);
        // CRC-32
        writeInt(baos, (int) computeCrc32(content));
        // Compressed size
        writeInt(baos, content.length);
        // Uncompressed size
        writeInt(baos, content.length);
        // File name length
        writeShort(baos, nameBytes.length);
        // Extra field length
        writeShort(baos, 0);
        // File comment length
        writeShort(baos, 0);
        // Disk number start
        writeShort(baos, 0);
        // Internal file attributes
        writeShort(baos, 0);
        // External file attributes
        writeInt(baos, 0);
        // Relative offset of local header
        writeInt(baos, localHeaderOffset);
        // File name
        baos.write(nameBytes);
    }

    /** Writes the end-of-central-directory record (signature 0x06054b50). */
    private void writeEndOfCentralDirectory(ByteArrayOutputStream baos, int numEntries,
            int centralDirSize, int centralDirOffset) {
        // End of central directory signature
        writeInt(baos, 0x06054b50);
        // Disk number
        writeShort(baos, 0);
        // Disk number with central directory
        writeShort(baos, 0);
        // Number of entries on this disk
        writeShort(baos, numEntries);
        // Total number of entries
        writeShort(baos, numEntries);
        // Size of central directory
        writeInt(baos, centralDirSize);
        // Offset of central directory
        writeInt(baos, centralDirOffset);
        // Comment length
        writeShort(baos, 0);
    }

    /** Writes a 32-bit value in ZIP (little-endian) byte order. */
    private void writeInt(ByteArrayOutputStream baos, int value) {
        baos.write(value & 0xff);
        baos.write((value >> 8) & 0xff);
        baos.write((value >> 16) & 0xff);
        baos.write((value >> 24) & 0xff);
    }

    /** Writes a 16-bit value in ZIP (little-endian) byte order. */
    private void writeShort(ByteArrayOutputStream baos, int value) {
        baos.write(value & 0xff);
        baos.write((value >> 8) & 0xff);
    }

    /** Returns the CRC-32 of {@code data} as an unsigned 32-bit value in a long. */
    private long computeCrc32(byte[] data) {
        java.util.zip.CRC32 crc = new java.util.zip.CRC32();
        crc.update(data);
        return crc.getValue();
    }

    /**
     * Microbenchmark to measure the performance impact of integrity checking.
     * Disabled by default - change the {@code assumeTrue(false, ...)} flag
     * below to {@code true} to run it.
     *
     * WARNING: The large ZIP test creates a multi-GB file and takes significant time.
     */
    @Test
    public void benchmarkIntegrityCheck(@TempDir Path tempDir) throws Exception {
        // Skip by default - set this to true to run the benchmark
        assumeTrue(false, "Benchmark disabled by default - set to true to run");
        int iterations = 20;
        int warmupIterations = 3;
        // Create small ZIP (10 entries, ~1KB each) - ~10KB total
        Path smallZip = tempDir.resolve("small.zip");
        System.out.println("Creating small ZIP (10 entries, ~10KB)...");
        createBenchmarkZip(smallZip, 10, 1024);
        System.out.println(" Created: " + Files.size(smallZip) / 1024 + " KB");
        // Create medium ZIP (1000 entries, ~100KB each) - ~100MB total
        Path mediumZip = tempDir.resolve("medium.zip");
        System.out.println("Creating medium ZIP (1000 entries, ~100MB)...");
        createBenchmarkZip(mediumZip, 1000, 100 * 1024);
        System.out.println(" Created: " + Files.size(mediumZip) / (1024 * 1024) + " MB");
        // Create large ZIP (5000 entries, ~500KB each) - ~2.5GB total
        Path largeZip = tempDir.resolve("large.zip");
        System.out.println("Creating large ZIP (5000 entries, ~2.5GB)...");
        createBenchmarkZip(largeZip, 5000, 500 * 1024);
        System.out.println(" Created: " + Files.size(largeZip) / (1024 * 1024) + " MB");
        System.out.println();
        System.out.println("=== Integrity Check Benchmark ===");
        System.out.println("Iterations: " + iterations + " (warmup: " + warmupIterations + ")");
        System.out.println();
        // Benchmark small ZIP
        System.out.println("Small ZIP (10 entries, ~10KB):");
        runBenchmark(smallZip, iterations, warmupIterations);
        System.out.println();
        // Benchmark medium ZIP (fewer iterations - each parse reads ~100MB)
        System.out.println("Medium ZIP (1000 entries, ~100MB):");
        runBenchmark(mediumZip, 10, 2);
        System.out.println();
        // Benchmark large ZIP (fewest iterations - each parse reads ~2.5GB)
        System.out.println("Large ZIP (5000 entries, ~2.5GB):");
        runBenchmark(largeZip, 5, 1);
    }

    /**
     * Creates a benchmark ZIP at {@code zipPath} with {@code numEntries}
     * STORED entries of {@code entrySize} random bytes each.
     */
    private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize) throws Exception {
        try (java.util.zip.ZipOutputStream zos =
                new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) {
            // Use STORED to avoid compression - we want actual file size
            zos.setMethod(java.util.zip.ZipOutputStream.STORED);
            // Use random data (fixed seed for reproducibility) to prevent any
            // accidental compression
            java.util.Random random = new java.util.Random(42);
            byte[] content = new byte[entrySize];
            random.nextBytes(content);
            for (int i = 0; i < numEntries; i++) {
                java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt");
                entry.setMethod(java.util.zip.ZipEntry.STORED);
                entry.setSize(content.length);
                entry.setCompressedSize(content.length);
                entry.setCrc(computeCrc32(content));
                zos.putNextEntry(entry);
                zos.write(content);
                zos.closeEntry();
            }
        }
    }

    /**
     * Times parsing of {@code zipPath} with and without the integrity check
     * and prints per-parse averages plus the overhead.
     */
    private void runBenchmark(Path zipPath, int iterations, int warmupIterations) throws Exception {
        ZipParser parser = new ZipParser();
        // Config with integrity check enabled
        ZipParserConfig configWithCheck = new ZipParserConfig();
        configWithCheck.setIntegrityCheck(true);
        // Config with integrity check disabled
        ZipParserConfig configWithoutCheck = new ZipParserConfig();
        configWithoutCheck.setIntegrityCheck(false);
        // Warmup - with integrity check
        for (int i = 0; i < warmupIterations; i++) {
            parseZip(parser, zipPath, configWithCheck);
        }
        // Warmup - without integrity check
        for (int i = 0; i < warmupIterations; i++) {
            parseZip(parser, zipPath, configWithoutCheck);
        }
        // Benchmark with integrity check
        long startWithCheck = System.nanoTime();
        for (int i = 0; i < iterations; i++) {
            parseZip(parser, zipPath, configWithCheck);
        }
        long durationWithCheck = System.nanoTime() - startWithCheck;
        // Benchmark without integrity check
        long startWithoutCheck = System.nanoTime();
        for (int i = 0; i < iterations; i++) {
            parseZip(parser, zipPath, configWithoutCheck);
        }
        long durationWithoutCheck = System.nanoTime() - startWithoutCheck;
        double avgWithCheck = durationWithCheck / (double) iterations / 1_000_000.0;
        double avgWithoutCheck = durationWithoutCheck / (double) iterations / 1_000_000.0;
        double overhead = avgWithCheck - avgWithoutCheck;
        double overheadPercent = (overhead / avgWithoutCheck) * 100;
        // Locale.ROOT keeps decimal formatting stable across environments
        System.out.printf(Locale.ROOT, " Without integrity check: %.3f ms/parse%n", avgWithoutCheck);
        System.out.printf(Locale.ROOT, " With integrity check: %.3f ms/parse%n", avgWithCheck);
        System.out.printf(Locale.ROOT, " Overhead: %.3f ms (%.1f%%)%n", overhead, overheadPercent);
    }

    /** Parses {@code zipPath} once with the given config, discarding SAX output. */
    private void parseZip(ZipParser parser, Path zipPath, ZipParserConfig config) throws Exception {
        ParseContext context = new ParseContext();
        context.set(ZipParserConfig.class, config);
        try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
            Metadata metadata = new Metadata();
            parser.parse(tis, new org.xml.sax.helpers.DefaultHandler(), metadata, context);
        }
    }
}