ZipBenchmark.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pkg;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
public class ZipBenchmark {
// Toggle this to switch between DefaultHandler and RecursiveParserWrapper
private static final boolean USE_RECURSIVE_PARSER_WRAPPER = true;
@Test
public void benchmarkAutoDetectParser(@TempDir Path tempDir) throws Exception {
// Enable to run
assumeTrue(true, "Set to true to run");
int iterations = 40;
int warmupIterations = 6;
Path smallZip = tempDir.resolve("small.zip");
createBenchmarkZip(smallZip, 10, 1024);
System.out.println("Small: " + Files.size(smallZip) / 1024 + " KB");
Path mediumZip = tempDir.resolve("medium.zip");
createBenchmarkZip(mediumZip, 1000, 100 * 1024);
System.out.println("Medium: " + Files.size(mediumZip) / (1024 * 1024) + " MB");
Path largeZip = tempDir.resolve("large.zip");
createBenchmarkZip(largeZip, 5000, 500 * 1024);
System.out.println("Large: " + Files.size(largeZip) / (1024 * 1024) + " MB");
System.out.println("\n=== ZIP Benchmark ===");
System.out.println("Mode: " + (USE_RECURSIVE_PARSER_WRAPPER ? "RecursiveParserWrapper" : "DefaultHandler"));
System.out.println();
System.out.println("Small ZIP (10 entries, 10KB):");
runBenchmark(smallZip, 10, iterations, warmupIterations);
System.out.println("\nMedium ZIP (1000 entries, ~100MB):");
runBenchmark(mediumZip, 1000, 20, 4);
System.out.println("\nLarge ZIP (5000 entries, ~2.5GB):");
runBenchmark(largeZip, 5000, 10, 2);
}
private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize) throws Exception {
try (java.util.zip.ZipOutputStream zos =
new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) {
zos.setMethod(java.util.zip.ZipOutputStream.STORED);
java.util.Random random = new java.util.Random(42);
byte[] content = new byte[entrySize];
random.nextBytes(content);
java.util.zip.CRC32 crc = new java.util.zip.CRC32();
crc.update(content);
long crcValue = crc.getValue();
for (int i = 0; i < numEntries; i++) {
java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt");
entry.setMethod(java.util.zip.ZipEntry.STORED);
entry.setSize(content.length);
entry.setCompressedSize(content.length);
entry.setCrc(crcValue);
zos.putNextEntry(entry);
zos.write(content);
zos.closeEntry();
}
}
}
private void runBenchmark(Path zipPath, int numEntries, int iterations, int warmup) throws Exception {
AutoDetectParser parser = new AutoDetectParser();
ParseContext context = new ParseContext();
long sizeKB = Files.size(zipPath) / 1024;
String sizeStr = sizeKB >= 1024 ? (sizeKB / 1024) + " MB" : sizeKB + " KB";
System.out.printf(Locale.ROOT, " Entries: %d, Size: %s%n", numEntries, sizeStr);
// Warmup
for (int i = 0; i < warmup; i++) {
try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
if (USE_RECURSIVE_PARSER_WRAPPER) {
parseWithRecursiveWrapper(parser, tis, context);
} else {
parser.parse(tis, new DefaultHandler(), new Metadata(), context);
}
}
}
// Benchmark
long start = System.nanoTime();
for (int i = 0; i < iterations; i++) {
try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
if (USE_RECURSIVE_PARSER_WRAPPER) {
parseWithRecursiveWrapper(parser, tis, context);
} else {
parser.parse(tis, new DefaultHandler(), new Metadata(), context);
}
}
}
long duration = System.nanoTime() - start;
double avgMs = duration / (double) iterations / 1_000_000.0;
System.out.printf(Locale.ROOT, " Average: %.3f ms%n", avgMs);
}
private void parseWithRecursiveWrapper(AutoDetectParser parser, TikaInputStream tis,
ParseContext context) throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory);
wrapper.parse(tis, handler, new Metadata(), context);
}
}