RDFSizeBenchmarks.java
/*******************************************************************************
* Copyright (c) 2021 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.benchmark.rio;
import static java.nio.charset.StandardCharsets.UTF_16;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.commons.io.output.NullOutputStream.NULL_OUTPUT_STREAM;
import static org.eclipse.rdf4j.rio.helpers.BasicParserSettings.VERIFY_LANGUAGE_TAGS;
import static org.eclipse.rdf4j.rio.helpers.BasicParserSettings.VERIFY_RELATIVE_URIS;
import static org.eclipse.rdf4j.rio.helpers.BasicParserSettings.VERIFY_URI_SYNTAX;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.time.Duration;
import java.time.Instant;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.output.CountingOutputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.eclipse.rdf4j.rio.ParserConfig;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.WriterConfig;
import org.eclipse.rdf4j.rio.helpers.BinaryRDFWriterSettings;
/**
* <p>
* This class benchmarks {@link RDFWriter}s in terms of output size given a number of datasets (see
* {@link RDFTestDataset}). The output of the benchmark is the output size in megabytes, the time for parsing the input
* dataset (!) and writing the output and a description of the writer used for each of the datasets.
* </p>
* <p>
* Please note that the datasets from {@link RDFTestDataset} are fairly large files.
* </p>
*
* @author Frens Jan Rumph
*/
public class RDFSizeBenchmarks {

    /** Number of bytes in one megabyte (2^20); used to scale the reported output size. */
    private static final double BYTES_PER_MEGABYTE = 1024.0 * 1024.0;

    /**
     * Runs every configured writer against every dataset and prints one result line per combination.
     *
     * @param args command line arguments (not used)
     * @throws IOException propagated from obtaining or processing a dataset
     */
    public static void main(String[] args) throws IOException {
        List<File> datasets = List.of(
                RDFTestDataset.SWDF.download(),
                RDFTestDataset.LEXVO.download(),
                RDFTestDataset.DATAGOVBE.download(),
                RDFTestDataset.FISHMARK.download(),
                RDFTestDataset.SP2BENCH.download(),
                RDFTestDataset.GENE2GO.download(),
                RDFTestDataset.BSBM.download()
        );

        Map<String, Pair<RDFFormat, WriterConfig>> writers = writerConfigurations();

        for (File dataset : datasets) {
            for (Map.Entry<String, Pair<RDFFormat, WriterConfig>> writer : writers.entrySet()) {
                // Request a collection between runs; presumably so that garbage left over from the previous run
                // has less influence on the next measurement.
                System.gc();
                Pair<RDFFormat, WriterConfig> formatAndConfig = writer.getValue();
                reportSize(dataset, writer.getKey(), formatAndConfig.getKey(), formatAndConfig.getValue());
            }
        }
    }

    /**
     * Builds the writer configurations to benchmark, keyed by the human readable description that is printed in
     * the report. Insertion order is retained so that the report lists the writers in the order defined here.
     *
     * @return the description to (output format, writer configuration) mapping
     */
    private static Map<String, Pair<RDFFormat, WriterConfig>> writerConfigurations() {
        Map<String, Pair<RDFFormat, WriterConfig>> writers = new LinkedHashMap<>();

        writers.put("binary v1, buffer size = 100", binaryV1(100L));
        writers.put("binary v1, buffer size = 8k", binaryV1(8192L));

        writers.put("binary v2, buffer size = 100, UTF-16", binaryV2(100L, UTF_16, false));
        writers.put("binary v2, buffer size = 8k, UTF-16", binaryV2(8192L, UTF_16, false));
        writers.put("binary v2, buffer size = 100, UTF-8", binaryV2(100L, UTF_8, false));
        writers.put("binary v2, buffer size = 8k, UTF-8", binaryV2(8192L, UTF_8, false));

        writers.put("binary v2, buffer size = 100, UTF-16, with id-recycling", binaryV2(100L, UTF_16, true));
        writers.put("binary v2, buffer size = 8k, UTF-16, with id-recycling", binaryV2(8192L, UTF_16, true));
        writers.put("binary v2, buffer size = 100, UTF-8, with id-recycling", binaryV2(100L, UTF_8, true));
        writers.put("binary v2, buffer size = 8k, UTF-8, with id-recycling", binaryV2(8192L, UTF_8, true));

        return writers;
    }

    /**
     * Creates a configuration for the binary writer with {@link BinaryRDFWriterSettings#VERSION} set to 1.
     *
     * @param bufferSize value for {@link BinaryRDFWriterSettings#BUFFER_SIZE}
     * @return the binary format paired with the writer configuration
     */
    private static Pair<RDFFormat, WriterConfig> binaryV1(long bufferSize) {
        WriterConfig config = new WriterConfig()
                .set(BinaryRDFWriterSettings.VERSION, 1L)
                .set(BinaryRDFWriterSettings.BUFFER_SIZE, bufferSize);
        return Pair.of(RDFFormat.BINARY, config);
    }

    /**
     * Creates a configuration for the binary writer with {@link BinaryRDFWriterSettings#VERSION} set to 2.
     *
     * @param bufferSize value for {@link BinaryRDFWriterSettings#BUFFER_SIZE}
     * @param charset    charset whose name is used for {@link BinaryRDFWriterSettings#CHARSET}
     * @param recycleIds value for {@link BinaryRDFWriterSettings#RECYCLE_IDS}
     * @return the binary format paired with the writer configuration
     */
    private static Pair<RDFFormat, WriterConfig> binaryV2(long bufferSize, Charset charset, boolean recycleIds) {
        WriterConfig config = new WriterConfig()
                .set(BinaryRDFWriterSettings.VERSION, 2L)
                .set(BinaryRDFWriterSettings.BUFFER_SIZE, bufferSize)
                .set(BinaryRDFWriterSettings.CHARSET, charset.name())
                .set(BinaryRDFWriterSettings.RECYCLE_IDS, recycleIds);
        return Pair.of(RDFFormat.BINARY, config);
    }

    /**
     * Opens the given dataset file, derives the input format from its file name and reports the output size for
     * the given writer.
     *
     * @param path         the dataset file to parse
     * @param description  description of the writer, printed in the report line
     * @param outputFormat format to write
     * @param writerConfig configuration applied to the writer
     * @throws IOException              if reading the dataset fails
     * @throws IllegalArgumentException if no parser format is known for the file name
     */
    private static void reportSize(File path, String description, RDFFormat outputFormat, WriterConfig writerConfig)
            throws IOException {
        String fileName = path.getName();
        RDFFormat inputFormat = Rio.getParserFormatForFileName(fileName)
                .orElseThrow(() -> new IllegalArgumentException("No format available for " + fileName));

        try (InputStream is = new BufferedInputStream(new FileInputStream(path))) {
            reportSize(fileName, is, description, inputFormat, outputFormat, writerConfig);
        }
    }

    /**
     * Parses the input stream, feeds every parsed statement straight into a writer of the requested format and
     * prints the number of megabytes written together with the elapsed time.
     *
     * @param dataset      name of the dataset, printed in the report line
     * @param is           stream to read the dataset from; not closed by this method
     * @param description  description of the writer, printed in the report line
     * @param inputFormat  format of the data in {@code is}
     * @param outputFormat format to write
     * @param writerConfig configuration applied to the writer
     * @throws IOException if reading from {@code is} fails
     */
    private static void reportSize(String dataset, InputStream is, String description, RDFFormat inputFormat,
            RDFFormat outputFormat, WriterConfig writerConfig) throws IOException {
        // The written bytes are discarded; only their count is of interest.
        CountingOutputStream os = new CountingOutputStream(NULL_OUTPUT_STREAM);

        RDFWriter writer = Rio.createWriter(outputFormat, os);
        writer.setWriterConfig(writerConfig);

        // The writer acts as the parser's handler, so parsing and writing happen in a single pass.
        RDFParser parser = Rio.createParser(inputFormat);
        parser.setRDFHandler(writer);

        // Verification of datasets is disabled because of encoding issues in the input data and it's not a critical
        // part of the benchmarking.
        parser.setParserConfig(new ParserConfig()
                .set(VERIFY_LANGUAGE_TAGS, false)
                .set(VERIFY_RELATIVE_URIS, false)
                .set(VERIFY_URI_SYNTAX, false));

        // System.nanoTime() is monotonic, unlike the wall clock, so the measured interval is not affected by
        // adjustments of the system time while the benchmark runs.
        long startNanos = System.nanoTime();
        parser.parse(is);
        Duration duration = Duration.ofNanos(System.nanoTime() - startNanos);

        // Divide in floating point in a single step; dividing the long by 1024 first would truncate the
        // sub-kilobyte remainder.
        double megabytes = os.getByteCount() / BYTES_PER_MEGABYTE;

        System.out.printf(
                "%20s %8.2f MB in %-14s - %s%n",
                dataset,
                megabytes,
                duration,
                description
        );
    }
}