ZstdCompressorOutputStream.java
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.compressors.zstandard;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.commons.compress.compressors.CompressorOutputStream;
import org.apache.commons.io.build.AbstractStreamBuilder;
import org.apache.commons.lang3.ArrayUtils;
import com.github.luben.zstd.ZstdOutputStream;
/**
* {@link CompressorOutputStream} implementation to create Zstandard encoded stream.
* <p>
* This class avoids making the underlying {@code zstd} classes part of the public or protected API. The underlying implementation is provided through the
* <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a> library which is based on <a href="https://github.com/facebook/zstd/">zstd</a>.
* </p>
*
* @see <a href="https://github.com/luben/zstd-jni/">Zstandard JNI</a>
* @see <a href="https://github.com/facebook/zstd/">zstd</a>
* @since 1.16
*/
public class ZstdCompressorOutputStream extends CompressorOutputStream<ZstdOutputStream> {

    // @formatter:off
    /**
     * Builds a new {@link ZstdCompressorOutputStream}.
     *
     * <p>
     * For example:
     * </p>
     * <pre>{@code
     * ZstdCompressorOutputStream s = ZstdCompressorOutputStream.builder()
     *   .setPath(path)
     *   .setLevel(3)
     *   .setStrategy(0)
     *   .setWorkers(0)
     *   .get();
     * }
     * </pre>
     * <p>
     * This class avoids making the underlying {@code zstd} classes part of the public or protected API.
     * </p>
     * <p>
     * The compression level starts at {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}; every other property starts at its Java default value ({@code 0},
     * {@code false}, or {@code null}). If the configured output stream is already a {@code ZstdOutputStream}, it is used as is and the properties of this
     * builder are not applied to it.
     * </p>
     * @see #get()
     * @see ZstdConstants
     * @since 1.28.0
     */
    // @formatter:on
    public static final class Builder extends AbstractStreamBuilder<ZstdCompressorOutputStream, Builder> {

        // Zstandard parameters applied to the underlying stream when the compressor is built; see the matching setters.
        private int chainLog;
        private boolean checksum;
        private boolean closeFrameOnFlush;
        private byte[] dict;
        private int hashLog;
        private int jobSize;
        private int level = ZstdConstants.ZSTD_CLEVEL_DEFAULT;
        private int minMatch;
        private int overlapLog;
        private int searchLog;
        private int strategy;
        private int targetLength;
        private int windowLog;
        private int workers;

        /**
         * Constructs a new builder of {@link ZstdCompressorOutputStream}.
         */
        public Builder() {
            // empty
        }

        /**
         * Builds a new {@link ZstdCompressorOutputStream} from the current state of this builder.
         *
         * @return a new {@link ZstdCompressorOutputStream}.
         * @throws IOException if an I/O error occurs.
         */
        @Override
        public ZstdCompressorOutputStream get() throws IOException {
            return new ZstdCompressorOutputStream(this);
        }

        /**
         * Sets the size of the multi-probe search table, as a power of 2.
         * <p>
         * The value {@code 0} means use the default chainLog.
         * </p>
         * <p>
         * The resulting memory usage is (in C) {@code (1 << (chainLog + 2))}. The input must be between {@link ZstdConstants#ZSTD_CHAINLOG_MIN} and
         * {@link ZstdConstants#ZSTD_CHAINLOG_MAX}. Larger tables result in better and slower compression. This parameter is useless for the "fast" strategy
         * but still useful when using the "dfast" strategy, in which case it defines a secondary probe table.
         * </p>
         *
         * @param chainLog the size of the multi-probe search table, as a power of 2.
         * @return this instance.
         * @see ZstdConstants#ZSTD_CHAINLOG_MIN
         * @see ZstdConstants#ZSTD_CHAINLOG_MAX
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setChainLog(final int chainLog) {
            this.chainLog = chainLog;
            return this;
        }

        /**
         * Sets whether a 32-bit checksum of content is written at the end of a frame (defaults to {@code false}).
         * <p>
         * The value {@code false} means no checksum.
         * </p>
         *
         * @param checksum Whether a 32-bit checksum of content is written at the end of a frame.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setChecksum(final boolean checksum) {
            this.checksum = checksum;
            return this;
        }

        /**
         * Sets whether to close the frame on flush.
         * <p>
         * This guarantees that the output can be read fully if the process crashes before closing the stream. The downside is that this negatively affects
         * the compression ratio.
         * </p>
         * <p>
         * The value {@code false} means don't close on flush.
         * </p>
         *
         * @param closeFrameOnFlush whether to close the frame on flush.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setCloseFrameOnFlush(final boolean closeFrameOnFlush) {
            this.closeFrameOnFlush = closeFrameOnFlush;
            return this;
        }

        /**
         * Sets an internal {@code CDict} from the given {@code dict} buffer.
         * <p>
         * Decompression will have to use the same dictionary.
         * </p>
         * <p>
         * <strong>Using a dictionary</strong>
         * </p>
         * <ul>
         * <li>Loading a null (or 0-length) dictionary invalidates the previous dictionary, returning to no-dictionary mode.</li>
         * <li>A dictionary is sticky, it will be used for all future compressed frames. To return to the no-dictionary mode, load a null dictionary.</li>
         * <li>Loading a dictionary builds tables. This is a CPU consuming operation, with non-negligible impact on latency. Tables are dependent on
         * compression parameters, and for this reason, compression parameters can no longer be changed after loading a dictionary.</li>
         * <li>The dictionary content will be copied internally.</li>
         * </ul>
         * <p>
         * This builder keeps a reference to the given array without copying it. A {@code null} value is replaced with an empty array when the stream is
         * built.
         * </p>
         *
         * @param dict The dictionary buffer, may be {@code null}.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter12">Zstd manual Chapter12</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setDict(final byte[] dict) {
            this.dict = dict;
            return this;
        }

        /**
         * Sets the size of the initial probe table, as a power of 2.
         * <p>
         * The value {@code 0} means "use default hashLog".
         * </p>
         * <p>
         * The resulting memory usage is (in C) {@code (1 << (hashLog + 2))}. This value must be between {@link ZstdConstants#ZSTD_HASHLOG_MIN} and
         * {@link ZstdConstants#ZSTD_HASHLOG_MAX}. Using a larger table improves the compression ratio of strategies {@code <= dFast}, and improves the
         * speed of strategies {@code > dFast}.
         * </p>
         *
         * @param hashLog Size of the initial probe table, as a power of 2.
         * @return this instance.
         * @see ZstdConstants#ZSTD_HASHLOG_MIN
         * @see ZstdConstants#ZSTD_HASHLOG_MAX
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setHashLog(final int hashLog) {
            this.hashLog = hashLog;
            return this;
        }

        /**
         * Sets the size of a compression job.
         * <p>
         * This value is enforced only when {@code workers >= 1}. Each compression job is completed in parallel, so this value can indirectly impact the
         * number of active threads. A value of {@code 0} uses a default behavior, which is dynamically determined based on compression parameters. Job size
         * must be a minimum of overlap size, or
         * <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">ZSTDMT_JOBSIZE_MIN (= 512 KB)</a>, whichever is largest. The
         * minimum size is automatically and transparently enforced.
         * </p>
         * <p>
         * This is a multi-threading parameter and is only active if multi-threading is enabled (if the underlying native library is compiled with the build
         * macro {@code ZSTD_MULTITHREAD}).
         * </p>
         *
         * @param jobSize Size of a compression job.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/compress/zstdmt_compress.h">zstdmt_compress.h</a>
         */
        public Builder setJobSize(final int jobSize) {
            this.jobSize = jobSize;
            return this;
        }

        /**
         * Sets compression parameters according to a pre-defined {@code cLevel} table.
         * <p>
         * The exact compression parameters are dynamically determined, depending on both compression level and srcSize (when known). The default level is
         * {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.
         * </p>
         * <ul>
         * <li>The value {@code 0} means use the default, which is controlled by {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}.</li>
         * <li>You may pass a negative compression level.</li>
         * <li>Setting a level does not automatically set all other compression parameters to defaults. Setting this value will eventually dynamically
         * impact the compression parameters which have not been manually set. The manually set values are used.</li>
         * </ul>
         *
         * @param level The compression level, where {@code 0} selects {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}; see {@link ZstdConstants#ZSTD_CLEVEL_MIN}
         *              and {@link ZstdConstants#ZSTD_CLEVEL_MAX} for the supported range.
         * @return this instance
         * @see ZstdConstants#ZSTD_CLEVEL_DEFAULT
         * @see ZstdConstants#ZSTD_CLEVEL_MIN
         * @see ZstdConstants#ZSTD_CLEVEL_MAX
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setLevel(final int level) {
            this.level = level;
            return this;
        }

        /**
         * Sets the minimum match size.
         * <p>
         * Zstd can still find matches of smaller size, by updating its search algorithm to look for this size and larger. Using larger values increases
         * compression and decompression speed, but decreases the ratio. The value must be between {@link ZstdConstants#ZSTD_MINMATCH_MIN} and
         * {@link ZstdConstants#ZSTD_MINMATCH_MAX}. Note that currently, for all strategies {@code < btopt}, the effective minimum is {@code 4}; for all
         * strategies {@code > fast}, the effective maximum is {@code 6}.
         * </p>
         * <p>
         * The value {@code 0} means use the default minMatchLength.
         * </p>
         *
         * @param minMatch the minimum match size.
         * @return this instance.
         * @see ZstdConstants#ZSTD_MINMATCH_MIN
         * @see ZstdConstants#ZSTD_MINMATCH_MAX
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setMinMatch(final int minMatch) {
            this.minMatch = minMatch;
            return this;
        }

        /**
         * Sets the overlap size, as a fraction of window size.
         * <p>
         * The overlap size is an amount of data reloaded from the previous job at the beginning of a new job. It helps preserve compression ratio, while
         * each job is compressed in parallel. This value is enforced only when {@code workers >= 1}. Larger values increase compression ratio, but decrease
         * speed. Possible values range from 0 to 9:
         * </p>
         * <ul>
         * <li>0 means "default" : value will be determined by the library, depending on strategy</li>
         * <li>1 means "no overlap"</li>
         * <li>9 means "full overlap", using a full window size.</li>
         * </ul>
         * <p>
         * Each intermediate rank increases/decreases the load size by a factor of 2:
         * </p>
         * <ul>
         * <li>9: full window</li>
         * <li>8: w / 2</li>
         * <li>7: w / 4</li>
         * <li>6: w / 8</li>
         * <li>5: w / 16</li>
         * <li>4: w / 32</li>
         * <li>3: w / 64</li>
         * <li>2: w / 128</li>
         * <li>1: no overlap</li>
         * <li>0: default</li>
         * </ul>
         * <p>
         * The default value varies between 6 and 9, depending on the strategy.
         * </p>
         * <p>
         * This is a multi-threading parameter and is only active if multi-threading is enabled (if the underlying native library is compiled with the build
         * macro {@code ZSTD_MULTITHREAD}).
         * </p>
         *
         * @param overlapLog the overlap size, as a fraction of window size.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setOverlapLog(final int overlapLog) {
            this.overlapLog = overlapLog;
            return this;
        }

        /**
         * Sets the number of search attempts, as a power of 2.
         * <p>
         * More attempts result in better and slower compression. This parameter is useless for the "fast" and "dFast" strategies.
         * </p>
         * <p>
         * The value {@code 0} means use the default searchLog.
         * </p>
         *
         * @param searchLog number of search attempts, as a power of 2.
         * @return this instance.
         * @see ZstdConstants#ZSTD_SEARCHLOG_MIN
         * @see ZstdConstants#ZSTD_SEARCHLOG_MAX
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setSearchLog(final int searchLog) {
            this.searchLog = searchLog;
            return this;
        }

        /**
         * Sets the {@code ZSTD_strategy} from the C enum definition.
         * <p>
         * The higher the value of the selected strategy, the more complex it is, resulting in stronger and slower compression.
         * </p>
         * <p>
         * The value {@code 0} means use the default strategy.
         * </p>
         * <ul>
         * <li>{@code ZSTD_fast = 1}</li>
         * <li>{@code ZSTD_dfast = 2}</li>
         * <li>{@code ZSTD_greedy = 3}</li>
         * <li>{@code ZSTD_lazy = 4}</li>
         * <li>{@code ZSTD_lazy2 = 5}</li>
         * <li>{@code ZSTD_btlazy2 = 6}</li>
         * <li>{@code ZSTD_btopt = 7}</li>
         * <li>{@code ZSTD_btultra = 8}</li>
         * <li>{@code ZSTD_btultra2 = 9}</li>
         * </ul>
         *
         * @param strategy the {@code ZSTD_strategy} from the C enum definition.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setStrategy(final int strategy) {
            this.strategy = strategy;
            return this;
        }

        /**
         * Sets a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
         * <p>
         * For strategies {@code btopt}, {@code btultra} and {@code btultra2}:
         * </p>
         * <ul>
         * <li>Length of Match considered "good enough" to stop search.</li>
         * <li>Larger values make compression stronger, and slower.</li>
         * </ul>
         * <p>
         * For strategy {@code fast}:
         * </p>
         * <ul>
         * <li>Distance between match sampling.</li>
         * <li>Larger values make compression faster, and weaker.</li>
         * </ul>
         * <p>
         * The value {@code 0} means use the default targetLength.
         * </p>
         *
         * @param targetLength a value that depends on the strategy, see {@code ZSTD_c_targetLength}.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setTargetLength(final int targetLength) {
            this.targetLength = targetLength;
            return this;
        }

        /**
         * Sets the maximum allowed back-reference distance, expressed as a power of 2.
         * <p>
         * This will set a memory budget for streaming decompression, with larger values requiring more memory and typically compressing more. This value
         * must be between {@link ZstdConstants#ZSTD_WINDOWLOG_MIN} and {@link ZstdConstants#ZSTD_WINDOWLOG_MAX}.
         * </p>
         * <p>
         * <strong>Note</strong>: Using a windowLog greater than {@link ZstdConstants#ZSTD_WINDOWLOG_LIMIT_DEFAULT} requires explicitly allowing such size
         * at the streaming decompression stage.
         * </p>
         * <p>
         * The value {@code 0} means use the default windowLog.
         * </p>
         *
         * @param windowLog maximum allowed back-reference distance, expressed as a power of 2.
         * @return this instance.
         * @see ZstdConstants#ZSTD_WINDOWLOG_MIN
         * @see ZstdConstants#ZSTD_WINDOWLOG_MAX
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setWindowLog(final int windowLog) {
            this.windowLog = windowLog;
            return this;
        }

        /**
         * Sets how many threads will be spawned to compress in parallel.
         * <p>
         * When {@code workers >= 1}, this triggers asynchronous mode when compressing: input is consumed and output is flushed if possible, but control is
         * immediately given back to the caller, while compression is performed in parallel, within worker threads. More workers improve speed, but also
         * increase memory usage.
         * </p>
         * <p>
         * The value {@code 0} means "single-threaded mode": nothing is spawned, compression is performed from the calling thread, and all invocations are
         * blocking.
         * </p>
         * <p>
         * This is a multi-threading parameter and is only active if multi-threading is enabled (if the underlying native library is compiled with the build
         * macro {@code ZSTD_MULTITHREAD}).
         * </p>
         *
         * @param workers How many threads will be spawned to compress in parallel.
         * @return this instance.
         * @see <a href="https://facebook.github.io/zstd/zstd_manual.html#Chapter5">Zstd manual Chapter5</a>
         * @see <a href="https://github.com/facebook/zstd/blob/dev/lib/zstd.h">zstd.h</a>
         */
        public Builder setWorkers(final int workers) {
            this.workers = workers;
            return this;
        }
    }

    /**
     * Constructs a new builder of {@link ZstdCompressorOutputStream}.
     *
     * @return a new builder of {@link ZstdCompressorOutputStream}.
     * @since 1.28.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Gets or creates the {@code ZstdOutputStream} to wrap.
     * <p>
     * If the builder's output stream already is a {@code ZstdOutputStream}, it is returned as is and none of the builder's Zstandard properties are applied.
     * Otherwise, a new {@code ZstdOutputStream} is created around the builder's output stream and configured with every property of the builder; a
     * {@code null} dictionary is passed on as an empty array.
     * </p>
     *
     * @param builder supplies the output stream and the Zstandard properties.
     * @return the stream to wrap.
     * @throws IOException if an I/O error occurs.
     */
    @SuppressWarnings("resource") // Caller closes
    private static ZstdOutputStream toZstdOutputStream(final Builder builder) throws IOException {
        final OutputStream outputStream = builder.getOutputStream();
        if (outputStream instanceof ZstdOutputStream) {
            // Builder properties are not applied when a ZstdOutputStream is provided.
            return (ZstdOutputStream) outputStream;
        }
        // @formatter:off
        return new ZstdOutputStream(outputStream)
            .setChainLog(builder.chainLog)
            .setChecksum(builder.checksum)
            .setCloseFrameOnFlush(builder.closeFrameOnFlush)
            .setDict(builder.dict != null ? builder.dict : ArrayUtils.EMPTY_BYTE_ARRAY)
            .setHashLog(builder.hashLog)
            .setJobSize(builder.jobSize)
            .setLevel(builder.level)
            .setMinMatch(builder.minMatch)
            .setOverlapLog(builder.overlapLog)
            .setSearchLog(builder.searchLog)
            .setStrategy(builder.strategy)
            .setTargetLength(builder.targetLength)
            .setWindowLog(builder.windowLog)
            .setWorkers(builder.workers);
        // @formatter:on
    }

    /**
     * Constructs a new instance from the given builder.
     *
     * @param builder supplies the output stream and the Zstandard properties.
     * @throws IOException if an I/O error occurs.
     */
    @SuppressWarnings("resource") // Caller closes
    private ZstdCompressorOutputStream(final Builder builder) throws IOException {
        super(toZstdOutputStream(builder));
    }

    /**
     * Constructs a new instance using default Zstd parameter values.
     *
     * @param outStream the output stream.
     * @throws IOException if an I/O error occurs.
     */
    public ZstdCompressorOutputStream(final OutputStream outStream) throws IOException {
        this(builder().setOutputStream(outStream));
    }

    /**
     * Constructs a new instance using default Zstd parameter values plus a compression level.
     *
     * @param outStream the output stream.
     * @param level     The compression level, where {@code 0} selects {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}; see
     *                  {@link ZstdConstants#ZSTD_CLEVEL_MIN} and {@link ZstdConstants#ZSTD_CLEVEL_MAX} for the supported range.
     * @throws IOException if an I/O error occurs.
     * @since 1.18
     * @deprecated Use {@link #builder()}.
     */
    @Deprecated
    public ZstdCompressorOutputStream(final OutputStream outStream, final int level) throws IOException {
        this(builder().setOutputStream(outStream).setLevel(level));
    }

    /**
     * Constructs a new instance using default Zstd parameter values plus a compression level and closeFrameOnFlush setting.
     *
     * @param outStream         the output stream.
     * @param level             The compression level, where {@code 0} selects {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}; see
     *                          {@link ZstdConstants#ZSTD_CLEVEL_MIN} and {@link ZstdConstants#ZSTD_CLEVEL_MAX} for the supported range.
     * @param closeFrameOnFlush whether to close the frame on flush.
     * @throws IOException if an I/O error occurs.
     * @since 1.18
     * @deprecated Use {@link #builder()}.
     */
    @Deprecated
    public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush) throws IOException {
        this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush));
    }

    /**
     * Constructs a new instance using default Zstd parameter values plus a compression level, closeFrameOnFlush and checksum settings.
     *
     * @param outStream         the output stream.
     * @param level             The compression level, where {@code 0} selects {@link ZstdConstants#ZSTD_CLEVEL_DEFAULT}; see
     *                          {@link ZstdConstants#ZSTD_CLEVEL_MIN} and {@link ZstdConstants#ZSTD_CLEVEL_MAX} for the supported range.
     * @param closeFrameOnFlush whether to close the frame on flush.
     * @param checksum          Whether a 32-bit checksum of content is written at the end of a frame.
     * @throws IOException if an I/O error occurs.
     * @since 1.18
     * @deprecated Use {@link #builder()}.
     */
    @Deprecated
    public ZstdCompressorOutputStream(final OutputStream outStream, final int level, final boolean closeFrameOnFlush, final boolean checksum)
            throws IOException {
        this(builder().setOutputStream(outStream).setLevel(level).setCloseFrameOnFlush(closeFrameOnFlush).setChecksum(checksum));
    }

    /**
     * Writes {@code len} bytes from {@code buf}, starting at offset {@code off}, by passing the whole range to the wrapped stream in a single call.
     *
     * @param buf the data.
     * @param off the start offset in the data.
     * @param len the number of bytes to write.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void write(final byte[] buf, final int off, final int len) throws IOException {
        out.write(buf, off, len);
    }
}