GzipCompressorInputStream.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.zip.CRC32;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractOrigin;
import org.apache.commons.io.build.AbstractStreamBuilder;
import org.apache.commons.io.function.IOConsumer;
import org.apache.commons.io.input.BoundedInputStream;

/**
 * Input stream that decompresses GZIP (.gz) files.
 *
 * <p>
 * This supports decompressing concatenated GZIP files which is important when decompressing standalone GZIP files.
 * </p>
 * <p>
 * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder. Internally, decompression is done using
 * {@link java.util.zip.Inflater}.
 * </p>
 * <p>
 * If you use the constructor {@code GzipCompressorInputStream(in)}, {@code Builder.setDecompressConcatenated(false)}, or
 * {@code GzipCompressorInputStream(in, false)}, then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this
 * case, if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()}, then it will be left positioned just
 * after the end of the encoded GZIP member; otherwise, some indeterminate number of extra bytes following the encoded GZIP member will have been consumed and
 * discarded.
 * </p>
 * <p>
 * If you use the {@code Builder.setDecompressConcatenated(true)} or {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
 * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded GZIP member, otherwise an
 * {@link IOException} is thrown. The data read from a stream constructed this way will consist of the concatenated data of all of the encoded GZIP members in
 * order.
 * </p>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 *
 * @see Builder
 * @see <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 1952 GZIP File Format Specification</a>
 */
public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {

    // @formatter:off
    /**
     * Builds a new {@link GzipCompressorInputStream}.
     *
     * <p>
     * For example:
     * </p>
     * <pre>{@code
     * GzipCompressorInputStream s = GzipCompressorInputStream.builder()
     *   .setPath(path)
     *   .setFileNameCharset(StandardCharsets.ISO_8859_1)
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 1.28.0
     */
    // @formatter:on
    public static class Builder extends AbstractStreamBuilder<GzipCompressorInputStream, Builder> {

        /** True if decompressing multi-member streams. */
        private boolean decompressConcatenated;

        private Charset fileNameCharset = GzipUtils.GZIP_ENCODING;

        private IOConsumer<GzipCompressorInputStream> onMemberStart;

        private IOConsumer<GzipCompressorInputStream> onMemberEnd;

        /**
         * Constructs a new builder of {@link GzipCompressorInputStream}.
         */
        public Builder() {
            // empty
        }

        /**
         * Builds a new {@link GzipCompressorInputStream}.
         * <p>
         * You must set input that supports {@link InputStream}, otherwise, this method throws an exception.
         * </p>
         *
         * @return a new instance.
         * @throws IllegalStateException         if the {@code origin} is {@code null}.
         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
         * @see AbstractOrigin#getInputStream(java.nio.file.OpenOption...)
         */
        @Override
        public GzipCompressorInputStream get() throws IOException {
            return new GzipCompressorInputStream(this);
        }

        /**
         * Sets whether we should allow decompressing multiple members.
         *
         * @param decompressConcatenated whether we should allow decompressing multiple members.
         * @return this instance.
         */
        public Builder setDecompressConcatenated(final boolean decompressConcatenated) {
            this.decompressConcatenated = decompressConcatenated;
            return this;
        }

        /**
         * Sets the Charset to use for writing file names and comments, where null maps to {@link GzipUtils#GZIP_ENCODING}.
         * <p>
         * <em>Setting a value other than {@link GzipUtils#GZIP_ENCODING} is not compliant with the <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC
         * 1952 GZIP File Format Specification</a></em>. Use at your own risk of interoperability issues.
         * </p>
         * <p>
         * The default value is {@link GzipUtils#GZIP_ENCODING}.
         * </p>
         *
         * @param fileNameCharset the Charset to use for writing file names and comments, null maps to {@link GzipUtils#GZIP_ENCODING}.
         * @return this instance.
         */
        public Builder setFileNameCharset(final Charset fileNameCharset) {
            this.fileNameCharset = fileNameCharset;
            return this;
        }

        /**
         * Sets the consumer called when a member <em>trailer</em> is parsed.
         * <p>
         * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
         * </p>
         * <p>
         * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
         * </p>
         *
         * @param onMemberEnd The consumer.
         * @return this instance.
         * @see GzipCompressorInputStream#getMetaData()
         */
        public Builder setOnMemberEnd(final IOConsumer<GzipCompressorInputStream> onMemberEnd) {
            this.onMemberEnd = onMemberEnd;
            return this;
        }

        /**
         * Sets the consumer called when a member <em>header</em> is parsed.
         * <p>
         * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
         * </p>
         * <p>
         * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
         * </p>
         *
         * @param onMemberStart The consumer.
         * @return this instance.
         * @see GzipCompressorInputStream#getMetaData()
         */
        public Builder setOnMemberStart(final IOConsumer<GzipCompressorInputStream> onMemberStart) {
            this.onMemberStart = onMemberStart;
            return this;
        }
    }

    private static final IOConsumer<GzipCompressorInputStream> NOOP = IOConsumer.noop();

    /**
     * Constructs a new builder of {@link GzipCompressorInputStream}.
     *
     * @return a new builder of {@link GzipCompressorInputStream}.
     * @since 1.28.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return true if this is a .gz stream, false otherwise
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    private static byte[] readToNull(final DataInput inData) throws IOException {
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            int b;
            while ((b = inData.readUnsignedByte()) != 0) { // NOSONAR
                bos.write(b);
            }
            return bos.toByteArray();
        }
    }

    /** Buffer to hold the input data. */
    private final byte[] buf = new byte[8192];

    /** Amount of data in buf. */
    private int bufUsed;

    private final BoundedInputStream countingStream;

    /** CRC32 from uncompressed data. */
    private final CRC32 crc = new CRC32();

    /** True if decompressing multi-member streams. */
    private final boolean decompressConcatenated;

    /** True once everything has been decompressed. */
    private boolean endReached;

    private final Charset fileNameCharset;

    /**
     * Compressed input stream, possibly wrapped in a BufferedInputStream, always wrapped in countingStream above
     */
    private final InputStream in;

    /** Decompressor. */
    private Inflater inflater = new Inflater(true);

    /** Buffer for no-argument read method. */
    private final byte[] oneByte = new byte[1];

    private GzipParameters parameters;

    private final IOConsumer<GzipCompressorInputStream> onMemberStart;

    private final IOConsumer<GzipCompressorInputStream> onMemberEnd;

    @SuppressWarnings("resource") // caller closes
    private GzipCompressorInputStream(final Builder builder) throws IOException {
        countingStream = BoundedInputStream.builder().setInputStream(builder.getInputStream()).get();
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        in = countingStream.markSupported() ? countingStream : new BufferedInputStream(countingStream);
        this.decompressConcatenated = builder.decompressConcatenated;
        this.fileNameCharset = builder.fileNameCharset;
        this.onMemberStart = builder.onMemberStart != null ? builder.onMemberStart : NOOP;
        this.onMemberEnd = builder.onMemberEnd != null ? builder.onMemberEnd : NOOP;
        init(true);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
     * </p>
     *
     * @param inputStream the InputStream from which this object should be created of
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
        this(builder().setInputStream(inputStream));
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
     * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
     * {@code mark} isn't supported, the input position will be undefined.
     * </p>
     *
     * @param inputStream            the InputStream from which this object should be created of
     * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
     * @throws IOException if the stream could not be created
     * @deprecated Use {@link Builder#get()}.
     */
    @Deprecated
    public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
        this(builder().setInputStream(inputStream).setDecompressConcatenated(decompressConcatenated));
    }

    /**
     * Closes the input stream (unless it is System.in).
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inflater != null) {
            inflater.end();
            inflater = null;
        }
        if (this.in != System.in) {
            this.in.close();
        }
    }

    /**
     * {@inheritDoc}.
     *
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getCount();
    }

    /**
     * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
     *
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    private boolean init(final boolean isFirstMember) throws IOException {
        if (!isFirstMember && !decompressConcatenated) { // at least one must be true
            throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false.");
        }
        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();
        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }
        if (magic0 != GzipUtils.ID1 || in.read() != GzipUtils.ID2) {
            throw new IOException(isFirstMember ? "Input is not in the .gz format." : "Unexpected data after a valid .gz stream.");
        }
        parameters = new GzipParameters();
        parameters.setFileNameCharset(fileNameCharset);
        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method " + method + " in the .gz header");
        }
        final int flg = inData.readUnsignedByte();
        if ((flg & GzipUtils.FRESERVED) != 0) {
            throw new IOException("Reserved flags are set in the .gz header.");
        }
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4));
        switch (inData.readUnsignedByte()) { // extra flags
        case GzipUtils.XFL_MAX_COMPRESSION:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case GzipUtils.XFL_MAX_SPEED:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            parameters.setCompressionLevel(Deflater.DEFAULT_COMPRESSION);
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());
        // Extra field
        if ((flg & GzipUtils.FEXTRA) != 0) {
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;
            final byte[] extra = new byte[xlen];
            inData.readFully(extra);
            parameters.setExtraField(ExtraField.fromBytes(extra));
        }
        // Original file name
        if ((flg & GzipUtils.FNAME) != 0) {
            parameters.setFileName(new String(readToNull(inData), parameters.getFileNameCharset()));
        }
        // Comment
        if ((flg & GzipUtils.FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData), parameters.getFileNameCharset()));
        }
        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & GzipUtils.FHCRC) != 0) {
            parameters.setHeaderCRC(true);
            inData.readShort();
        }
        // Reset
        inflater.reset();
        crc.reset();
        onMemberStart.accept(this);
        return true;
    }

    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }

        int size = 0;

        while (len > 0) {
            if (inflater.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inflater.setInput(buf, 0, bufUsed);
            }

            final int ret;
            try {
                ret = inflater.inflate(b, off, len);
            } catch (final DataFormatException e) { // NOSONAR
                throw new IOException("Gzip-compressed data is corrupt.", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inflater.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                in.reset();
                final int skipAmount = bufUsed - inflater.getRemaining();
                if (IOUtils.skip(in, skipAmount) != skipAmount) {
                    throw new IOException();
                }
                bufUsed = 0;
                final DataInput inData = new DataInputStream(in);
                // CRC32
                final long trailerCrc = ByteUtils.fromLittleEndian(inData, 4);
                if (trailerCrc != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt (CRC32 error).");
                }
                // Uncompressed size modulo 2^32, ISIZE in the RFC.
                final long iSize = ByteUtils.fromLittleEndian(inData, 4);
                if (iSize != (inflater.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch).");
                }
                parameters.setTrailerCrc(trailerCrc);
                parameters.setTrailerISize(iSize);
                onMemberEnd.accept(this);
                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inflater.end();
                    inflater = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }
}