// ArjArchiveInputStream.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.archivers.arj;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.zip.CRC32;

import org.apache.commons.compress.archivers.AbstractArchiveBuilder;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.io.EndianUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.commons.io.input.ChecksumInputStream;

/**
 * Implements the "arj" archive format as an InputStream.
 * <ul>
 * <li><a href="https://github.com/FarGroup/FarManager/blob/master/plugins/multiarc/arc.doc/arj.txt">Reference 1</a></li>
 * <li><a href="https://www.fileformat.info/format/arj/corion.htm">Reference 2</a></li>
 * </ul>
 *
 * @NotThreadSafe
 * @since 1.6
 */
public class ArjArchiveInputStream extends ArchiveInputStream<ArjArchiveEntry> {

    /**
     * Builds a new {@link ArjArchiveInputStream}.
     * <p>
     * For example:
     * </p>
     * <pre>{@code
     * ArjArchiveInputStream in = ArjArchiveInputStream.builder()
     *     .setPath(inputPath)
     *     .setCharset(StandardCharsets.UTF_8)
     *     .setSelfExtracting(false)
     *     .get();
     * }</pre>
     *
     * @since 1.29.0
     */
    public static final class Builder extends AbstractArchiveBuilder<ArjArchiveInputStream, Builder> {

        /** Whether the stream should be scanned forward for the main header instead of expecting it at the very start. */
        private boolean selfExtracting;

        /** Creates a builder whose charset is preset to the ARJ default ({@code CP437}). */
        private Builder() {
            setCharset(ENCODING_NAME);
        }

        /**
         * Creates the stream from the current builder state; the main header is read as part of construction.
         *
         * @return A new {@link ArjArchiveInputStream}.
         * @throws IOException If the main header cannot be read or the archive is unsupported.
         */
        @Override
        public ArjArchiveInputStream get() throws IOException {
            return new ArjArchiveInputStream(this);
        }

        /**
         * Enables compatibility with self-extracting (SFX) ARJ files; defaults to {@code false}.
         *
         * <p>When {@code true}, the stream is scanned forward to locate the first
         * valid ARJ main header. All bytes before that point are ignored, which
         * allows reading ARJ data embedded in an executable stub.</p>
         *
         * <p><strong>Caveat:</strong> This lenient pre-scan can mask corruption that
         * would otherwise be reported at the start of a normal {@code .arj} file.
         * Enable only when you expect an SFX input.</p>
         *
         * @param selfExtracting {@code true} if the input stream is for a self-extracting archive.
         * @return {@code this} instance.
         * @since 1.29.0
         */
        public Builder setSelfExtracting(final boolean selfExtracting) {
            this.selfExtracting = selfExtracting;
            return asThis();
        }
    }

    /** Name of the charset the {@link Builder} installs by default for decoding names and comments. */
    private static final String ENCODING_NAME = "CP437";
    /** First byte of the two-byte signature that starts every ARJ header. */
    private static final int ARJ_MAGIC_1 = 0x60;
    /** Second byte of the two-byte signature that starts every ARJ header. */
    private static final int ARJ_MAGIC_2 = 0xEA;

    /**
     * Maximum size of the basic header, in bytes.
     *
     * <p>The value is taken from the reference implementation.</p>
     */
    private static final int MAX_BASIC_HEADER_SIZE = 2600;

    /**
     * Minimum size of the first header (the fixed-size part of the basic header), in bytes.
     *
     * <p>This count includes the leading size byte itself.</p>
     */
    private static final int MIN_FIRST_HEADER_SIZE = 30;

    /**
     * Creates a new builder.
     *
     * <p>The returned builder is preconfigured with the {@code CP437} charset; self-extracting support is off.</p>
     *
     * @return A new builder.
     * @since 1.29.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Tests whether the given bytes start with the two-byte ARJ signature ({@code 0x60 0xEA}).
     *
     * @param signature the bytes to check.
     * @param length    the number of bytes to check.
     * @return {@code true} if at least two bytes are available and they equal the ARJ signature, {@code false} otherwise.
     */
    public static boolean matches(final byte[] signature, final int length) {
        if (length < 2) {
            return false;
        }
        // Compare byte by byte so the second byte is only inspected once the first one matched.
        if ((signature[0] & 0xff) != ARJ_MAGIC_1) {
            return false;
        }
        return (signature[1] & 0xff) == ARJ_MAGIC_2;
    }

    /**
     * Reads a single byte from the given stream and returns it as an unsigned value.
     *
     * @param in the stream to read from.
     * @return the byte read, in the range 0 to 255.
     * @throws EOFException if the stream is already at its end.
     * @throws IOException if an I/O error occurs.
     */
    private static int readUnsignedByte(final InputStream in) throws IOException {
        final int b = in.read();
        if (b != -1) {
            return b & 0xff;
        }
        throw new EOFException("Truncated ARJ archive: Expected more data");
    }

    /** The archive's main header, read once during construction. */
    private final MainHeader mainHeader;
    /** Header of the entry most recently returned by {@link #getNextEntry()}, or {@code null} if there is none. */
    private LocalFileHeader currentLocalFileHeader;
    /** Stream limited to the current entry's data, or {@code null} when no entry is open. */
    private InputStream currentInputStream;

    private ArjArchiveInputStream(final Builder builder) throws IOException {
        super(builder);
        mainHeader = readMainHeader(builder.selfExtracting);
        if ((mainHeader.arjFlags & MainHeader.Flags.GARBLED) != 0) {
            throw new ArchiveException("Encrypted ARJ files are unsupported");
        }
        if ((mainHeader.arjFlags & MainHeader.Flags.VOLUME) != 0) {
            throw new ArchiveException("Multi-volume ARJ files are unsupported");
        }
    }

    /**
     * Constructs the ArjArchiveInputStream, taking ownership of the inputStream that is passed in, and using the CP437 character encoding.
     *
     * <p>The main header is read during construction, so a malformed or unsupported archive is reported here.</p>
     *
     * <p>Since 1.29.0: Throws {@link IOException}.</p>
     *
     * @param inputStream the underlying stream, whose ownership is taken.
     * @throws IOException if an exception occurs while reading.
     */
    public ArjArchiveInputStream(final InputStream inputStream) throws IOException {
        this(builder().setInputStream(inputStream));
    }

    /**
     * Constructs the ArjArchiveInputStream, taking ownership of the inputStream that is passed in.
     *
     * <p>The main header is read during construction, so a malformed or unsupported archive is reported here.</p>
     *
     * <p>Since 1.29.0: Throws {@link IOException}.</p>
     *
     * @param inputStream the underlying stream, whose ownership is taken.
     * @param charsetName the charset used for file names and comments in the archive. May be {@code null} to use the platform default.
     * @throws IOException if an exception occurs while reading.
     * @deprecated Since 1.29.0, use {@link #builder()}.
     */
    @Deprecated
    public ArjArchiveInputStream(final InputStream inputStream, final String charsetName) throws IOException {
        this(builder().setInputStream(inputStream).setCharset(charsetName));
    }

    /**
     * Tells whether the data of the given entry can be read by this stream, which is only the case for ARJ entries using the {@code STORED} method.
     *
     * @param ae the entry to test.
     * @return {@code true} if the entry is an {@link ArjArchiveEntry} whose method is {@code STORED}.
     */
    @Override
    public boolean canReadEntryData(final ArchiveEntry ae) {
        if (!(ae instanceof ArjArchiveEntry)) {
            return false;
        }
        final ArjArchiveEntry entry = (ArjArchiveEntry) ae;
        return entry.getMethod() == LocalFileHeader.Methods.STORED;
    }

    /**
     * Computes the CRC32 of the given data and compares it with the four-byte checksum that follows in the input stream.
     *
     * @param data The data to verify.
     * @return {@code true} if the computed checksum equals the stored one, {@code false} otherwise.
     * @throws EOFException If the end of the stream is reached before reading the checksum.
     * @throws IOException If an I/O error occurs.
     */
    @SuppressWarnings("Since15")
    private boolean checkCRC32(final byte[] data) throws IOException {
        final CRC32 checksum = new CRC32();
        checksum.update(data);
        final long actual = checksum.getValue();
        // The stored checksum is consumed from the stream (and counted) even when it does not match.
        final long stored = readSwappedUnsignedInteger();
        return actual == stored;
    }

    /**
     * Scans forward through the input for the first valid ARJ main header.
     * <p>
     * A candidate starts with the two signature bytes {@code 0x60 0xEA}, followed by the basic header size, the basic header itself and its CRC32.
     * A candidate whose size is out of range or whose checksum does not match is treated as a false positive, and scanning resumes after the bytes
     * consumed so far.
     * </p>
     *
     * @return The basic header bytes.
     * @throws ArchiveException If the end of the stream is reached before a valid header is found.
     * @throws IOException If an I/O error occurs.
     */
    private byte[] findMainHeader() throws IOException {
        try {
            while (true) {
                int first;
                int second = readUnsignedByte();
                // Slide a two-byte window over the input until it holds the complete signature;
                // both bytes must match, so keep going while either one differs.
                do {
                    first = second;
                    second = readUnsignedByte();
                } while (first != ARJ_MAGIC_1 || second != ARJ_MAGIC_2);
                final int basicHeaderSize = readSwappedUnsignedShort();
                // At least two bytes are required for the null-terminated name and comment
                if (MIN_FIRST_HEADER_SIZE + 2 <= basicHeaderSize && basicHeaderSize <= MAX_BASIC_HEADER_SIZE) {
                    final byte[] basicHeaderBytes = IOUtils.toByteArray(in, basicHeaderSize);
                    count(basicHeaderSize);
                    if (checkCRC32(basicHeaderBytes)) {
                        return basicHeaderBytes;
                    }
                }
                // Size out of range or CRC32 mismatch: false positive, continue scanning
            }
        } catch (final EOFException e) {
            // Keep the EOFException as the cause so the point of truncation is not lost.
            throw new ArchiveException("Corrupted ARJ archive: Unable to find valid main header", e);
        }
    }

    /**
     * Gets the archive's comment, as decoded from the main header.
     *
     * @return the archive's comment
     */
    public String getArchiveComment() {
        return mainHeader.comment;
    }

    /**
     * Gets the archive's recorded name, as decoded from the main header.
     *
     * @return the archive's name
     */
    public String getArchiveName() {
        return mainHeader.name;
    }

    /**
     * Advances to the next entry of the archive.
     * <p>
     * Any unread data of the previous entry is drained first. The data of the new entry is exposed through a stream limited to the entry's compressed
     * size; for {@code STORED} entries that stream additionally verifies the CRC32 recorded in the header.
     * </p>
     *
     * @return the next entry, or {@code null} if the end of the archive has been reached.
     * @throws IOException if reading the header or draining the previous entry fails.
     */
    @Override
    public ArjArchiveEntry getNextEntry() throws IOException {
        if (currentInputStream != null) {
            // Drain whatever the caller left unread so the next header can be read from the underlying stream.
            IOUtils.consume(currentInputStream);
            currentInputStream.close();
            currentInputStream = null;
            currentLocalFileHeader = null;
        }

        currentLocalFileHeader = readLocalFileHeader();
        if (currentLocalFileHeader == null) {
            // End-of-archive marker: no further entries.
            currentInputStream = null;
            return null;
        }

        final long entryStart = getBytesRead();
        // @formatter:off
        currentInputStream = BoundedInputStream.builder()
                .setInputStream(in)
                .setMaxCount(currentLocalFileHeader.compressedSize)
                .setPropagateClose(false)
                .setAfterRead(n -> {
                    if (n < 0) {
                        // The underlying stream ended before the declared entry size was delivered.
                        throw new EOFException(String.format(
                                "Truncated ARJ archive: Entry '%s' expected %,d bytes, but only %,d were read.",
                                currentLocalFileHeader.name,
                                currentLocalFileHeader.compressedSize,
                                getBytesRead() - entryStart
                        ));
                    }
                    count(n);
                })
                .get();
        // @formatter:on
        final boolean stored = currentLocalFileHeader.method == LocalFileHeader.Methods.STORED;
        if (stored) {
            // @formatter:off
            currentInputStream = ChecksumInputStream.builder()
                    .setChecksum(new CRC32())
                    .setInputStream(currentInputStream)
                    .setCountThreshold(currentLocalFileHeader.originalSize)
                    .setExpectedChecksumValue(currentLocalFileHeader.originalCrc32)
                    .get();
            // @formatter:on
        }
        return new ArjArchiveEntry(currentLocalFileHeader);
    }

    /**
     * Reads data of the current entry into the given buffer.
     *
     * @param b   the destination buffer.
     * @param off the offset in {@code b} at which to start storing bytes.
     * @param len the maximum number of bytes to read.
     * @return the number of bytes read, {@code 0} if {@code len} is zero, or whatever the entry stream reports at its end.
     * @throws IllegalStateException if there is no current entry.
     * @throws ArchiveException if the current entry uses a compression method other than {@code STORED}.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        IOUtils.checkFromIndexSize(b, off, len);
        if (len == 0) {
            return 0;
        }
        final LocalFileHeader header = currentLocalFileHeader;
        if (header == null) {
            throw new IllegalStateException("No current arj entry");
        }
        final boolean stored = header.method == LocalFileHeader.Methods.STORED;
        if (!stored) {
            throw new ArchiveException("Unsupported compression method '%s'", header.method);
        }
        return currentInputStream.read(b, off, len);
    }

    /**
     * Reads a null-terminated comment from the given stream and decodes it with the configured charset.
     *
     * @param dataIn the stream positioned at the start of the comment.
     * @return the decoded comment, without the terminator.
     * @throws IOException if the stream ends before the terminator or another I/O error occurs.
     */
    private String readComment(final InputStream dataIn) throws IOException {
        final byte[] rawComment = readString(dataIn).toByteArray();
        return new String(rawComment, getCharset());
    }

    /**
     * Reads a null-terminated name from the given stream, checks its length against the configured maximum and decodes it with the configured charset.
     *
     * @param dataIn the stream positioned at the start of the name.
     * @return the decoded name, without the terminator.
     * @throws IOException if the stream ends before the terminator, the length check fails, or another I/O error occurs.
     */
    private String readEntryName(final InputStream dataIn) throws IOException {
        final ByteArrayOutputStream rawName = readString(dataIn);
        final int nameLength = rawName.size();
        ArchiveUtils.checkEntryNameLength(nameLength, getMaxEntryNameLength(), "ARJ");
        final byte[] nameBytes = rawName.toByteArray();
        return new String(nameBytes, getCharset());
    }

    /**
     * Reads one ARJ basic header at the current stream position and validates its signature, size and CRC32.
     *
     * @return The basic header bytes, or {@code null} if the header size is zero, which marks the end of the archive.
     * @throws ArchiveException If the signature, the header size or the CRC32 is invalid.
     * @throws EOFException If the end of the stream is reached before the header is complete.
     * @throws IOException If an I/O error occurs.
     */
    private byte[] readHeader() throws IOException {
        final int magic1 = readUnsignedByte();
        final int magic2 = readUnsignedByte();
        final boolean signatureOk = magic1 == ARJ_MAGIC_1 && magic2 == ARJ_MAGIC_2;
        if (!signatureOk) {
            throw new ArchiveException("Corrupted ARJ archive: Invalid ARJ header signature 0x%02X 0x%02X", magic1, magic2);
        }
        final int size = readSwappedUnsignedShort();
        if (size == 0) {
            // A zero-length basic header is the end-of-archive marker.
            return null;
        }
        // Besides the fixed part, at least two bytes are needed for the terminators of name and comment.
        final boolean sizeOk = size >= MIN_FIRST_HEADER_SIZE + 2 && size <= MAX_BASIC_HEADER_SIZE;
        if (!sizeOk) {
            throw new ArchiveException("Corrupted ARJ archive: Invalid ARJ header size %,d", size);
        }
        final byte[] headerBytes = IOUtils.toByteArray(in, size);
        count(size);
        if (checkCRC32(headerBytes)) {
            return headerBytes;
        }
        throw new ArchiveException("Corrupted ARJ archive: Invalid ARJ header CRC32 checksum");
    }

    /**
     * Reads the next local file header together with any extended headers that follow it.
     * <p>
     * The basic header consists of a "first header" whose length is given by its leading size byte, followed by the null-terminated entry name and
     * comment. The first header may be longer than the fields parsed here; the surplus is skipped so that the name is always read from the position
     * the size byte points to.
     * </p>
     *
     * @return The parsed header, or {@code null} if the end-of-archive marker was read.
     * @throws ArchiveException If a header signature, size or CRC32 is invalid.
     * @throws EOFException If the header is shorter than the fields it must contain.
     * @throws IOException If an I/O error occurs.
     */
    private LocalFileHeader readLocalFileHeader() throws IOException {
        final byte[] basicHeaderBytes = readHeader();
        if (basicHeaderBytes == null) {
            return null;
        }
        final LocalFileHeader localFileHeader = new LocalFileHeader();
        try (InputStream basicHeader = new ByteArrayInputStream(basicHeaderBytes)) {
            final int firstHeaderSize = readUnsignedByte(basicHeader);
            // The size byte counts itself. Clamp at zero because a negative maximum would leave the bounded stream unbounded,
            // letting a corrupt size of 0 read name/comment bytes as header fields.
            final long firstHeaderRemaining = Math.max(0, firstHeaderSize - 1);
            try (InputStream firstHeader = BoundedInputStream.builder().setInputStream(basicHeader).setMaxCount(firstHeaderRemaining).get()) {
                localFileHeader.archiverVersionNumber = readUnsignedByte(firstHeader);
                localFileHeader.minVersionToExtract = readUnsignedByte(firstHeader);
                localFileHeader.hostOS = readUnsignedByte(firstHeader);
                localFileHeader.arjFlags = readUnsignedByte(firstHeader);
                localFileHeader.method = readUnsignedByte(firstHeader);
                localFileHeader.fileType = readUnsignedByte(firstHeader);
                localFileHeader.reserved = readUnsignedByte(firstHeader);
                localFileHeader.dateTimeModified = EndianUtils.readSwappedInteger(firstHeader);
                localFileHeader.compressedSize = EndianUtils.readSwappedUnsignedInteger(firstHeader);
                localFileHeader.originalSize = EndianUtils.readSwappedUnsignedInteger(firstHeader);
                localFileHeader.originalCrc32 = EndianUtils.readSwappedUnsignedInteger(firstHeader);
                localFileHeader.fileSpecPosition = EndianUtils.readSwappedShort(firstHeader);
                localFileHeader.fileAccessMode = EndianUtils.readSwappedShort(firstHeader);
                localFileHeader.firstChapter = readUnsignedByte(firstHeader);
                localFileHeader.lastChapter = readUnsignedByte(firstHeader);
                // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 bytes
                if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) {
                    localFileHeader.extendedFilePosition = EndianUtils.readSwappedInteger(firstHeader);
                    // Total read (including size byte): 30 + 4 = 34 bytes
                    if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4 + 12) {
                        localFileHeader.dateTimeAccessed = EndianUtils.readSwappedInteger(firstHeader);
                        localFileHeader.dateTimeCreated = EndianUtils.readSwappedInteger(firstHeader);
                        localFileHeader.originalSizeEvenForVolumes = EndianUtils.readSwappedInteger(firstHeader);
                        // Total read (including size byte): 34 + 12 = 46 bytes
                    }
                }
                // Skip any extra data the first header carries beyond the fields above, so the name starts at the right offset.
                IOUtils.consume(firstHeader);
            }
            localFileHeader.name = readEntryName(basicHeader);
            localFileHeader.comment = readComment(basicHeader);
        }
        // Extended headers follow as (size, data, CRC32) records; a size of zero ends the list.
        final ArrayList<byte[]> extendedHeaders = new ArrayList<>();
        int extendedHeaderSize;
        while ((extendedHeaderSize = readSwappedUnsignedShort()) > 0) {
            final byte[] extendedHeaderBytes = IOUtils.toByteArray(in, extendedHeaderSize);
            count(extendedHeaderSize);
            if (!checkCRC32(extendedHeaderBytes)) {
                throw new ArchiveException("Corrupted ARJ archive: Extended header CRC32 verification failure");
            }
            extendedHeaders.add(extendedHeaderBytes);
        }
        localFileHeader.extendedHeaders = extendedHeaders.toArray(new byte[0][]);
        return localFileHeader;
    }

    /**
     * Reads the archive's main header and, if present, the single extended header that follows it.
     * <p>
     * The basic header consists of a "first header" whose length is given by its leading size byte, followed by the null-terminated archive name and
     * comment. The first header may be longer than the fields parsed here; the surplus is skipped so that the name is always read from the position
     * the size byte points to.
     * </p>
     *
     * @param selfExtracting {@code true} to scan forward for the header instead of requiring it at the current position.
     * @return The parsed main header.
     * @throws ArchiveException If no main header is present, or a header signature, size or CRC32 is invalid.
     * @throws EOFException If the header is shorter than the fields it must contain.
     * @throws IOException If an I/O error occurs.
     */
    private MainHeader readMainHeader(final boolean selfExtracting) throws IOException {
        final byte[] basicHeaderBytes = selfExtracting ? findMainHeader() : readHeader();
        if (basicHeaderBytes == null) {
            // readHeader() returns null for the end-of-archive marker; report it instead of failing with a NullPointerException below.
            throw new ArchiveException("Corrupted ARJ archive: Archive ends without any headers");
        }
        final MainHeader header = new MainHeader();
        try (InputStream basicHeader = new ByteArrayInputStream(basicHeaderBytes)) {
            final int firstHeaderSize = readUnsignedByte(basicHeader);
            // The size byte counts itself. Clamp at zero because a negative maximum would leave the bounded stream unbounded,
            // letting a corrupt size of 0 read name/comment bytes as header fields.
            final long firstHeaderRemaining = Math.max(0, firstHeaderSize - 1);
            try (InputStream firstHeader = BoundedInputStream.builder().setInputStream(basicHeader).setMaxCount(firstHeaderRemaining).get()) {
                header.archiverVersionNumber = readUnsignedByte(firstHeader);
                header.minVersionToExtract = readUnsignedByte(firstHeader);
                header.hostOS = readUnsignedByte(firstHeader);
                header.arjFlags = readUnsignedByte(firstHeader);
                header.securityVersion = readUnsignedByte(firstHeader);
                header.fileType = readUnsignedByte(firstHeader);
                header.reserved = readUnsignedByte(firstHeader);
                header.dateTimeCreated = EndianUtils.readSwappedInteger(firstHeader);
                header.dateTimeModified = EndianUtils.readSwappedInteger(firstHeader);
                header.archiveSize = EndianUtils.readSwappedUnsignedInteger(firstHeader);
                header.securityEnvelopeFilePosition = EndianUtils.readSwappedInteger(firstHeader);
                header.fileSpecPosition = EndianUtils.readSwappedShort(firstHeader);
                header.securityEnvelopeLength = EndianUtils.readSwappedShort(firstHeader);
                header.encryptionVersion = readUnsignedByte(firstHeader);
                header.lastChapter = readUnsignedByte(firstHeader);
                // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 bytes
                if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) {
                    header.arjProtectionFactor = readUnsignedByte(firstHeader);
                    header.arjFlags2 = readUnsignedByte(firstHeader);
                    // Two further bytes are read and discarded.
                    readUnsignedByte(firstHeader);
                    readUnsignedByte(firstHeader);
                    // Total read (including size byte): 30 + 4 = 34 bytes
                }
                // Skip any extra data the first header carries beyond the fields above, so the name starts at the right offset.
                IOUtils.consume(firstHeader);
            }
            header.name = readEntryName(basicHeader);
            header.comment = readComment(basicHeader);
        }
        // An optional extended header follows as (size, data, CRC32); a size of zero means there is none.
        final int extendedHeaderSize = readSwappedUnsignedShort();
        if (extendedHeaderSize > 0) {
            header.extendedHeaderBytes = IOUtils.toByteArray(in, extendedHeaderSize);
            count(extendedHeaderSize);
            if (!checkCRC32(header.extendedHeaderBytes)) {
                throw new ArchiveException("Corrupted ARJ archive: Extended header CRC32 verification failure");
            }
        }
        return header;
    }

    /**
     * Collects bytes from the given stream up to, but not including, the next zero byte; the terminator itself is consumed.
     *
     * @param dataIn the stream to read from.
     * @return a buffer holding the bytes read before the terminator.
     * @throws EOFException if the stream ends before a zero byte is found.
     * @throws IOException if an I/O error occurs.
     */
    private ByteArrayOutputStream readString(final InputStream dataIn) throws IOException {
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        for (int b = readUnsignedByte(dataIn); b != 0; b = readUnsignedByte(dataIn)) {
            bytes.write(b);
        }
        return bytes;
    }

    /**
     * Reads a four-byte unsigned integer from the underlying stream via {@link EndianUtils#readSwappedUnsignedInteger(InputStream)} and adds the four
     * bytes to the read count.
     *
     * @return the value read.
     * @throws IOException if the value cannot be read.
     */
    private long readSwappedUnsignedInteger() throws IOException {
        final long value = EndianUtils.readSwappedUnsignedInteger(in);
        // Count only after the read succeeded.
        count(4);
        return value;
    }

    /**
     * Reads a two-byte unsigned integer from the underlying stream via {@link EndianUtils#readSwappedUnsignedShort(InputStream)} and adds the two
     * bytes to the read count.
     *
     * @return the value read.
     * @throws IOException if the value cannot be read.
     */
    private int readSwappedUnsignedShort() throws IOException {
        final int value = EndianUtils.readSwappedUnsignedShort(in);
        // Count only after the read succeeded.
        count(2);
        return value;
    }

    /**
     * Reads one unsigned byte from the underlying stream and adds it to the read count.
     *
     * @return the byte read, in the range 0 to 255.
     * @throws EOFException if the underlying stream is at its end.
     * @throws IOException if an I/O error occurs.
     */
    private int readUnsignedByte() throws IOException {
        // The static helper already returns a value in 0..255, so no further masking is needed.
        final int unsigned = readUnsignedByte(in);
        count(1);
        return unsigned;
    }
}