DefaultZipContainerDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.zip;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.DetectHelper;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Zip;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParsingIntent;
import org.apache.tika.zip.utils.ZipSalvager;

/**
 * This class is designed to detect subtypes of zip-based file formats.
 * For the sake of efficiency, it also detects archive and compressor formats
 * via commons-compress.
 * <p>
 * As a first step, it uses commons-compress to detect any archive format
 * supported by commons-compress. If "zip" file is detected, then the
 * ZipContainerDetectors are run to try to identify a subtype.
 * <p>
 * If an archive format that is not a zip is detected, that mime type is returned.
 * <p>
 * Finally, if the file is not detected as an archive format, this runs
 * commons-compress' compressor format detector.
 * <p>
 * For {@link TikaInputStream}, file-based detection is used (TikaInputStream
 * handles spilling to disk automatically if needed).
 *
 * <h2>ZIP Salvaging</h2>
 * <p>
 * When a ZIP file cannot be opened directly (truncated or corrupted), and
 * {@link ParsingIntent#WILL_PARSE} is present in the {@link ParseContext},
 * this detector will attempt to salvage the file using {@link ZipSalvager}.
 * Salvaging reconstructs a valid ZIP structure from the local file headers.
 * <p>
 * When salvaging succeeds, {@link Zip#SALVAGED} is set to {@code true} in the
 * metadata, and the salvaged {@link ZipFile} is stored in
 * {@link TikaInputStream#getOpenContainer()} for reuse by parsers.
 * <p>
 * <b>Note:</b> If you use parsers directly without this detector (or without
 * {@link org.apache.tika.parser.AutoDetectParser}), salvaging will not occur
 * and truncated files may fail to parse.
 */
@TikaComponent
public class DefaultZipContainerDetector implements Detector {

    //Regrettably, some tiff files can be incorrectly identified
    //as tar files.  We need this ugly workaround to rule out TIFF.
    //If commons-compress ever chooses to take over TIFF detection
    //we can remove all of this. See TIKA-2591.
    final static MediaType TIFF = MediaType.image("tiff");
    final static byte[][] TIFF_SIGNATURES = new byte[3][];
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 2891763938430295453L;

    private static final Logger LOG = LoggerFactory.getLogger(DefaultZipContainerDetector.class);

    static {
        TIFF_SIGNATURES[0] = new byte[]{'M', 'M', 0x00, 0x2a};
        TIFF_SIGNATURES[1] = new byte[]{'I', 'I', 0x2a, 0x00};
        TIFF_SIGNATURES[2] = new byte[]{'M', 'M', 0x00, 0x2b};
    }

    private transient ServiceLoader loader;

    protected List<ZipContainerDetector> staticZipDetectors;

    public DefaultZipContainerDetector() {
        this(new ServiceLoader(DefaultZipContainerDetector.class.getClassLoader(), false));
    }

    public DefaultZipContainerDetector(ServiceLoader loader) {
        this.loader = loader;
        staticZipDetectors = loader.loadStaticServiceProviders(ZipContainerDetector.class);
    }

    public DefaultZipContainerDetector(List<ZipContainerDetector> zipDetectors) {
        staticZipDetectors = zipDetectors;
    }

    static boolean isZipArchive(MediaType type) {
        return type.equals(PackageConstants.ZIP) || type.equals(PackageConstants.JAR);
    }

    private static boolean isTiff(byte[] prefix) {
        for (byte[] sig : TIFF_SIGNATURES) {
            if (arrayStartWith(sig, prefix)) {
                return true;
            }
        }
        return false;
    }

    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
        if (haystack.length < needle.length) {
            return false;
        }
        for (int i = 0; i < needle.length; i++) {
            if (haystack[i] != needle[i]) {
                return false;
            }
        }
        return true;
    }

    static MediaType detectArchiveFormat(byte[] prefix, int length) {
        if (isTiff(prefix)) {
            return TIFF;
        }
        try {
            String name = ArchiveStreamFactory.detect(
                    UnsynchronizedByteArrayInputStream.builder().setByteArray(prefix).setLength(length).get());
            return PackageConstants.getMediaType(name);
        } catch (IOException e) {
            return MediaType.OCTET_STREAM;
        }
    }

    static MediaType detectCompressorFormat(byte[] prefix, int length) {
        try {
            String type =
                    CompressorStreamFactory.detect(
                            UnsynchronizedByteArrayInputStream.builder().setByteArray(prefix).setLength(length).get());
            return CompressorConstants.getMediaType(type);
        } catch (IOException e) {
            return MediaType.OCTET_STREAM;
        }
    }

    private static final int MIN_BUFFER_SIZE = 1024;

    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
        // Check if we have access to the document
        if (tis == null) {
            return MediaType.OCTET_STREAM;
        }

        byte[] prefix = new byte[MIN_BUFFER_SIZE];
        tis.mark(MIN_BUFFER_SIZE);
        int length = -1;
        try {
            length = IOUtils.read(tis, prefix, 0, MIN_BUFFER_SIZE);
        } finally {
            tis.reset();
        }

        MediaType type = detectArchiveFormat(prefix, length);

        if (type == TIFF) {
            return TIFF;
        } else if (isZipArchive(type)) {
            // If content is truncated for detection, use streaming detection
            // since file-based detection with ZipFile requires the central directory
            // which is at the end of the file
            if (DetectHelper.isContentTruncatedForDetection(metadata)) {
                int contentLength = DetectHelper.getDetectionContentLength(metadata);
                tis.mark(contentLength > 0 ? contentLength : MIN_BUFFER_SIZE);
                try {
                    return detectStreaming(tis, metadata, false);
                } finally {
                    tis.reset();
                }
            }
            //spool to disk if not already file-backed and detect on file
            return detectZipFormatOnFile(tis, metadata, parseContext);
        } else if (!type.equals(MediaType.OCTET_STREAM)) {
            return type;
        } else {
            return detectCompressorFormat(prefix, length);
        }
    }

    /**
     * This will call TikaInputStream's getFile(). If there are no exceptions,
     * it will place the ZipFile in TikaInputStream's openContainer and leave it
     * open.
     * <p>
     * Sets detector hints in metadata for the parser:
     * <ul>
     *   <li>{@link Zip#DETECTOR_ZIPFILE_OPENED} - true if ZipFile opened successfully</li>
     *   <li>{@link Zip#DETECTOR_DATA_DESCRIPTOR_REQUIRED} - true if streaming needed data descriptor support</li>
     * </ul>
     *
     * @param tis the TikaInputStream
     * @param metadata the metadata (will be updated with detector hints)
     * @param parseContext the parse context
     * @return the detected media type
     */
    private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata, ParseContext parseContext) {
        // Try to open ZipFile directly
        ZipFile zip = null;
        try {
            zip = ZipFile.builder().setFile(tis.getFile()).get();
            metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, true);
        } catch (IOException e) {
            // ZipFile failed to open (truncated/corrupt)
            if (LOG.isDebugEnabled()) {
                LOG.debug("ZipFile failed to open directly", e);
            }
            metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, false);

            // If parsing will follow, try salvaging to prepare ZipFile for parser reuse
            if (parseContext.get(ParsingIntent.class) != null) {
                zip = ZipSalvager.tryToOpenZipFile(tis, metadata);
                if (zip != null && LOG.isDebugEnabled()) {
                    LOG.debug("Successfully salvaged ZIP for parsing");
                }
            }
        }

        if (zip != null) {
            // Store ZipFile in openContainer for parser reuse
            if (tis.getOpenContainer() == null) {
                tis.setOpenContainer(zip);
            } else if (tis.getOpenContainer() != zip) {
                tis.addCloseableResource(zip);
            }

            // ZipFile available (direct or salvaged) - run file-based detection
            try {
                for (ZipContainerDetector zipDetector : getDetectors()) {
                    MediaType type = zipDetector.detect(zip, tis);
                    if (type != null) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("{} detected {}", zipDetector.getClass(), type.toString());
                        }
                        return type;
                    } else {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("{} detected null", zipDetector.getClass());
                        }
                    }
                }
            } catch (IOException e) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Detection failed on ZipFile", e);
                }
            }
            // No specific type detected - it's a plain ZIP
            return MediaType.APPLICATION_ZIP;
        }

        // ZipFile not available - fall back to streaming detection
        // Streaming can examine entries without needing the central directory
        if (LOG.isDebugEnabled()) {
            LOG.debug("Falling back to streaming detection");
        }
        try {
            return detectStreamingFromPath(tis.getPath(), metadata, false);
        } catch (IOException e) {
            //swallow
        }
        return MediaType.APPLICATION_ZIP;

    }

    private MediaType detectStreaming(InputStream input, Metadata metadata, boolean allowStoredEntries)
            throws IOException {
        StreamingDetectContext detectContext = new StreamingDetectContext();
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
                CloseShieldInputStream.wrap(input), "UTF8", false, allowStoredEntries)) {
            ZipArchiveEntry zae = zis.getNextEntry();
            while (zae != null) {
                MediaType mt = detect(zae, zis, detectContext);
                if (mt != null) {
                    return mt;
                }
                zae = zis.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            if (allowStoredEntries == false &&
                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
                // Set hint for parser that DATA_DESCRIPTOR support is required
                metadata.set(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED, true);
                input.reset();
                return detectStreaming(input, metadata, true);
            }
        } catch (SecurityException e) {
            throw e;
        } catch (EOFException e) {
            //truncated zip -- swallow
        } catch (IOException e) {
            //another option for a truncated zip
        }

        return finalDetect(detectContext);
    }

    private MediaType detectStreamingFromPath(Path p, Metadata metadata, boolean allowStoredEntries)
            throws IOException {
        StreamingDetectContext detectContext = new StreamingDetectContext();
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
                Files.newInputStream(p), "UTF8", false, allowStoredEntries)) {
            ZipArchiveEntry zae = zis.getNextEntry();
            while (zae != null) {
                MediaType mt = detect(zae, zis, detectContext);
                if (mt != null) {
                    return mt;
                }
                zae = zis.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            if (allowStoredEntries == false &&
                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
                // Set hint for parser that DATA_DESCRIPTOR support is required
                metadata.set(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED, true);
                return detectStreamingFromPath(p, metadata, true);
            }
        } catch (SecurityException e) {
            throw e;
        } catch (EOFException e) {
            //truncated zip -- swallow
        } catch (IOException e) {
            //another option for a truncated zip
        }

        return finalDetect(detectContext);
    }

    private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis,
                             StreamingDetectContext detectContext) throws IOException {
        for (ZipContainerDetector d : getDetectors()) {
            MediaType mt = d.streamingDetectUpdate(zae, zis, detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return null;
    }

    private MediaType finalDetect(StreamingDetectContext detectContext) {
        for (ZipContainerDetector d : getDetectors()) {
            MediaType mt = d.streamingDetectFinal(detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return MediaType.APPLICATION_ZIP;
    }

    private List<ZipContainerDetector> getDetectors() {
        if (loader != null && loader.isDynamic()) {
            List<ZipContainerDetector> dynamicDetectors =
                    loader.loadDynamicServiceProviders(ZipContainerDetector.class);
            if (!dynamicDetectors.isEmpty()) {
                List<ZipContainerDetector> zipDetectors = new ArrayList<>(staticZipDetectors);
                zipDetectors.addAll(dynamicDetectors);
                return zipDetectors;
            } else {
                return staticZipDetectors;
            }
        }
        return staticZipDetectors;
    }
}