StreamingZipContainerDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.zip;


import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;

import org.apache.tika.config.ServiceLoader;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * A zip container detector that uses only streaming detection, never opening
 * the file as a ZipFile. This is primarily used in tests to verify streaming
 * detection behavior.
 * <p>
 * Unlike {@link DefaultZipContainerDetector}, this will never try to open
 * the File as a ZipFile; this relies solely on streaming detection.
 * <p>
 * If you need to limit the amount of data read during detection, wrap your
 * input stream in a {@link org.apache.tika.io.BoundedInputStream} before
 * passing it to the detector.
 */
public class StreamingZipContainerDetector extends DefaultZipContainerDetector {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 2891763938430295453L;

    public StreamingZipContainerDetector() {
        this(new ServiceLoader(StreamingZipContainerDetector.class.getClassLoader(), false));
    }

    public StreamingZipContainerDetector(ServiceLoader loader) {
        super(loader);
    }

    public StreamingZipContainerDetector(List<ZipContainerDetector> zipDetectors) {
        super(zipDetectors);
    }

    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
        // Check if we have access to the document
        if (tis == null) {
            return MediaType.OCTET_STREAM;
        }

        byte[] prefix = new byte[1024]; // enough for all known archive formats
        tis.mark(1024);
        int length = -1;
        try {
            length = IOUtils.read(tis, prefix, 0, 1024);
        } finally {
            tis.reset();
        }

        MediaType type = detectArchiveFormat(prefix, length);

        if (type == TIFF) {
            return TIFF;
        } else if (isZipArchive(type)) {
            return detectStreaming(tis, metadata, false);
        } else if (!type.equals(MediaType.OCTET_STREAM)) {
            return type;
        } else {
            return detectCompressorFormat(prefix, length);
        }
    }

    private MediaType detectStreaming(InputStream input, Metadata metadata, boolean allowStoredEntries)
            throws IOException {
        StreamingDetectContext detectContext = new StreamingDetectContext();
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
                CloseShieldInputStream.wrap(input), "UTF8", false, allowStoredEntries)) {
            ZipArchiveEntry zae = zis.getNextEntry();
            while (zae != null) {
                MediaType mt = detect(zae, zis, detectContext);
                if (mt != null) {
                    return mt;
                }
                zae = zis.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            if (allowStoredEntries == false &&
                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
                input.reset();
                return detectStreaming(input, metadata, true);
            }
        } catch (SecurityException e) {
            throw e;
        } catch (EOFException e) {
            //truncated zip -- swallow
        } catch (IOException e) {
            //another option for a truncated zip
        }

        return finalDetect(detectContext);
    }

    private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis,
                             StreamingDetectContext detectContext) throws IOException {
        for (ZipContainerDetector d : staticZipDetectors) {
            MediaType mt = d.streamingDetectUpdate(zae, zis, detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return null;
    }

    private MediaType finalDetect(StreamingDetectContext detectContext) {
        for (ZipContainerDetector d : staticZipDetectors) {
            MediaType mt = d.streamingDetectFinal(detectContext);
            if (mt != null) {
                return mt;
            }
        }
        return MediaType.APPLICATION_ZIP;
    }
}