DetectHelper.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

/**
 * Utility methods for content detection.
 * <p>
 * All methods are static and operate only on their arguments.
 */
public class DetectHelper {

    /**
     * Creates a TikaInputStream suitable for detection-only purposes by reading
     * up to {@code maxLength} bytes from the input stream into a byte array.
     * <p>
     * The number of bytes actually buffered is always recorded in the metadata under
     * {@link TikaCoreProperties#DETECTION_CONTENT_LENGTH}.
     * <p>
     * If the input stream contains more bytes than {@code maxLength}, the resulting
     * metadata will have {@link TikaCoreProperties#TRUNCATED_CONTENT_FOR_DETECTION}
     * set to {@code true}, signaling to detectors that they are working with
     * truncated content and should adjust their behavior accordingly. The flag is
     * only ever set by this method; it is not cleared when the content fits.
     * <p>
     * To find out whether the content was truncated, this method attempts to read
     * one byte beyond {@code maxLength}. That byte is not part of the returned
     * stream, so callers should not expect {@code stream} to be positioned exactly
     * {@code maxLength} bytes in afterwards.
     * <p>
     * This is useful when you want to perform detection on a limited portion of
     * a large file without spooling the entire file to disk.
     * <p>
     * <b>NOTE:</b> The downside is that you may lose precision in detection!
     * This should only be used if you are performing detection only with no parsing.
     *
     * @param stream    the input stream to read from (will NOT be closed)
     * @param maxLength the maximum number of bytes to read; must not be negative
     * @param metadata  the metadata object where the buffered length is recorded and
     *                  where the truncation flag will be set if applicable
     * @return a TikaInputStream backed by the buffered bytes
     * @throws IOException              if an I/O error occurs
     * @throws IllegalArgumentException if {@code maxLength} is negative
     */
    public static TikaInputStream getStreamForDetectionOnly(InputStream stream, int maxLength,
                                                             Metadata metadata) throws IOException {
        if (maxLength < 0) {
            throw new IllegalArgumentException("maxLength must not be negative: " + maxLength);
        }

        // Permit one byte past maxLength so the truncation probe below can observe it.
        // The sum is computed as a long: in int arithmetic, maxLength + 1 would wrap to
        // a negative value when maxLength == Integer.MAX_VALUE.
        BoundedInputStream bounded = new BoundedInputStream(maxLength + 1L, stream);

        ByteArrayOutputStream baos = new ByteArrayOutputStream(Math.min(maxLength, 8192));
        byte[] buffer = new byte[4096];
        int bytesRead;
        int totalRead = 0;

        // Never request more than the remaining budget, so at most maxLength bytes are buffered.
        while (totalRead < maxLength && (bytesRead = bounded.read(buffer, 0,
                Math.min(buffer.length, maxLength - totalRead))) != -1) {
            baos.write(buffer, 0, bytesRead);
            totalRead += bytesRead;
        }

        // Check if there's more data available (meaning we truncated)
        boolean truncated = bounded.read() != -1;

        byte[] bytes = baos.toByteArray();
        metadata.set(TikaCoreProperties.DETECTION_CONTENT_LENGTH, bytes.length);

        if (truncated) {
            metadata.set(TikaCoreProperties.TRUNCATED_CONTENT_FOR_DETECTION, true);
        }

        return TikaInputStream.get(bytes);
    }

    /**
     * Creates a TikaInputStream suitable for detection-only purposes by reading
     * up to {@code maxLength} bytes from the input stream into a byte array.
     * <p>
     * This overload creates a new Metadata object internally and discards it, so the
     * truncation flag and buffered length are not observable. If you need to check
     * whether the content was truncated, use
     * {@link #getStreamForDetectionOnly(InputStream, int, Metadata)} instead.
     *
     * @param stream    the input stream to read from (will NOT be closed)
     * @param maxLength the maximum number of bytes to read; must not be negative
     * @return a TikaInputStream backed by the buffered bytes
     * @throws IOException              if an I/O error occurs
     * @throws IllegalArgumentException if {@code maxLength} is negative
     */
    public static TikaInputStream getStreamForDetectionOnly(InputStream stream, int maxLength)
            throws IOException {
        return getStreamForDetectionOnly(stream, maxLength, new Metadata());
    }

    /**
     * Checks if the given metadata indicates that the content was truncated for detection.
     *
     * @param metadata the metadata to check
     * @return true if {@link TikaCoreProperties#TRUNCATED_CONTENT_FOR_DETECTION} is present
     *         and parses as {@code true}; false otherwise, including when it is not set
     */
    public static boolean isContentTruncatedForDetection(Metadata metadata) {
        String value = metadata.get(TikaCoreProperties.TRUNCATED_CONTENT_FOR_DETECTION);
        // Boolean.parseBoolean treats null (property absent) as false.
        return Boolean.parseBoolean(value);
    }

    /**
     * Gets the number of bytes buffered for detection, as recorded under
     * {@link TikaCoreProperties#DETECTION_CONTENT_LENGTH}.
     *
     * @param metadata the metadata to check
     * @return the number of bytes buffered, or -1 if not set
     */
    public static int getDetectionContentLength(Metadata metadata) {
        Integer value = metadata.getInt(TikaCoreProperties.DETECTION_CONTENT_LENGTH);
        return value != null ? value : -1;
    }
}