SpoolingStrategy.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.io;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;

/**
 * Strategy for determining when to spool a TikaInputStream to disk.
 * <p>
 * Components (detectors, parsers) can check this strategy before calling
 * {@link TikaInputStream#getFile()} to determine if spooling is appropriate
 * for the given media type.
 * <p>
 * Default behavior (when no strategy is in ParseContext): components spool when needed.
 * A strategy allows fine-grained control over spooling decisions.
 * <p>
 * Unless replaced via {@link #setSpoolTypes(Set)}, the spool types are
 * {@code application/zip}, {@code application/x-tika-msoffice},
 * {@code application/x-bplist} and {@code application/pdf}.
 * <p>
 * Configure via JSON:
 * <pre>
 * {
 *   "spooling-strategy": {
 *     "spoolTypes": ["application/zip", "application/x-tika-msoffice", "application/pdf"]
 *   }
 * }
 * </pre>
 */
@TikaComponent(spi = false)
public class SpoolingStrategy {

    /** Immutable set of types spooled when no explicit configuration is given. */
    private static final Set<MediaType> DEFAULT_SPOOL_TYPES = Set.of(
            MediaType.application("zip"),
            MediaType.application("x-tika-msoffice"),
            MediaType.application("x-bplist"),
            MediaType.application("pdf"));

    // Private mutable copy; never handed out directly (see getSpoolTypes()).
    private Set<MediaType> spoolTypes = new HashSet<>(DEFAULT_SPOOL_TYPES);

    // Optional; when null, only exact and base-type matches are performed.
    private MediaTypeRegistry mediaTypeRegistry;

    /**
     * Determines whether the stream should be spooled to disk.
     * <p>
     * A stream that is already backed by a file is never spooled; otherwise the
     * decision depends solely on whether {@code mediaType} matches one of the
     * configured spool types.
     *
     * @param tis       the TikaInputStream; may be {@code null}, in which case only
     *                  the media type is considered
     * @param metadata  metadata (can check content-type hints, filename);
     *                  not consulted by this implementation
     * @param mediaType the detected or declared media type; {@code null} never matches
     * @return true if the stream should be spooled to disk
     */
    public boolean shouldSpool(TikaInputStream tis, Metadata metadata, MediaType mediaType) {
        // Already has file? No need to spool
        if (tis != null && tis.hasFile()) {
            return false;
        }
        // Check type against spoolTypes
        return matchesSpoolType(mediaType);
    }

    /**
     * Checks the given type against the configured spool types: first the type
     * as given, then its base type (parameters stripped), and finally, only if a
     * registry has been set, whether it is a specialization of any spool type.
     *
     * @param type the media type to test; may be {@code null}
     * @return true if the type matches a spool type
     */
    private boolean matchesSpoolType(MediaType type) {
        if (type == null) {
            return false;
        }
        // Exact match, or base type match (without parameters)
        if (spoolTypes.contains(type) || spoolTypes.contains(type.getBaseType())) {
            return true;
        }
        // Specialization check is only possible when a registry is available
        if (mediaTypeRegistry == null) {
            return false;
        }
        for (MediaType spoolType : spoolTypes) {
            if (mediaTypeRegistry.isSpecializationOf(type, spoolType)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Sets the media types that should be spooled to disk.
     * <p>
     * The given set is copied, so later changes to it do not affect this strategy.
     * Specializations of these types are also matched, but only when a registry
     * has been provided via {@link #setMediaTypeRegistry(MediaTypeRegistry)}.
     *
     * @param spoolTypes set of media types to spool; {@code null} is treated as
     *                   an empty set (nothing is spooled)
     */
    public void setSpoolTypes(Set<MediaType> spoolTypes) {
        this.spoolTypes = spoolTypes != null ? new HashSet<>(spoolTypes) : new HashSet<>();
    }

    /**
     * Returns the media types that should be spooled to disk.
     * <p>
     * The returned set is a read-only view; attempts to modify it throw
     * {@link UnsupportedOperationException}. Use {@link #setSpoolTypes(Set)}
     * to change the configured types.
     *
     * @return unmodifiable view of the media types to spool, never {@code null}
     */
    public Set<MediaType> getSpoolTypes() {
        // Mirror the defensive copy in setSpoolTypes: do not leak the mutable field.
        return Collections.unmodifiableSet(spoolTypes);
    }

    /**
     * Sets the media type registry used for checking type specializations.
     *
     * @param registry the media type registry; {@code null} disables
     *                 specialization matching
     */
    public void setMediaTypeRegistry(MediaTypeRegistry registry) {
        this.mediaTypeRegistry = registry;
    }

    /**
     * Returns the media type registry.
     *
     * @return the media type registry, or null if not set
     */
    public MediaTypeRegistry getMediaTypeRegistry() {
        return mediaTypeRegistry;
    }
}