UnpackConfig.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.pipes.core.extractor;

import java.io.Serializable;
import java.util.Objects;

import org.apache.tika.config.TikaComponent;

/**
 * Mutable, serializable configuration bean describing how embedded files
 * are unpacked and emitted.
 * <p>
 * Every property has a default (see the individual getters). Enum-valued
 * properties can be set either with the enum constant or with its name as a
 * string; the string form is matched case-insensitively via the enum's
 * {@code parse} helper.
 * <p>
 * This class performs no validation of numeric or string values; they are
 * stored exactly as supplied.
 */
@TikaComponent(name = "unpack-config")
public class UnpackConfig implements Serializable {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -3861669115439125268L;

    /**
     * Default maximum bytes to unpack per file: 10 GB.
     * Use -1 to disable the limit (not recommended).
     */
    public static final long DEFAULT_MAX_UNPACK_BYTES = 10L * 1024L * 1024L * 1024L;

    /**
     * Strategy for the suffix applied to emitted embedded files.
     * NOTE(review): the exact meaning of each constant is defined by the code
     * that consumes this config, not by this class.
     */
    public enum SUFFIX_STRATEGY {
        NONE, EXISTING, DETECTED;

        /**
         * Resolves a constant from its name, ignoring case.
         *
         * @param s name of the constant, e.g. {@code "none"} or {@code "DETECTED"}
         * @return the matching constant
         * @throws NullPointerException if {@code s} is null
         * @throws IllegalArgumentException if {@code s} matches no constant
         */
        public static SUFFIX_STRATEGY parse(String s) {
            Objects.requireNonNull(s, "SUFFIX_STRATEGY name must not be null");
            for (SUFFIX_STRATEGY candidate : values()) {
                if (s.equalsIgnoreCase(candidate.name())) {
                    return candidate;
                }
            }
            throw new IllegalArgumentException("can't parse " + s);
        }
    }

    /**
     * Strategy for deriving the base of the emit key for embedded files.
     */
    public enum KEY_BASE_STRATEGY {
        /**
         * Default pattern: {containerKey}-embed/{id}{suffix}
         */
        DEFAULT,
        /**
         * Custom pattern using emitKeyBase
         */
        CUSTOM;

        /**
         * Resolves a constant from its name, ignoring case.
         *
         * @param s name of the constant, e.g. {@code "default"} or {@code "CUSTOM"}
         * @return the matching constant
         * @throws NullPointerException if {@code s} is null
         * @throws IllegalArgumentException if {@code s} matches no constant
         */
        public static KEY_BASE_STRATEGY parse(String s) {
            Objects.requireNonNull(s, "KEY_BASE_STRATEGY name must not be null");
            for (KEY_BASE_STRATEGY candidate : values()) {
                if (s.equalsIgnoreCase(candidate.name())) {
                    return candidate;
                }
            }
            throw new IllegalArgumentException("can't parse " + s);
        }
    }

    /**
     * Output format for UNPACK mode.
     */
    public enum OUTPUT_FORMAT {
        /**
         * Regular output - embedded files emitted individually or as simple zip
         */
        REGULAR,
        /**
         * Frictionless Data Package format with datapackage.json manifest,
         * SHA256 hashes, mimetypes, and files in unpacked/ subdirectory
         */
        FRICTIONLESS;

        /**
         * Resolves a constant from its name, ignoring case.
         *
         * @param s name of the constant, e.g. {@code "regular"} or {@code "FRICTIONLESS"}
         * @return the matching constant
         * @throws NullPointerException if {@code s} is null
         * @throws IllegalArgumentException if {@code s} matches no constant
         */
        public static OUTPUT_FORMAT parse(String s) {
            Objects.requireNonNull(s, "OUTPUT_FORMAT name must not be null");
            for (OUTPUT_FORMAT candidate : values()) {
                if (s.equalsIgnoreCase(candidate.name())) {
                    return candidate;
                }
            }
            throw new IllegalArgumentException("can't parse OUTPUT_FORMAT: " + s);
        }
    }

    /**
     * Output mode for how embedded files are delivered.
     */
    public enum OUTPUT_MODE {
        /**
         * Package all files into a single zip archive
         */
        ZIPPED,
        /**
         * Emit files directly to the configured emitter as separate items
         */
        DIRECTORY;

        /**
         * Resolves a constant from its name, ignoring case.
         *
         * @param s name of the constant, e.g. {@code "zipped"} or {@code "DIRECTORY"}
         * @return the matching constant
         * @throws NullPointerException if {@code s} is null
         * @throws IllegalArgumentException if {@code s} matches no constant
         */
        public static OUTPUT_MODE parse(String s) {
            Objects.requireNonNull(s, "OUTPUT_MODE name must not be null");
            for (OUTPUT_MODE candidate : values()) {
                if (s.equalsIgnoreCase(candidate.name())) {
                    return candidate;
                }
            }
            throw new IllegalArgumentException("can't parse OUTPUT_MODE: " + s);
        }
    }

    // Defaults to 0.
    // NOTE(review): presumably the zero-padding width for generated names; confirm with the consumer.
    private int zeroPadName = 0;

    private SUFFIX_STRATEGY suffixStrategy = SUFFIX_STRATEGY.NONE;

    private String embeddedIdPrefix = "-";

    // Name of the emitter to use; null until explicitly configured.
    private String emitter;

    private boolean includeOriginal = false;

    private KEY_BASE_STRATEGY keyBaseStrategy = KEY_BASE_STRATEGY.DEFAULT;
    //This should be set per file. This allows a custom
    //emit key base that bypasses the algorithmic generation of the emitKey
    //from the primary json emitKey when keyBase Strategy is CUSTOM
    private String emitKeyBase = "";

    // Zipping options
    private boolean zipEmbeddedFiles = false;
    private boolean includeMetadataInZip = false;

    // Maximum bytes to unpack per file (default 10GB, -1 to disable limit)
    private long maxUnpackBytes = DEFAULT_MAX_UNPACK_BYTES;

    // Frictionless Data Package options
    private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.REGULAR;
    private OUTPUT_MODE outputMode = OUTPUT_MODE.ZIPPED;
    private boolean includeFullMetadata = false;  // Include metadata.json in Frictionless output

    /**
     * Create an UnpackConfig with default settings.
     */
    public UnpackConfig() {
    }

    /**
     * @return the configured zero-pad value; defaults to 0
     */
    public int getZeroPadName() {
        return zeroPadName;
    }

    /**
     * @return the suffix strategy; defaults to {@link SUFFIX_STRATEGY#NONE}
     */
    public SUFFIX_STRATEGY getSuffixStrategy() {
        return suffixStrategy;
    }

    /**
     * @return the key base strategy; defaults to {@link KEY_BASE_STRATEGY#DEFAULT}
     */
    public KEY_BASE_STRATEGY getKeyBaseStrategy() {
        return keyBaseStrategy;
    }

    /**
     * @return the embedded id prefix; defaults to {@code "-"}
     */
    public String getEmbeddedIdPrefix() {
        return embeddedIdPrefix;
    }

    /**
     * @return the configured emitter name, or {@code null} if none was set
     */
    public String getEmitter() {
        return emitter;
    }

    /**
     * @return the include-original flag; defaults to {@code false}
     */
    public boolean isIncludeOriginal() {
        return includeOriginal;
    }

    /**
     * @param zeroPadName the zero-pad value to store (not validated)
     */
    public void setZeroPadName(int zeroPadName) {
        this.zeroPadName = zeroPadName;
    }

    /**
     * @param suffixStrategy the suffix strategy to store (stored as-is, may be null)
     */
    public void setSuffixStrategy(SUFFIX_STRATEGY suffixStrategy) {
        this.suffixStrategy = suffixStrategy;
    }

    /**
     * Sets the suffix strategy from its name, ignoring case.
     *
     * @param suffixStrategy name of a {@link SUFFIX_STRATEGY} constant
     * @throws NullPointerException if the name is null
     * @throws IllegalArgumentException if the name matches no constant
     */
    public void setSuffixStrategy(String suffixStrategy) {
        // parse() rather than valueOf(): valueOf is case-sensitive and would
        // reject values such as "none" that the enum's own parser accepts.
        setSuffixStrategy(SUFFIX_STRATEGY.parse(suffixStrategy));
    }

    /**
     * @param keyBaseStrategy the key base strategy to store (stored as-is, may be null)
     */
    public void setKeyBaseStrategy(KEY_BASE_STRATEGY keyBaseStrategy) {
        this.keyBaseStrategy = keyBaseStrategy;
    }

    /**
     * Sets the key base strategy from its name, ignoring case.
     *
     * @param keyBaseStrategy name of a {@link KEY_BASE_STRATEGY} constant
     * @throws NullPointerException if the name is null
     * @throws IllegalArgumentException if the name matches no constant
     */
    public void setKeyBaseStrategy(String keyBaseStrategy) {
        // Case-insensitive, consistent with KEY_BASE_STRATEGY.parse.
        setKeyBaseStrategy(KEY_BASE_STRATEGY.parse(keyBaseStrategy));
    }

    /**
     * @param embeddedIdPrefix the embedded id prefix to store (not validated)
     */
    public void setEmbeddedIdPrefix(String embeddedIdPrefix) {
        this.embeddedIdPrefix = embeddedIdPrefix;
    }

    /**
     * @param emitter the emitter name to store (not validated)
     */
    public void setEmitter(String emitter) {
        this.emitter = emitter;
    }

    /**
     * @param includeOriginal the include-original flag to store
     */
    public void setIncludeOriginal(boolean includeOriginal) {
        this.includeOriginal = includeOriginal;
    }

    /**
     * Sets the custom emit key base, intended for use when the key base
     * strategy is {@link KEY_BASE_STRATEGY#CUSTOM}.
     *
     * @param emitKeyBase the emit key base to store (not validated)
     */
    public void setEmitKeyBase(String emitKeyBase) {
        this.emitKeyBase = emitKeyBase;
    }

    /**
     * @return the custom emit key base; defaults to the empty string
     */
    public String getEmitKeyBase() {
        return emitKeyBase;
    }

    /**
     * Whether to zip all embedded files into a single archive before emitting.
     * When true, embedded files are collected during parsing and then zipped
     * and emitted as a single archive after parsing completes.
     */
    public boolean isZipEmbeddedFiles() {
        return zipEmbeddedFiles;
    }

    /**
     * @param zipEmbeddedFiles whether embedded files should be zipped into one archive
     */
    public void setZipEmbeddedFiles(boolean zipEmbeddedFiles) {
        this.zipEmbeddedFiles = zipEmbeddedFiles;
    }

    /**
     * Whether to include the metadata JSON for each embedded document in the zip file.
     * Only applicable when {@link #isZipEmbeddedFiles()} is true.
     */
    public boolean isIncludeMetadataInZip() {
        return includeMetadataInZip;
    }

    /**
     * @param includeMetadataInZip whether per-document metadata JSON goes into the zip
     */
    public void setIncludeMetadataInZip(boolean includeMetadataInZip) {
        this.includeMetadataInZip = includeMetadataInZip;
    }

    /**
     * Maximum total bytes to unpack per file. Default is 10GB.
     * Set to -1 to disable the limit (not recommended).
     *
     * @return max bytes to unpack, or -1 if no limit
     */
    public long getMaxUnpackBytes() {
        return maxUnpackBytes;
    }

    /**
     * @param maxUnpackBytes max bytes to unpack per file, or -1 for no limit
     *                       (the value is stored without validation)
     */
    public void setMaxUnpackBytes(long maxUnpackBytes) {
        this.maxUnpackBytes = maxUnpackBytes;
    }

    /**
     * Get the output format for UNPACK mode.
     * REGULAR is the default (existing behavior).
     * FRICTIONLESS creates a Frictionless Data Package with datapackage.json manifest.
     */
    public OUTPUT_FORMAT getOutputFormat() {
        return outputFormat;
    }

    /**
     * @param outputFormat the output format to store (stored as-is, may be null)
     */
    public void setOutputFormat(OUTPUT_FORMAT outputFormat) {
        this.outputFormat = outputFormat;
    }

    /**
     * Sets the output format from its name, ignoring case.
     *
     * @param outputFormat name of an {@link OUTPUT_FORMAT} constant
     * @throws NullPointerException if the name is null
     * @throws IllegalArgumentException if the name matches no constant
     */
    public void setOutputFormat(String outputFormat) {
        // Case-insensitive, consistent with OUTPUT_FORMAT.parse.
        setOutputFormat(OUTPUT_FORMAT.parse(outputFormat));
    }

    /**
     * Get the output mode for how embedded files are delivered.
     * ZIPPED packages all files into a single zip archive.
     * DIRECTORY emits files directly to the configured emitter.
     */
    public OUTPUT_MODE getOutputMode() {
        return outputMode;
    }

    /**
     * @param outputMode the output mode to store (stored as-is, may be null)
     */
    public void setOutputMode(OUTPUT_MODE outputMode) {
        this.outputMode = outputMode;
    }

    /**
     * Sets the output mode from its name, ignoring case.
     *
     * @param outputMode name of an {@link OUTPUT_MODE} constant
     * @throws NullPointerException if the name is null
     * @throws IllegalArgumentException if the name matches no constant
     */
    public void setOutputMode(String outputMode) {
        // Case-insensitive, consistent with OUTPUT_MODE.parse.
        setOutputMode(OUTPUT_MODE.parse(outputMode));
    }

    /**
     * Whether to include full RMETA-style metadata in metadata.json.
     * Only applicable when outputFormat is FRICTIONLESS.
     */
    public boolean isIncludeFullMetadata() {
        return includeFullMetadata;
    }

    /**
     * @param includeFullMetadata whether metadata.json should carry full metadata
     */
    public void setIncludeFullMetadata(boolean includeFullMetadata) {
        this.includeFullMetadata = includeFullMetadata;
    }

    /**
     * Renders every configured property in a single-line, human-readable form.
     */
    @Override
    public String toString() {
        return "UnpackConfig{" + "zeroPadName=" + zeroPadName + ", suffixStrategy=" +
                suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix + '\'' +
                ", emitter='" + emitter + '\'' + ", includeOriginal=" + includeOriginal +
                ", keyBaseStrategy=" + keyBaseStrategy + ", emitKeyBase='" + emitKeyBase + '\'' +
                ", zipEmbeddedFiles=" + zipEmbeddedFiles + ", includeMetadataInZip=" + includeMetadataInZip +
                ", maxUnpackBytes=" + maxUnpackBytes + ", outputFormat=" + outputFormat +
                ", outputMode=" + outputMode + ", includeFullMetadata=" + includeFullMetadata + '}';
    }

    /**
     * Two configs are equal when all thirteen properties are equal.
     * Declared {@code final} so subclasses cannot override it.
     */
    @Override
    public final boolean equals(Object o) {
        if (!(o instanceof UnpackConfig config)) {
            return false;
        }

        return zeroPadName == config.zeroPadName && includeOriginal == config.includeOriginal &&
                suffixStrategy == config.suffixStrategy &&
                Objects.equals(embeddedIdPrefix, config.embeddedIdPrefix) &&
                Objects.equals(emitter, config.emitter) &&
                keyBaseStrategy == config.keyBaseStrategy &&
                Objects.equals(emitKeyBase, config.emitKeyBase) &&
                zipEmbeddedFiles == config.zipEmbeddedFiles &&
                includeMetadataInZip == config.includeMetadataInZip &&
                maxUnpackBytes == config.maxUnpackBytes &&
                outputFormat == config.outputFormat &&
                outputMode == config.outputMode &&
                includeFullMetadata == config.includeFullMetadata;
    }

    /**
     * Hash over the same thirteen properties that {@link #equals(Object)} compares.
     */
    @Override
    public int hashCode() {
        int result = zeroPadName;
        result = 31 * result + Objects.hashCode(suffixStrategy);
        result = 31 * result + Objects.hashCode(embeddedIdPrefix);
        result = 31 * result + Objects.hashCode(emitter);
        result = 31 * result + Boolean.hashCode(includeOriginal);
        result = 31 * result + Objects.hashCode(keyBaseStrategy);
        result = 31 * result + Objects.hashCode(emitKeyBase);
        result = 31 * result + Boolean.hashCode(zipEmbeddedFiles);
        result = 31 * result + Boolean.hashCode(includeMetadataInZip);
        result = 31 * result + Long.hashCode(maxUnpackBytes);
        result = 31 * result + Objects.hashCode(outputFormat);
        result = 31 * result + Objects.hashCode(outputMode);
        result = 31 * result + Boolean.hashCode(includeFullMetadata);
        return result;
    }
}