// TempFileUnpackHandler.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.pipes.core.extractor;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.api.emitter.EmitKey;

/**
 * An UnpackHandler that writes embedded bytes to a temporary directory
 * for later zipping. Each embedded file is stored directly in that directory
 * under the last path component of its emit key; the embedded id is only
 * mixed into the name when that is needed to keep names usable and unique.
 * <p>
 * {@link #close()} deletes the temporary directory and everything in it, so
 * callers must finish reading the stored files before closing the handler.
 */
public class TempFileUnpackHandler extends AbstractUnpackHandler
        implements Closeable {

    /** Prefix of the on-disk file name used for the original container document. */
    private static final String ORIGINAL_PREFIX = "_original_";

    private final Path tempDirectory;
    private final EmitKey containerEmitKey;
    private final UnpackConfig unpackConfig;
    private final List<EmbeddedFileInfo> embeddedFiles = new ArrayList<>();
    private Path originalDocumentPath;
    private String originalDocumentName;
    private boolean closed = false;

    /**
     * Information about an embedded file stored in the temp directory.
     *
     * @param id the embedded document id passed to {@link #add}
     * @param fileName the flat file name used inside the temp directory
     * @param filePath the full path of the stored file
     * @param metadata the metadata passed to {@link #add}
     */
    public record EmbeddedFileInfo(int id, String fileName, Path filePath, Metadata metadata) {
    }

    /**
     * Creates the handler and its backing temporary directory.
     *
     * @param containerEmitKey emit key of the container document; used as the base
     *                         when computing emit keys for embedded files
     * @param unpackConfig configuration passed through to the emit key computation
     * @throws IOException if the temporary directory cannot be created
     */
    public TempFileUnpackHandler(EmitKey containerEmitKey,
                                 UnpackConfig unpackConfig) throws IOException {
        this.containerEmitKey = containerEmitKey;
        this.unpackConfig = unpackConfig;
        this.tempDirectory = Files.createTempDirectory("tika-unpack-");
    }

    /**
     * Copies the bytes of one embedded document into the temp directory and
     * records it in the list returned by {@link #getEmbeddedFiles()}.
     * The file is only recorded if the copy completes without an exception.
     *
     * @param id the embedded document id
     * @param metadata metadata of the embedded document
     * @param inputStream the embedded bytes; read to the end but not closed here
     * @throws IOException if the bytes cannot be written
     */
    @Override
    public void add(int id, Metadata metadata, InputStream inputStream) throws IOException {
        super.add(id, metadata, inputStream);

        // Generate the file name based on emit key logic
        String emitKey = getEmitKey(containerEmitKey.getEmitKey(), id, unpackConfig, metadata);

        // Flatten the path for zip entry name - use just the filename portion,
        // made unique within the temp directory
        String fileName = flattenFileName(emitKey, id);

        // Write to temp file
        Path tempFile = tempDirectory.resolve(fileName);
        try (OutputStream os = Files.newOutputStream(tempFile)) {
            inputStream.transferTo(os);
        }

        embeddedFiles.add(new EmbeddedFileInfo(id, fileName, tempFile, metadata));
    }

    /**
     * Flattens an emit key path to a simple filename suitable for a zip entry.
     * <p>
     * The last path component of the emit key is used as-is whenever possible.
     * If that component is empty, {@code "."} or {@code ".."}, an id-based name
     * is substituted so the result always denotes a regular file inside the temp
     * directory. If a file with the chosen name already exists there, the id
     * (and, if still necessary, a counter) is prepended so an earlier file is
     * never overwritten.
     *
     * @param emitKey the full emit key of the embedded file
     * @param id the embedded document id, used for fallback and disambiguation
     * @return a file name without path separators that is unused in the temp directory
     */
    private String flattenFileName(String emitKey, int id) {
        String baseName = lastPathComponent(emitKey);
        if (baseName.isEmpty() || ".".equals(baseName) || "..".equals(baseName)) {
            // These would resolve to the temp directory itself or to its parent.
            baseName = "embedded-" + id;
        }

        String candidate = baseName;
        if (Files.exists(tempDirectory.resolve(candidate))) {
            candidate = id + "-" + baseName;
        }
        // Extremely unlikely, but keep going until the name is really free.
        for (int attempt = 1; Files.exists(tempDirectory.resolve(candidate)); attempt++) {
            candidate = id + "-" + attempt + "-" + baseName;
        }
        return candidate;
    }

    /**
     * Returns the last component of a path-like name, treating both {@code '/'}
     * and {@code '\\'} as separators and ignoring trailing separators.
     *
     * @param name the name to reduce; may be {@code null}
     * @return the last component, or an empty string if there is none
     */
    private static String lastPathComponent(String name) {
        if (name == null) {
            return "";
        }
        int end = name.length();
        while (end > 0 && isSeparator(name.charAt(end - 1))) {
            end--;
        }
        int start = end;
        while (start > 0 && !isSeparator(name.charAt(start - 1))) {
            start--;
        }
        return name.substring(start, end);
    }

    /** Returns true for the two path separator characters recognised when flattening. */
    private static boolean isSeparator(char c) {
        return c == '/' || c == '\\';
    }

    /**
     * Returns the temporary directory where embedded files are stored.
     */
    public Path getTempDirectory() {
        return tempDirectory;
    }

    /**
     * Returns information about all embedded files stored, in the order they
     * were added. This is the handler's own list, not a copy.
     */
    public List<EmbeddedFileInfo> getEmbeddedFiles() {
        return embeddedFiles;
    }

    /**
     * Returns true if there are any embedded files stored.
     */
    public boolean hasEmbeddedFiles() {
        return !embeddedFiles.isEmpty();
    }

    /**
     * Stores the original container document for inclusion in the zip.
     * Call this before parsing if includeOriginal is enabled.
     * <p>
     * Only the last path component of {@code fileName} is used for the file on
     * disk, so a name containing path separators can neither fail on a missing
     * subdirectory nor point outside the temp directory.
     * {@link #getOriginalDocumentName()} still returns {@code fileName} exactly
     * as given. The original document is only reported as stored once the copy
     * has completed without an exception.
     *
     * @param inputStream the original document input stream; read to the end but not closed here
     * @param fileName the file name for the original document
     * @throws IOException if the document cannot be written
     */
    public void storeOriginalDocument(InputStream inputStream, String fileName) throws IOException {
        Path target = tempDirectory.resolve(ORIGINAL_PREFIX + lastPathComponent(fileName));
        try (OutputStream os = Files.newOutputStream(target)) {
            inputStream.transferTo(os);
        }
        // Publish only after a successful copy so a failed write is not reported as stored.
        this.originalDocumentName = fileName;
        this.originalDocumentPath = target;
    }

    /**
     * Returns the path to the original document if stored, otherwise {@code null}.
     */
    public Path getOriginalDocumentPath() {
        return originalDocumentPath;
    }

    /**
     * Returns the name of the original document if stored, otherwise {@code null}.
     */
    public String getOriginalDocumentName() {
        return originalDocumentName;
    }

    /**
     * Returns true if the original document was stored and its file still exists.
     */
    public boolean hasOriginalDocument() {
        return originalDocumentPath != null && Files.exists(originalDocumentPath);
    }

    /**
     * Deletes the temporary directory and all files in it. Only the first call
     * attempts the deletion; later calls do nothing.
     *
     * @throws IOException if the directory cannot be deleted
     */
    @Override
    public void close() throws IOException {
        if (!closed) {
            closed = true;
            // Clean up temp directory - caller should have already zipped if needed
            FileUtils.deleteDirectory(tempDirectory.toFile());
        }
    }
}