ZipParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pkg;

import static org.apache.tika.detect.zip.PackageConstants.JAR;
import static org.apache.tika.detect.zip.PackageConstants.ZIP;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.attribute.FileTime;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.Zip;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Parser for ZIP and JAR archives using file-based access for complete metadata extraction.
 * <p>
 * This parser handles:
 * <ul>
 *   <li>Standard ZIP archives</li>
 *   <li>JAR (Java Archive) files</li>
 *   <li>Archive and entry comments</li>
 *   <li>Unix permissions and file attributes</li>
 *   <li>Charset detection for non-Unicode entry names</li>
 *   <li>Encryption detection</li>
 * </ul>
 * <p>
 * This parser prefers file-based access (ZipFile) for complete metadata extraction,
 * but falls back to streaming (ZipArchiveInputStream) for edge-case ZIPs that
 * cannot be read as files (e.g., those with data descriptors that overlap the
 * central directory).
 *
 * <h2>Truncated and Corrupted Files</h2>
 * <p>
 * This parser does not perform ZIP salvaging directly. When used with
 * {@link org.apache.tika.parser.AutoDetectParser}, the
 * {@link org.apache.tika.detect.zip.DefaultZipContainerDetector} handles salvaging
 * of truncated/corrupted files and provides the prepared {@link ZipFile} via
 * {@link TikaInputStream#getOpenContainer()}.
 * <p>
 * <b>Note:</b> If you call this parser directly without going through the detector,
 * truncated or corrupted ZIP files may fail to parse. For best results with
 * untrusted content, use {@link org.apache.tika.parser.AutoDetectParser}.
 */
@TikaComponent()
public class ZipParser extends AbstractArchiveParser {

    /**
     * Set of media types that are specializations of ZIP (e.g., Office documents, EPUB, APK).
     * Used to avoid overwriting more specific media types with generic "application/zip".
     */
    public static final Set<MediaType> ZIP_SPECIALIZATIONS = loadZipSpecializations();

    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = -5331043266963888709L;

    /** The media types returned by {@link #getSupportedTypes(ParseContext)}: ZIP and JAR. */
    private static final Set<MediaType> SUPPORTED_TYPES = MediaType.set(ZIP, JAR);

    /**
     * Maximum number of entries to record in integrity check metadata fields.
     * Prevents excessive metadata in ZIPs with many discrepancies.
     * The cap is applied to each recorded list (duplicates, local-header-only,
     * central-directory-only) independently.
     */
    private static final int MAX_INTEGRITY_CHECK_ENTRIES = 100;


    /**
     * Configuration used by {@code parse} when the {@link ParseContext} does not
     * carry its own {@link ZipParserConfig}.
     */
    private final ZipParserConfig defaultConfig;

    /**
     * Builds the set exposed as {@link #ZIP_SPECIALIZATIONS}.
     * <p>
     * Each hard-coded media type string is run through {@link MediaType#parse(String)}
     * and the results are collected into a set that is wrapped as unmodifiable
     * before being returned.
     *
     * @return an unmodifiable set of the parsed ZIP-based media types
     */
    private static Set<MediaType> loadZipSpecializations() {
        // Media types whose container format is ZIP.
        final String[] specializationNames = {
                "application/bizagi-modeler",
                "application/epub+zip",
                "application/hwp+zip",
                "application/java-archive",
                "application/vnd.adobe.air-application-installer-package+zip",
                "application/vnd.android.package-archive",
                "application/vnd.apple.iwork",
                "application/vnd.apple.keynote",
                "application/vnd.apple.numbers",
                "application/vnd.apple.pages",
                "application/vnd.apple.unknown.13",
                "application/vnd.etsi.asic-e+zip",
                "application/vnd.etsi.asic-s+zip",
                "application/vnd.google-earth.kmz",
                "application/vnd.mindjet.mindmanager",
                "application/vnd.ms-excel.addin.macroenabled.12",
                "application/vnd.ms-excel.sheet.binary.macroenabled.12",
                "application/vnd.ms-excel.sheet.macroenabled.12",
                "application/vnd.ms-excel.template.macroenabled.12",
                "application/vnd.ms-powerpoint.addin.macroenabled.12",
                "application/vnd.ms-powerpoint.presentation.macroenabled.12",
                "application/vnd.ms-powerpoint.slide.macroenabled.12",
                "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
                "application/vnd.ms-powerpoint.template.macroenabled.12",
                "application/vnd.ms-visio.drawing",
                "application/vnd.ms-visio.drawing.macroenabled.12",
                "application/vnd.ms-visio.stencil",
                "application/vnd.ms-visio.stencil.macroenabled.12",
                "application/vnd.ms-visio.template",
                "application/vnd.ms-visio.template.macroenabled.12",
                "application/vnd.ms-word.document.macroenabled.12",
                "application/vnd.ms-word.template.macroenabled.12",
                "application/vnd.ms-xpsdocument",
                "application/vnd.oasis.opendocument.formula",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "application/vnd.openxmlformats-officedocument.presentationml.slide",
                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
                "application/vnd.openxmlformats-officedocument.presentationml.template",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
                "application/x-ibooks+zip",
                "application/x-itunes-ipa",
                "application/x-tika-iworks-protected",
                "application/x-tika-java-enterprise-archive",
                "application/x-tika-java-web-archive",
                "application/x-tika-ooxml",
                "application/x-tika-visio-ooxml",
                "application/x-xliff+zip",
                "application/x-xmind",
                "model/vnd.dwfx+xps",
                "application/vnd.sun.xml.calc",
                "application/vnd.sun.xml.writer",
                "application/vnd.sun.xml.writer.template",
                "application/vnd.sun.xml.draw",
                "application/vnd.sun.xml.impress",
                "application/vnd.openofficeorg.autotext",
                "application/vnd.oasis.opendocument.graphics-template",
                "application/vnd.oasis.opendocument.text-web",
                "application/vnd.oasis.opendocument.spreadsheet-template",
                "application/vnd.oasis.opendocument.graphics",
                "application/vnd.oasis.opendocument.image-template",
                "application/vnd.oasis.opendocument.text",
                "application/vnd.oasis.opendocument.text-template",
                "application/vnd.oasis.opendocument.presentation",
                "application/vnd.oasis.opendocument.chart",
                "application/vnd.openofficeorg.extension",
                "application/vnd.oasis.opendocument.spreadsheet",
                "application/vnd.oasis.opendocument.image",
                "application/vnd.oasis.opendocument.formula-template",
                "application/vnd.oasis.opendocument.presentation-template",
                "application/vnd.oasis.opendocument.chart-template",
                "application/vnd.oasis.opendocument.text-master",
                "application/vnd.adobe.indesign-idml-package",
                "application/x-wacz",
                "application/x-vnd.datapackage+zip"
        };

        Set<MediaType> parsedTypes = new HashSet<>();
        for (int i = 0; i < specializationNames.length; i++) {
            parsedTypes.add(MediaType.parse(specializationNames[i]));
        }
        return Collections.unmodifiableSet(parsedTypes);
    }

    /**
     * Creates a parser whose default configuration is a freshly constructed
     * {@link ZipParserConfig}.
     */
    public ZipParser() {
        this(new ZipParserConfig());
    }

    /**
     * Creates a parser with the given default configuration.
     *
     * @param config the configuration stored as this parser's default
     */
    public ZipParser(ZipParserConfig config) {
        defaultConfig = config;
    }

    /**
     * Creates a parser from JSON-based configuration by deserializing the
     * given {@link JsonConfig} into a {@link ZipParserConfig}.
     *
     * @param jsonConfig the JSON configuration to deserialize
     * @throws TikaConfigException if the configuration cannot be built
     */
    public ZipParser(JsonConfig jsonConfig) throws TikaConfigException {
        this(ConfigDeserializer.buildConfig(jsonConfig, ZipParserConfig.class));
    }

    /**
     * Creates a parser that hands the given encoding detector to the superclass
     * and uses a freshly constructed {@link ZipParserConfig} as its default.
     *
     * @param encodingDetector the encoding detector passed to the superclass constructor
     */
    public ZipParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
        defaultConfig = new ZipParserConfig();
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context (not consulted by this implementation)
     * @return the fixed set built from the {@code ZIP} and {@code JAR} media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a ZIP or JAR archive, preferring file-based access and falling back
     * to streaming.
     * <p>
     * The configuration is taken from the parse context, or from this parser's
     * default when the context has none. If the stream already carries a
     * {@link ZipFile} as its open container, that instance is used. Otherwise a
     * ZipFile is opened from the stream's file and registered as the open
     * container. If opening fails with an {@link IOException}, the archive is
     * read sequentially instead; data descriptor support is enabled up front
     * when the metadata value of {@code Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED}
     * is {@code "true"}.
     *
     * @param tis      the archive to parse
     * @param handler  receives the XHTML SAX events
     * @param metadata the archive-level metadata
     * @param context  the parse context
     * @throws IOException   if reading the archive fails
     * @throws SAXException  if the handler fails
     * @throws TikaException if the archive cannot be parsed
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        final ZipParserConfig config = context.get(ZipParserConfig.class, defaultConfig);

        // A ZipFile already attached to the stream (e.g. prepared or salvaged by the
        // detector) takes precedence over opening one here.
        final Object attachedContainer = tis.getOpenContainer();
        if (attachedContainer instanceof ZipFile) {
            parseWithZipFile((ZipFile) attachedContainer, tis, handler, metadata, context,
                    config);
            return;
        }

        // Nothing attached: the parser was probably invoked without the detector,
        // so try to open the file ourselves.
        ZipFile openedHere = null;
        try {
            ZipFile.Builder zipBuilder = ZipFile.builder().setFile(tis.getFile());
            if (config.getEntryEncoding() != null) {
                zipBuilder.setCharset(config.getEntryEncoding());
            }
            openedHere = zipBuilder.get();
            tis.setOpenContainer(openedHere);
        } catch (IOException ignored) {
            // Deliberate best effort: the streaming path below handles this archive.
        }

        if (openedHere == null) {
            // Streaming needs rewind support for a possible DATA_DESCRIPTOR retry.
            tis.enableRewind();
            boolean dataDescriptorRequired =
                    "true".equals(metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED));
            parseWithStream(tis, handler, metadata, context, config, dataDescriptorRequired);
            return;
        }

        parseWithZipFile(openedHere, tis, handler, metadata, context, config);
    }

    /**
     * Parses the archive through an already-open {@link ZipFile}.
     * <p>
     * Every non-directory entry is handed to {@code parseZipFileEntry}. When the
     * integrity check is enabled, the names of all entries (directories
     * included) are collected while iterating, and afterwards the stream is
     * rewound and compared against them by {@code performIntegrityCheck}.
     * <p>
     * The ZipFile is not closed here: it is the stream's open container and is
     * closed together with the TikaInputStream.
     *
     * @param zipFile  the open ZipFile to read entries from
     * @param tis      the TikaInputStream (rewound for the integrity check)
     * @param handler  the content handler
     * @param metadata the metadata
     * @param context  the parse context
     * @param config   the parser configuration
     * @throws IOException   if reading the archive fails
     * @throws SAXException  if the handler fails
     * @throws TikaException if an entry cannot be parsed
     */
    private void parseWithZipFile(ZipFile zipFile, TikaInputStream tis, ContentHandler handler,
                                   Metadata metadata, ParseContext context, ZipParserConfig config)
            throws IOException, SAXException, TikaException {

        // Only populated when the integrity check will need it.
        Set<String> namesInCentralDirectory = null;
        if (config.isIntegrityCheck()) {
            namesInCentralDirectory = new LinkedHashSet<>();
        }

        updateMediaType(zipFile, metadata);

        final EmbeddedDocumentExtractor embeddedExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        try {
            for (Enumeration<ZipArchiveEntry> cursor = zipFile.getEntries();
                    cursor.hasMoreElements(); ) {
                ZipArchiveEntry current = cursor.nextElement();
                if (namesInCentralDirectory != null) {
                    namesInCentralDirectory.add(current.getName());
                }
                if (current.isDirectory()) {
                    continue;
                }
                parseZipFileEntry(zipFile, current, embeddedExtractor, metadata, xhtml, context,
                        config);
            }
        } finally {
            xhtml.endDocument();
        }

        if (!config.isIntegrityCheck()) {
            return;
        }

        // Re-read the raw stream from the start to compare local headers
        // against the names gathered above.
        tis.enableRewind();
        tis.rewind();
        performIntegrityCheck(tis, metadata, namesInCentralDirectory, config);
    }

    /**
     * Parses the archive sequentially with a {@link ZipArchiveInputStream}.
     * <p>
     * If the first pass fails with an {@link UnsupportedZipFeatureException}
     * for {@link Feature#DATA_DESCRIPTOR} and data descriptor support was not
     * already enabled, the stream is rewound and read again with support
     * enabled; entries already counted during the first pass are skipped on
     * the second. Any other unsupported feature is rethrown.
     * <p>
     * When the integrity check is enabled, the result is recorded as
     * {@code "PARTIAL"} if no duplicate entry names were seen (there is no
     * central directory to compare against) or {@code "FAIL"} together with
     * the duplicate names otherwise.
     * <p>
     * The XHTML document is always ended, even if closing the ZIP stream fails.
     *
     * @param tis                    the TikaInputStream
     * @param handler                the content handler
     * @param metadata               the metadata
     * @param context                the parse context
     * @param config                 the parser configuration
     * @param startWithDataDescriptor whether to start with data descriptor support enabled
     * @throws IOException   if reading the archive fails
     * @throws SAXException  if the handler fails
     * @throws TikaException if an entry cannot be parsed
     */
    private void parseWithStream(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                                  ParseContext context, ZipParserConfig config,
                                  boolean startWithDataDescriptor)
            throws IOException, SAXException, TikaException {

        // Track entry names for duplicate detection during streaming
        Set<String> seenEntryNames = config.isIntegrityCheck()
                ? new LinkedHashSet<>() : null;
        List<String> duplicates = config.isIntegrityCheck()
                ? new ArrayList<>() : null;

        String encoding = config.getEntryEncoding() != null
                ? config.getEntryEncoding().name()
                : null;
        ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, encoding, true, startWithDataDescriptor);

        updateMediaType(metadata);

        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        // Counts entries fully processed in the first pass so a retry can skip them.
        AtomicInteger entryCnt = new AtomicInteger();
        try {
            parseStreamEntries(zis, metadata, extractor, xhtml, false, entryCnt, context, config,
                    seenEntryNames, duplicates);
        } catch (UnsupportedZipFeatureException zfe) {
            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR && !startWithDataDescriptor) {
                // Re-read with data descriptor support
                // NOTE(review): closing zis presumably also closes the wrapped tis; this
                // relies on tis still supporting rewind() afterwards - confirm.
                zis.close();
                tis.rewind();
                zis = new ZipArchiveInputStream(tis, encoding, true, true);
                parseStreamEntries(zis, metadata, extractor, xhtml, true, entryCnt, context, config,
                        seenEntryNames, duplicates);
            } else {
                throw zfe;
            }
        } finally {
            // Close first, as before, but never let a failing close() skip
            // endDocument() and leave the XHTML output unterminated.
            try {
                zis.close();
            } finally {
                xhtml.endDocument();
            }
        }

        // Record integrity check results (streaming only = can't compare to central directory)
        if (config.isIntegrityCheck()) {
            if (duplicates.isEmpty()) {
                // No duplicates found, but we couldn't compare to central directory
                metadata.set(Zip.INTEGRITY_CHECK_RESULT, "PARTIAL");
            } else {
                metadata.set(Zip.INTEGRITY_CHECK_RESULT, "FAIL");
                for (String dup : duplicates) {
                    metadata.add(Zip.DUPLICATE_ENTRY_NAMES, dup);
                }
            }
        }
    }

    /**
     * Iterates over the entries of a ZIP stream and parses each non-directory
     * {@link ZipArchiveEntry}.
     * <p>
     * On a first pass ({@code shouldUseDataDescriptor == false}) {@code entryCnt}
     * is incremented after every entry that was handled without an exception.
     * On a retry pass ({@code shouldUseDataDescriptor == true}) that many
     * leading entries are skipped, decrementing the counter for each, before
     * processing resumes.
     * <p>
     * If both tracking collections are non-null, the name of each processed
     * entry is remembered, and a name seen before is added to
     * {@code duplicates} (up to {@code MAX_INTEGRITY_CHECK_ENTRIES}).
     *
     * @param zis                     the ZIP stream to read entries from
     * @param metadata                the parent metadata
     * @param extractor               the embedded document extractor
     * @param xhtml                   the XHTML output
     * @param shouldUseDataDescriptor whether this is the retry pass with data descriptor support
     * @param entryCnt                number of entries completed in the first pass
     * @param context                 the parse context
     * @param config                  the parser configuration
     * @param seenEntryNames          names already processed, or null when not tracking
     * @param duplicates              repeated names, or null when not tracking
     * @throws EncryptedDocumentException       if an unsupported encryption feature is hit
     * @throws UnsupportedZipFeatureException   rethrown unchanged for DATA_DESCRIPTOR
     * @throws TikaException                    for any other unsupported ZIP feature
     * @throws IOException                      if reading the stream fails
     * @throws SAXException                     if the handler fails
     */
    private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata,
                                     EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml,
                                     boolean shouldUseDataDescriptor, AtomicInteger entryCnt,
                                     ParseContext context, ZipParserConfig config,
                                     Set<String> seenEntryNames, List<String> duplicates)
            throws TikaException, IOException, SAXException {

        try {
            for (ArchiveEntry entry = zis.getNextEntry(); entry != null;
                    entry = zis.getNextEntry()) {

                // Retry pass: step over entries the first pass already finished.
                if (shouldUseDataDescriptor && entryCnt.get() > 0) {
                    entryCnt.decrementAndGet();
                    continue;
                }

                if (entry instanceof ZipArchiveEntry && !entry.isDirectory()) {
                    parseStreamEntry(zis, (ZipArchiveEntry) entry, extractor, metadata,
                            xhtml, context, config);

                    // Names are recorded only after the entry parsed successfully, so an
                    // entry that triggers the DATA_DESCRIPTOR retry is not counted twice.
                    if (seenEntryNames != null && duplicates != null) {
                        String entryName = entry.getName();
                        boolean firstSighting = seenEntryNames.add(entryName);
                        if (!firstSighting && duplicates.size() < MAX_INTEGRITY_CHECK_ENTRIES) {
                            duplicates.add(entryName);
                        }
                    }
                }

                // Count the entry only once it has been handled without an exception.
                if (!shouldUseDataDescriptor) {
                    entryCnt.incrementAndGet();
                }
            }
        } catch (UnsupportedZipFeatureException zfe) {
            Feature unsupported = zfe.getFeature();
            if (unsupported == Feature.DATA_DESCRIPTOR) {
                // Let the caller decide whether to retry with data descriptor support.
                throw zfe;
            }
            if (unsupported == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            throw new TikaException("UnsupportedZipFeature", zfe);
        }
    }

    /**
     * Chooses between the JAR and ZIP media types for a file-based archive and
     * applies the choice unless the metadata already holds a ZIP specialization.
     * <p>
     * JAR is chosen only when the first entry returned by the ZipFile is named
     * {@code META-INF/MANIFEST.MF}; otherwise, including for an archive with
     * no entries, ZIP is chosen.
     *
     * @param zipFile  the archive whose first entry is inspected
     * @param metadata the metadata whose content type may be updated
     */
    private void updateMediaType(ZipFile zipFile, Metadata metadata) {
        Enumeration<ZipArchiveEntry> entries = zipFile.getEntries();
        boolean manifestComesFirst = entries.hasMoreElements()
                && "META-INF/MANIFEST.MF".equals(entries.nextElement().getName());
        setMediaTypeIfNotSpecialization(metadata, manifestComesFirst ? JAR : ZIP);
    }

    /**
     * Applies the generic ZIP media type unless the metadata already holds a
     * ZIP specialization. Used on the streaming path, where no entry is inspected.
     *
     * @param metadata the metadata whose content type may be updated
     */
    private void updateMediaType(Metadata metadata) {
        final MediaType genericType = ZIP;
        setMediaTypeIfNotSpecialization(metadata, genericType);
    }

    /**
     * Sets the content type to {@code type} unless the metadata already carries
     * a content type listed in {@link #ZIP_SPECIALIZATIONS}.
     * <p>
     * A missing content type, or one that {@link MediaType#parse(String)} turns
     * into {@code null}, is treated as "not a specialization" and overwritten.
     *
     * @param metadata the metadata to update
     * @param type     the media type to record
     */
    private void setMediaTypeIfNotSpecialization(Metadata metadata, MediaType type) {
        final String existing = metadata.get(Metadata.CONTENT_TYPE);
        final MediaType existingType = (existing == null) ? null : MediaType.parse(existing);

        boolean keepExisting = existingType != null && ZIP_SPECIALIZATIONS.contains(existingType);
        if (keepExisting) {
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
    }

    /**
     * Processes one entry obtained from a {@link ZipFile}.
     * <p>
     * An entry flagged as encrypted is reported through
     * {@code handleEncryptedEntry} and not parsed. Otherwise the entry's
     * metadata is built, a marker {@code div} is written, and, if the extractor
     * accepts the entry, its content is passed to the extractor.
     *
     * @param zipFile        the archive the entry belongs to
     * @param entry          the entry to process
     * @param extractor      the embedded document extractor
     * @param parentMetadata metadata of the enclosing archive
     * @param xhtml          the XHTML output
     * @param context        the parse context
     * @param config         the parser configuration
     * @throws SAXException  if the handler fails
     * @throws IOException   if reading the entry fails
     * @throws TikaException if processing the entry fails
     */
    private void parseZipFileEntry(ZipFile zipFile, ZipArchiveEntry entry,
                                    EmbeddedDocumentExtractor extractor, Metadata parentMetadata,
                                    XHTMLContentHandler xhtml, ParseContext context,
                                    ZipParserConfig config)
            throws SAXException, IOException, TikaException {

        final String entryName = detectEntryName(entry, parentMetadata, context, config);

        final boolean encrypted = entry.getGeneralPurposeBit().usesEncryption();
        if (encrypted) {
            handleEncryptedEntry(entryName, parentMetadata, xhtml);
            return;
        }

        final Metadata entryMetadata = buildEntryMetadata(entry, entryName, context);
        writeEntryXhtml(entryName, xhtml);

        if (!extractor.shouldParseEmbedded(entryMetadata)) {
            return;
        }

        final TemporaryResources tmp = new TemporaryResources();
        try (InputStream rawEntry = zipFile.getInputStream(entry)) {
            TikaInputStream embedded = TikaInputStream.get(rawEntry, tmp, entryMetadata);
            // NOTE(review): a fresh ParseContext is passed here rather than the caller's
            // context - confirm this is intended for the extractor contract.
            extractor.parseEmbedded(embedded, xhtml, entryMetadata, new ParseContext(), true);
        } finally {
            tmp.dispose();
        }
    }

    /**
     * Processes one entry read from a {@link ZipArchiveInputStream}.
     * <p>
     * If the stream reports that it cannot read the entry's data, then:
     * an encrypted entry is reported through {@code handleEncryptedEntry};
     * a STORED entry that uses a data descriptor raises an
     * {@link UnsupportedZipFeatureException} for DATA_DESCRIPTOR; any other
     * entry is recorded as an embedded stream exception and its name, if
     * non-empty, is written as a paragraph. Readable entries get their
     * metadata built, a marker {@code div} written and, if the extractor
     * accepts them, are passed to the extractor.
     *
     * @param zis            the ZIP stream positioned at the entry
     * @param entry          the entry to process
     * @param extractor      the embedded document extractor
     * @param parentMetadata metadata of the enclosing archive
     * @param xhtml          the XHTML output
     * @param context        the parse context
     * @param config         the parser configuration
     * @throws SAXException  if the handler fails
     * @throws IOException   if reading the entry fails
     * @throws TikaException if processing the entry fails
     */
    private void parseStreamEntry(ZipArchiveInputStream zis, ZipArchiveEntry entry,
                                   EmbeddedDocumentExtractor extractor, Metadata parentMetadata,
                                   XHTMLContentHandler xhtml, ParseContext context,
                                   ZipParserConfig config)
            throws SAXException, IOException, TikaException {

        final String entryName = detectEntryName(entry, parentMetadata, context, config);

        if (!zis.canReadEntryData(entry)) {
            if (entry.getGeneralPurposeBit().usesEncryption()) {
                handleEncryptedEntry(entryName, parentMetadata, xhtml);
                return;
            }

            boolean storedWithDescriptor = entry.getGeneralPurposeBit().usesDataDescriptor()
                    && entry.getMethod() == java.util.zip.ZipEntry.STORED;
            if (storedWithDescriptor) {
                throw new UnsupportedZipFeatureException(Feature.DATA_DESCRIPTOR, entry);
            }

            TikaException unreadable =
                    new TikaException("Can't read archive stream (" + entryName + ")");
            EmbeddedDocumentUtil.recordEmbeddedStreamException(unreadable, parentMetadata);
            if (!(entryName == null || entryName.isEmpty())) {
                xhtml.element("p", entryName);
            }
            return;
        }

        final Metadata entryMetadata = buildEntryMetadata(entry, entryName, context);
        writeEntryXhtml(entryName, xhtml);

        if (!extractor.shouldParseEmbedded(entryMetadata)) {
            return;
        }

        final TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream embedded = TikaInputStream.get(zis, tmp, entryMetadata);
            // NOTE(review): a fresh ParseContext is passed here rather than the caller's
            // context - confirm this is intended for the extractor contract.
            extractor.parseEmbedded(embedded, xhtml, entryMetadata, new ParseContext(), true);
        } finally {
            tmp.dispose();
        }
    }

    /**
     * Determines the name to use for an entry.
     * <ol>
     *   <li>If the configuration specifies an entry encoding, the entry's raw
     *       name bytes are decoded with that charset.</li>
     *   <li>Otherwise, if charset detection for entry names is enabled, the raw
     *       bytes are passed to the encoding detector and decoded with the
     *       charset of the first result, when there is one.</li>
     *   <li>In every other case the name already decoded by the archive reader,
     *       {@code entry.getName()}, is returned.</li>
     * </ol>
     * If the entry has no raw name bytes, the method also falls back to
     * {@code entry.getName()} instead of failing with a
     * {@link NullPointerException}. Commons Compress documents
     * {@code getRawName()} as returning {@code null} for entries that were not
     * read from an archive.
     *
     * @param entry          the entry whose name is wanted
     * @param parentMetadata metadata of the enclosing archive, passed to the detector
     * @param context        the parse context, passed to the detector
     * @param config         the parser configuration
     * @return the decoded entry name
     * @throws IOException if encoding detection fails
     */
    private String detectEntryName(ZipArchiveEntry entry, Metadata parentMetadata,
                                    ParseContext context, ZipParserConfig config) throws IOException {
        Charset configuredCharset = config.getEntryEncoding();
        if (configuredCharset == null && !config.isDetectCharsetsInEntryNames()) {
            // Neither an explicit encoding nor detection requested: keep the default decoding.
            return entry.getName();
        }

        // Fetch the raw bytes once and reuse them for detection and decoding.
        byte[] rawName = entry.getRawName();
        if (rawName == null) {
            return entry.getName();
        }

        // If user specified an encoding, decode raw bytes with that charset.
        // This avoids needing to reopen the ZipFile with a different charset.
        if (configuredCharset != null) {
            return new String(rawName, configuredCharset);
        }

        // Charset detection is enabled: try to detect and decode.
        try (TikaInputStream detectStream = TikaInputStream.get(rawName)) {
            List<EncodingResult> encResults =
                    getEncodingDetector().detect(detectStream, parentMetadata, context);
            Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getCharset();
            if (candidate != null) {
                return new String(rawName, candidate);
            }
        }

        // Detection produced nothing usable: fall back to default decoding.
        return entry.getName();
    }

    /**
     * Reports an encrypted entry without attempting to read it.
     * <p>
     * An {@link EncryptedDocumentException} naming the entry is recorded on the
     * parent metadata as an embedded stream exception, and the entry name, if
     * non-empty, is written to the output as a paragraph.
     *
     * @param name           the entry name; may be null or empty
     * @param parentMetadata metadata of the enclosing archive
     * @param xhtml          the XHTML output
     * @throws SAXException if writing the paragraph fails
     */
    private void handleEncryptedEntry(String name, Metadata parentMetadata,
                                       XHTMLContentHandler xhtml) throws SAXException {
        EncryptedDocumentException encrypted =
                new EncryptedDocumentException("stream (" + name + ") is encrypted");
        EmbeddedDocumentUtil.recordEmbeddedStreamException(encrypted, parentMetadata);

        if (name == null || name.isEmpty()) {
            return;
        }
        xhtml.element("p", name);
    }

    /**
     * Creates the metadata object describing a single archive entry.
     * <p>
     * Recorded when available: the name (backslashes replaced by forward
     * slashes) as both resource name and internal path, creation and
     * modification times, uncompressed and compressed sizes, CRC, Unix mode
     * (when non-zero) and the entry comment (when non-empty). The compression
     * method, platform and "version made by" values are always recorded.
     *
     * @param entry   the entry to describe
     * @param name    the decoded entry name; may be null or empty
     * @param context the parse context used to create the metadata instance
     * @return the populated entry metadata
     */
    private Metadata buildEntryMetadata(ZipArchiveEntry entry, String name, ParseContext context)
            throws IOException, TikaException, SAXException {
        final Metadata md = Metadata.newInstance(context);

        if (name != null && !name.isEmpty()) {
            final String normalizedName = name.replace("\\", "/");
            md.set(TikaCoreProperties.RESOURCE_NAME_KEY, normalizedName);
            md.set(TikaCoreProperties.INTERNAL_PATH, normalizedName);
        }

        // Timestamps are optional in the entry.
        final FileTime created = entry.getCreationTime();
        if (created != null) {
            md.set(TikaCoreProperties.CREATED, created.toInstant().toString());
        }
        final FileTime modified = entry.getLastModifiedTime();
        if (modified != null) {
            md.set(TikaCoreProperties.MODIFIED, modified.toInstant().toString());
        }

        // Negative sizes and CRC values are skipped rather than recorded.
        final long uncompressedSize = entry.getSize();
        if (uncompressedSize >= 0) {
            final String sizeText = String.valueOf(uncompressedSize);
            md.set(Metadata.CONTENT_LENGTH, sizeText);
            md.set(Zip.UNCOMPRESSED_SIZE, sizeText);
        }
        final long compressedSize = entry.getCompressedSize();
        if (compressedSize >= 0) {
            md.set(Zip.COMPRESSED_SIZE, String.valueOf(compressedSize));
        }

        md.set(Zip.COMPRESSION_METHOD, entry.getMethod());

        final long crc = entry.getCrc();
        if (crc >= 0) {
            md.set(Zip.CRC32, String.valueOf(crc));
        }

        final int unixMode = entry.getUnixMode();
        if (unixMode != 0) {
            md.set(Zip.UNIX_MODE, unixMode);
        }

        md.set(Zip.PLATFORM, entry.getPlatform());
        md.set(Zip.VERSION_MADE_BY, entry.getVersionMadeBy());

        final String comment = entry.getComment();
        if (comment != null && !comment.isEmpty()) {
            md.set(Zip.COMMENT, comment);
        }

        return md;
    }

    /**
     * Writes an empty {@code div} marking an embedded entry, with
     * {@code class="embedded"} and the entry name as its {@code id}.
     * Nothing is written when the name is null or empty.
     *
     * @param name  the entry name used as the element id
     * @param xhtml the XHTML output
     * @throws SAXException if writing the element fails
     */
    private void writeEntryXhtml(String name, XHTMLContentHandler xhtml) throws SAXException {
        if (name == null || name.isEmpty()) {
            return;
        }

        final org.xml.sax.helpers.AttributesImpl divAttributes =
                new org.xml.sax.helpers.AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", name);

        xhtml.startElement("div", divAttributes);
        xhtml.endElement("div");
    }

    /**
     * Streams through the ZIP's local file headers and compares the entry
     * names found there with those from the central directory.
     * <p>
     * Three kinds of discrepancy are collected, each capped at
     * {@code MAX_INTEGRITY_CHECK_ENTRIES}: names that occur more than once in
     * the stream, names present in the stream but not in the central
     * directory, and names present in the central directory but not seen in
     * the stream. The result is recorded as {@code "PASS"} when all three
     * lists are empty and {@code "FAIL"} otherwise, followed by the collected
     * names. An {@link IOException} while streaming ends the scan early; the
     * comparison then uses whatever was read up to that point.
     *
     * @param tis                     the TikaInputStream (must be rewound)
     * @param metadata                the parent metadata to record results
     * @param centralDirectoryEntries entry names from the central directory
     * @param config                  the parser configuration
     */
    private void performIntegrityCheck(TikaInputStream tis, Metadata metadata,
                                        Set<String> centralDirectoryEntries,
                                        ZipParserConfig config) throws IOException {

        final String encoding;
        if (config.getEntryEncoding() == null) {
            encoding = null;
        } else {
            encoding = config.getEntryEncoding().name();
        }

        final Set<String> namesInStream = new LinkedHashSet<>();
        final List<String> repeatedNames = new ArrayList<>();
        final List<String> missingFromCentralDirectory = new ArrayList<>();

        // NOTE(review): try-with-resources closes zis, which presumably also closes the
        // wrapped tis - confirm no caller relies on the stream afterwards.
        try (ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, encoding, true, true)) {
            for (ZipArchiveEntry local = zis.getNextZipEntry(); local != null;
                    local = zis.getNextZipEntry()) {
                final String localName = local.getName();

                // add() returns false when the name was already seen, i.e. a duplicate.
                if (!namesInStream.add(localName)
                        && repeatedNames.size() < MAX_INTEGRITY_CHECK_ENTRIES) {
                    repeatedNames.add(localName);
                }

                // Local header without a matching central directory record.
                if (!centralDirectoryEntries.contains(localName)
                        && missingFromCentralDirectory.size() < MAX_INTEGRITY_CHECK_ENTRIES) {
                    missingFromCentralDirectory.add(localName);
                }
            }
        } catch (IOException ignored) {
            // Deliberate best effort: report on whatever was read before the failure.
        }

        // Central directory records that never showed up as local headers.
        final List<String> missingFromStream = new ArrayList<>();
        for (String centralName : centralDirectoryEntries) {
            if (namesInStream.contains(centralName)) {
                continue;
            }
            if (missingFromStream.size() < MAX_INTEGRITY_CHECK_ENTRIES) {
                missingFromStream.add(centralName);
            }
        }

        final boolean anyDiscrepancy = !repeatedNames.isEmpty()
                || !missingFromCentralDirectory.isEmpty()
                || !missingFromStream.isEmpty();
        metadata.set(Zip.INTEGRITY_CHECK_RESULT, anyDiscrepancy ? "FAIL" : "PASS");

        for (String repeated : repeatedNames) {
            metadata.add(Zip.DUPLICATE_ENTRY_NAMES, repeated);
        }
        for (String localOnly : missingFromCentralDirectory) {
            metadata.add(Zip.LOCAL_HEADER_ONLY_ENTRIES, localOnly);
        }
        for (String centralOnly : missingFromStream) {
            metadata.add(Zip.CENTRAL_DIRECTORY_ONLY_ENTRIES, centralOnly);
        }
    }
}