XPSExtractorDecorator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml.xps;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Enumeration;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.io.IOUtils;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.ZipPackage;
import org.apache.poi.openxml4j.util.ZipEntrySource;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;

public class XPSExtractorDecorator extends AbstractOOXMLExtractor {

    private static final String OPEN_XPS_DOCUMENT =
            "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
    private static String XPS_DOCUMENT =
            "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
    private final ParseContext context;
    private final ZipPackage pkg;
    Map<String, Metadata> embeddedImages = new LinkedHashMap<>();

    public XPSExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor)
            throws TikaException {
        super(context, extractor);
        this.context = context;
        if (extractor.getPackage() instanceof ZipPackage) {
            this.pkg = (ZipPackage) extractor.getPackage();
        } else {
            throw new TikaException("OPCPackage must be a ZipPackage");
        }
    }

    private static TikaInputStream getZipStream(String zipPath, ZipPackage zipPackage)
            throws IOException, TikaException {
        String targPath =
                (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath);
        ZipEntrySource zipEntrySource = zipPackage.getZipArchive();
        Enumeration<? extends ZipArchiveEntry> zipEntryEnumeration = zipEntrySource.getEntries();
        ZipArchiveEntry zipEntry = null;
        while (zipEntryEnumeration.hasMoreElements()) {
            ZipArchiveEntry ze = zipEntryEnumeration.nextElement();
            if (ze.getName().equals(targPath)) {
                zipEntry = ze;
                break;
            }
        }
        if (zipEntry == null) {
            throw new TikaException("Couldn't find required zip entry: " + zipPath);
        }
        return TikaInputStream.get(zipEntrySource.getInputStream(zipEntry));
    }

    @Override
    public POIXMLDocument getDocument() {
        return null;
    }

    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {

        PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT);
        if (prc.size() == 0) {
            prc = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
        }
        for (int i = 0; i < prc.size(); i++) {
            PackageRelationship pr = prc.getRelationship(i);

            //there should only be one.
            //in the test file, this points to FixedDocSeq.fdseq
            try {
                handleDocuments(pr, xhtml);
            } catch (TikaException e) {
                throw new SAXException(e);
            }
        }

        //now handle embedded images
        if (embeddedImages.size() > 0) {
            EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
            for (Map.Entry<String, Metadata> embeddedImage : embeddedImages.entrySet()) {
                String zipPath = embeddedImage.getKey();
                Metadata metadata = embeddedImage.getValue();
                if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
                    handleEmbeddedImage(zipPath, metadata, embeddedDocumentUtil, xhtml);
                }
            }
        }

    }

    private void handleEmbeddedImage(String zipPath, Metadata metadata,
                                     EmbeddedDocumentUtil embeddedDocumentUtil,
                                     XHTMLContentHandler xhtml) throws SAXException, IOException {
        TikaInputStream tis = null;
        try {
            tis = getZipStream(zipPath, pkg);
        } catch (IOException | TikaException e) {
            //store this exception in the parent's metadata
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            return;
        }

        try {
            embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
        } finally {
            IOUtils.closeQuietly(tis);
        }
    }

    private void handleDocuments(PackageRelationship packageRelationship, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        try (InputStream stream = pkg.getPart(packageRelationship).getInputStream()) {
            XMLReaderUtils.parseSAX(stream,
                    new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml)), context);
        }
    }

    @Override
    protected List<PackagePart> getMainDocumentParts() throws TikaException {
        return Collections.EMPTY_LIST;
    }

    private class FixedDocSeqHandler extends DefaultHandler {
        private final static String DOCUMENT_REFERENCE = "DocumentReference";
        private final static String SOURCE = "Source";

        private final XHTMLContentHandler xhtml;

        private FixedDocSeqHandler(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            if (!DOCUMENT_REFERENCE.equals(localName)) {
                return;
            }
            for (int i = 0; i < atts.getLength(); i++) {
                String lName = atts.getLocalName(i);
                if (SOURCE.equals(lName)) {
                    handleDocumentRef(atts.getValue(i));
                }
            }
        }

        private void handleDocumentRef(String docRef) throws SAXException {
            //docRef is a path to a FixedDocumentSequence document,
            // e.g. /Documents/1/FixedDoc.fdoc

            //relative root is /Documents/1 ..need this Pages...
            String relativeRoot = null;
            int i = docRef.lastIndexOf("/");
            if (i > 0) {
                relativeRoot = docRef.substring(0, i);
            } else {
                relativeRoot = "";
            }
            String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef);
            if (pkg instanceof ZipPackage) {
                try (InputStream stream = getZipStream(zipPath, pkg)) {
                    XMLReaderUtils.parseSAX(stream,
                            new EmbeddedContentHandler(
                                    new PageContentPartHandler(relativeRoot, xhtml)), context);

                } catch (IOException | TikaException e) {
                    throw new SAXException(
                            new TikaException("IOException trying to read: " + docRef));
                }
            } else {
                throw new SAXException(new TikaException("Package must be ZipPackage"));
            }
        }

        private class PageContentPartHandler extends DefaultHandler {
            private static final String PAGE_CONTENT = "PageContent";
            private static final String SOURCE = "Source";

            private final String relativeRoot;
            private final XHTMLContentHandler xhtml;

            private PageContentPartHandler(String relativeRoot, XHTMLContentHandler xhtml) {
                this.relativeRoot = relativeRoot;
                this.xhtml = xhtml;
            }

            @Override
            public void startElement(String uri, String localName, String qName, Attributes atts)
                    throws SAXException {
                if (!PAGE_CONTENT.equals(localName)) {
                    return;
                }
                String pagePath = null;
                for (int i = 0; i < atts.getLength(); i++) {
                    if (SOURCE.equals(atts.getLocalName(i))) {
                        pagePath = atts.getValue(i);
                        break;
                    }
                }

                if (pagePath != null) {
                    if (!pagePath.startsWith("/")) {
                        pagePath = relativeRoot + "/" + pagePath;
                    }
                    //trim initial /
                    if (pagePath.startsWith("/")) {
                        pagePath = pagePath.substring(1);
                    }
                    try (InputStream stream = getZipStream(pagePath, pkg)) {
                        XMLReaderUtils.parseSAX(stream,
                                        new XPSPageContentHandler(xhtml, embeddedImages, context), context);
                    } catch (TikaException | IOException e) {
                        throw new SAXException(e);
                    }
                }

            }
        }
    }
}