EpubParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.epub;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ParserUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.apache.tika.zip.utils.ZipSalvager;

/**
 * Parser for EPUB ({@code application/epub+zip}) and iBooks
 * ({@code application/x-ibooks+zip}) zip containers.
 */
@TikaComponent
public class EpubParser implements Parser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 215176772484050550L;

    // Media types reported by getSupportedTypes: EPUB and iBooks zip containers.
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(MediaType.application("epub+zip"),
                    MediaType.application("x-ibooks+zip"))));

    // Zip entry whose CipherReference URIs are collected to detect encrypted items.
    private static final String META_INF_ENCRYPTION = "META-INF/encryption.xml";
    // Parser applied to the optional "metadata.xml" entry; replaceable via setMetaParser.
    private Parser meta = new DcXMLParser();

    // Parser applied to the package document (.opf) to populate metadata.
    private Parser opf = new OPFParser();
    // Parser applied to each content document; replaceable via setContentParser.
    private Parser content = new EpubContentParser();

    /**
     * Returns the parser that this class applies to the {@code metadata.xml} entry.
     *
     * @return the current metadata parser
     */
    public Parser getMetaParser() {
        return meta;
    }

    /**
     * Replaces the parser that this class applies to the {@code metadata.xml} entry.
     *
     * @param meta the parser to use from now on
     */
    public void setMetaParser(Parser meta) {
        this.meta = meta;
    }

    /**
     * Returns the parser that this class applies to each content document.
     *
     * @return the current content parser
     */
    public Parser getContentParser() {
        return content;
    }

    /**
     * Replaces the parser that this class applies to each content document.
     *
     * @param content the parser to use from now on
     */
    public void setContentParser(Parser content) {
        this.content = content;
    }

    /**
     * Returns the fixed, unmodifiable set of media types handled by this parser
     * ({@code application/epub+zip} and {@code application/x-ibooks+zip}).
     *
     * @param context the parse context; not consulted by this implementation
     * @return the supported media types
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an EPUB container and writes its content as a single XHTML document.
     *
     * <p>The XHTML document is started here and is ended here even when the
     * container parse fails with an {@link IOException}; that exception is then
     * rethrown. After a successful parse, an encryption exception is raised if the
     * container reported any encrypted items.
     *
     * @param tis      the EPUB container stream
     * @param handler  receives the XHTML SAX events
     * @param metadata populated with the document's metadata
     * @param context  the parse context
     * @throws IOException   if the container cannot be read
     * @throws SAXException  if the handler or an XML parse fails
     * @throws TikaException on other parse failures, including encrypted content
     */
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        // An EPUB is usually made up of several XHTML files, so the start and end
        // of the output document are controlled explicitly here.
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        ContentHandler bodyOnly = new BodyContentHandler(xhtml);
        ContentHandler normalizing = new EpubNormalizingHandler(bodyOnly);
        ContentHandler childHandler = new EmbeddedContentHandler(normalizing);

        Set<String> encryptedItems;
        try {
            encryptedItems = bufferedParse(tis, childHandler, xhtml, metadata, context);
        } catch (IOException e) {
            // Close the output document before surfacing the I/O failure.
            xhtml.endDocument();
            throw e;
        }
        // Finish everything
        xhtml.endDocument();
        maybeThrowEncryptedException(encryptedItems);
    }

    /**
     * Reads the whole stream as UTF-8 text, trims it, and stores the result as the
     * content type in {@code metadata}.
     *
     * @param is       stream holding the declared mime type
     * @param metadata metadata whose content type is overwritten
     * @throws IOException if the stream cannot be read
     */
    private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
        String raw = IOUtils.toString(is, UTF_8);
        //often has trailing new lines
        String type = (raw == null) ? null : raw.trim();
        metadata.set(Metadata.CONTENT_TYPE, type);
    }

    /**
     * Parses the container through random-access zip reading.
     *
     * <p>If the stream already carries an open {@link ZipFile} container, that
     * instance is used and left open. Otherwise a {@link ZipFile} is opened from
     * the stream's path and closed afterwards. If opening fails, the failure is
     * recorded in {@code metadata} and a salvage attempt is made instead.
     *
     * @return the encrypted item paths reported by the container
     */
    private Set<String> bufferedParse(TikaInputStream tis, ContentHandler bodyHandler,
                               XHTMLContentHandler xhtml, Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {
        Object openContainer = tis.getOpenContainer();
        if (openContainer instanceof ZipFile) {
            return bufferedParseZipFile((ZipFile) openContainer, bodyHandler, xhtml, metadata,
                    context, true);
        }

        final ZipFile opened;
        try {
            opened = ZipFile.builder().setFile(tis.getPath().toFile()).get();
        } catch (IOException openFailure) {
            ParserUtils.recordParserFailure(this, openFailure, metadata);
            return trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
        }

        try {
            return bufferedParseZipFile(opened, bodyHandler, xhtml, metadata, context, true);
        } finally {
            opened.close();
        }
    }

    /**
     * Attempts to recover content from a zip that could not be opened normally.
     *
     * <p>The broken zip is copied into a salvaged temporary file, which is then
     * parsed non-strictly as a zip file. If that still fails with an
     * {@link EpubZipException}, the salvaged file is parsed entry-by-entry as a
     * stream. The temporary file is released when this method returns.
     *
     * @param brokenZip   path of the zip that failed to open
     * @param bodyHandler handler that receives the body of each content document
     * @param xhtml       the output document, used for embedded-resource markup
     * @param metadata    metadata to populate
     * @param context     the parse context
     * @return the encrypted item paths reported by the container
     */
    private Set<String> trySalvage(Path brokenZip, ContentHandler bodyHandler,
                               XHTMLContentHandler xhtml,
                            Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {
        try (TemporaryResources resources = new TemporaryResources()) {
            Path salvaged =
                    resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString()));
            ZipSalvager.salvageCopy(brokenZip, salvaged);
            try (ZipFile zipFile = ZipFile.builder().setFile(salvaged.toFile()).get()) {
                return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
            } catch (EpubZipException e) {
                try (TikaInputStream tis = TikaInputStream.get(salvaged)) {
                    // Content documents must go through the body handler, exactly as in
                    // the zip-file path. Handing them the raw XHTML handler would forward
                    // each child document's own document-level events into the output.
                    return streamingParse(tis, bodyHandler, metadata, context);
                }
            }
        }
    }

    /**
     * Parses an EPUB from a random-access zip file.
     *
     * <p>Steps, in order: locate the package document (OPF), parse it for metadata,
     * scrape its manifest and spine, optionally verify that every spine item is a
     * readable zip entry, extract container-level metadata, collect encrypted item
     * paths, parse the spine's content documents in spine order, and finally hand
     * the remaining manifest items to the embedded document extractor.
     *
     * @param zipFile     the open zip container; not closed by this method
     * @param bodyHandler handler that receives each content document
     * @param xhtml       the output document, used for embedded-resource markup
     * @param metadata    metadata to populate
     * @param context     the parse context
     * @param isStrict    when true, every spine item must resolve to a readable entry
     * @return the encrypted item paths listed in {@code META-INF/encryption.xml}
     * @throws EpubZipException if the package document is missing or unreadable, the
     *                          spine is empty, or the strict check fails
     * @throws SAXException     one of the exceptions collected while parsing content
     *                          documents, rethrown after embedded files are handled;
     *                          a write-limit exception is rethrown immediately
     */
    private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler,
                                         XHTMLContentHandler xhtml, Metadata metadata,
                                         ParseContext context, boolean isStrict)
            throws IOException, TikaException, SAXException, EpubZipException {

        String rootOPF = getRoot(zipFile, context);
        if (rootOPF == null) {
            throw new EpubZipException();
        }
        ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
        if (zae == null || !zipFile.canReadEntryData(zae)) {
            throw new EpubZipException();
        }
        // first pass over the package document: metadata only
        try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) {
            opf.parse(tis, new DefaultHandler(), metadata, context);
        }

        // second pass over the package document: manifest (id -> href/media) and spine order
        ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
        try (InputStream is = zipFile.getInputStream(zae)) {
            XMLReaderUtils.parseSAX(is, contentOrderScraper, context);
        }
        //if the spine lists no content items, give up on the zip-file parse
        if (contentOrderScraper.contentItems.isEmpty()) {
            throw new EpubZipException();
        }
        // manifest hrefs are resolved against the directory holding the package document
        String relativePath = "";
        if (rootOPF.lastIndexOf("/") > -1) {
            relativePath = rootOPF.substring(0, rootOPF.lastIndexOf("/") + 1);
        }

        if (isStrict) {
            int found = 0;
            for (String id : contentOrderScraper.contentItems) {
                HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
                if (hRefMediaPair != null && hRefMediaPair.href != null) {
                    zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
                    if (zae != null && zipFile.canReadEntryData(zae)) {
                        found++;
                    }
                }
            }
            //if there is not a perfect match between spine items and readable
            //entries, give up on the zip-file parse
            if (found != contentOrderScraper.contentItems.size()) {
                throw new EpubZipException();
            }
        }

        extractMetadata(zipFile, metadata, context);
        Set<String> encryptedItems = checkForDRM(zipFile);
        // ids of spine items that were handed to the content parser
        Set<String> processed = new HashSet<>();
        // non-fatal SAX failures from content documents; surfaced at the end
        Set<SAXException> saxExceptions = new HashSet<>();
        for (String id : contentOrderScraper.contentItems) {
            HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
            if (hRefMediaPair != null && hRefMediaPair.href != null) {
                //we need to test for xhtml/xml because the content parser
                //expects that.
                boolean shouldParse = false;
                String href = hRefMediaPair.href.toLowerCase(Locale.US);
                // a declared media type takes precedence; the file extension is only
                // consulted when no media type is declared
                if (hRefMediaPair.media != null) {
                    String mediaType = hRefMediaPair.media.toLowerCase(Locale.US);
                    if (mediaType.contains("html")) {
                        shouldParse = true;
                    }
                } else if (href.endsWith("htm") || href.endsWith("html") || href.endsWith(".xml")) {
                    shouldParse = true;
                }
                if (shouldParse) {
                    String path = relativePath + hRefMediaPair.href;
                    //if content is encrypted, do not parse it, throw an exception now
                    if (encryptedItems.contains(path)) {
                        maybeThrowEncryptedException(encryptedItems);
                    }
                    zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
                    if (zae != null) {
                        try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) {
                            content.parse(tis, bodyHandler, metadata, context);
                        } catch (SAXException e) {
                            // a write limit must stop the parse right away
                            if (WriteLimitReachedException.isWriteLimitReached(e)) {
                                throw e;
                            }
                            saxExceptions.add(e);
                        } finally {
                            // mark as processed even on failure so the item is not
                            // retried as an embedded file below
                            processed.add(id);
                        }
                    }
                }
            }
        }

        //now handle embedded files
        EmbeddedDocumentExtractor embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        for (String id : contentOrderScraper.locationMap.keySet()) {
            if (!processed.contains(id)) {
                HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
                String fullPath = relativePath + hRefMediaPair.href;
                // encrypted resources are skipped silently here; the returned set
                // lets the caller report them
                if (encryptedItems.contains(fullPath)) {
                    continue;
                }
                if (shouldHandleEmbedded(hRefMediaPair.media)) {
                    handleEmbedded(zipFile, relativePath, hRefMediaPair, embeddedDocumentExtractor,
                            xhtml, metadata, context);
                }
            }
        }
        //throw SAXException if any from the parse of the body contents
        //(the set is unordered, so which one is thrown is not defined)
        for (SAXException e : saxExceptions) {
            throw e;
        }
        return encryptedItems;
    }

    /**
     * Collects the encrypted item URIs listed in {@code META-INF/encryption.xml}.
     *
     * @param zipFile the open zip container
     * @return the listed URIs, or an empty set when the entry is absent
     */
    private Set<String> checkForDRM(ZipFile zipFile) throws IOException, TikaException,
            SAXException {
        ZipArchiveEntry encryptionEntry = zipFile.getEntry(META_INF_ENCRYPTION);
        if (encryptionEntry != null) {
            try (InputStream is = zipFile.getInputStream(encryptionEntry)) {
                return EncryptionHandler.parse(is, new ParseContext());
            }
        }
        return Collections.emptySet();
    }

    /**
     * Parses an encryption descriptor from {@code is} and throws an encryption
     * exception if it lists any encrypted item.
     *
     * @param is           stream holding the encryption descriptor XML
     * @param parseContext the parse context used for the XML parse
     */
    private void checkForDRM(InputStream is, ParseContext parseContext)
            throws IOException, TikaException, SAXException {
        maybeThrowEncryptedException(EncryptionHandler.parse(is, parseContext));
    }

    /**
     * Throws an {@link EncryptedDocumentException} naming the encrypted items, or
     * returns normally when the set is empty.
     *
     * <p>Items are joined with {@code ", "}. Once the message has grown past 500
     * characters the remaining items are replaced by {@code " and others..."}.
     *
     * @param encryptedItems the encrypted item paths
     * @throws EncryptedDocumentException if {@code encryptedItems} is not empty
     */
    private void maybeThrowEncryptedException(Set<String> encryptedItems)
            throws EncryptedDocumentException {
        if (encryptedItems.isEmpty()) {
            return;
        }
        StringBuilder message = new StringBuilder("EPUB contains encrypted items: ");
        boolean first = true;
        for (String item : encryptedItems) {
            if (message.length() > 500) {
                message.append(" and others...");
                break;
            }
            if (!first) {
                message.append(", ");
            }
            first = false;
            message.append(item);
        }
        throw new EncryptedDocumentException(message.toString());
    }

    /**
     * Decides from a manifest item's media type whether it should be passed to the
     * embedded document extractor.
     *
     * @param media the declared media type; may be null
     * @return true for a null media type, and false for media types that contain
     *         "css", "svg" or "x-ibooks", end with "/xml", or equal
     *         "application/x-dtbncx+xml" (compared in lower case); true otherwise
     */
    private boolean shouldHandleEmbedded(String media) {
        if (media == null) {
            return true;
        }
        String lowered = media.toLowerCase(Locale.US);
        boolean excluded = lowered.contains("css")
                || lowered.contains("svg")
                || lowered.endsWith("/xml")
                || lowered.contains("x-ibooks")
                || lowered.equals("application/x-dtbncx+xml");
        return !excluded;
    }

    /**
     * Passes one manifest resource to the embedded document extractor, wrapped in
     * a {@code <div class="embedded">} element of the output document.
     *
     * <p>Nothing is emitted when the item has no href, when its zip entry is
     * missing or unreadable, or when the extractor declines it. A failure to open
     * the entry's stream is recorded in the parent metadata rather than thrown.
     *
     * @param zipFile                   the open zip container
     * @param relativePath              directory prefix of the package document
     * @param hRefMediaPair             the manifest item; its media type may be null
     * @param embeddedDocumentExtractor the extractor to delegate to
     * @param xhtml                     the output document
     * @param parentMetadata            metadata of the containing EPUB
     * @param context                   the parse context
     */
    private void handleEmbedded(ZipFile zipFile, String relativePath, HRefMediaPair hRefMediaPair,
                                EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                XHTMLContentHandler xhtml, Metadata parentMetadata,
                                ParseContext context)
            throws IOException, SAXException, TikaException {
        if (hRefMediaPair.href == null) {
            return;
        }
        String fullPath = relativePath + hRefMediaPair.href;

        ZipArchiveEntry ze = zipFile.getEntry(fullPath);
        if (ze == null || !zipFile.canReadEntryData(ze)) {
            return;
        }
        Metadata embeddedMetadata = Metadata.newInstance(context);
        if (!StringUtils.isBlank(hRefMediaPair.media)) {
            embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media);
        }
        embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullPath);
        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }

        TikaInputStream tis = null;
        try {
            tis = TikaInputStream.get(zipFile.getInputStream(ze));
        } catch (IOException e) {
            //store this exception in the parent's metadata
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            return;
        }

        xhtml.startElement("div", "class", "embedded");
        try {
            // Fonts are extracted but produce no markup. The media type is optional
            // in the manifest, so it must be null-checked before it is inspected.
            String media = hRefMediaPair.media;
            boolean isFont = (media != null && media.contains("font"))
                    || hRefMediaPair.href.startsWith("fonts");
            boolean outputHtml = !isFont;
            embeddedDocumentExtractor
                    .parseEmbedded(tis, new EmbeddedContentHandler(xhtml), embeddedMetadata, context, outputHtml);

        } finally {
            IOUtils.closeQuietly(tis);
        }
        xhtml.endElement("div");
    }

    /**
     * Reads container-level metadata from two optional top-level entries: the
     * {@code mimetype} entry sets the content type, and {@code metadata.xml} is
     * run through the metadata parser. Missing or unreadable entries are skipped.
     *
     * @param zipFile  the open zip container
     * @param metadata metadata to populate
     * @param context  the parse context
     */
    private void extractMetadata(ZipFile zipFile, Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {
        ZipArchiveEntry mimetypeEntry = zipFile.getEntry("mimetype");
        boolean mimetypeReadable =
                mimetypeEntry != null && zipFile.canReadEntryData(mimetypeEntry);
        if (mimetypeReadable) {
            try (InputStream is = zipFile.getInputStream(mimetypeEntry)) {
                updateMimeType(is, metadata);
            }
        }

        ZipArchiveEntry metadataEntry = zipFile.getEntry("metadata.xml");
        boolean metadataReadable =
                metadataEntry != null && zipFile.canReadEntryData(metadataEntry);
        if (metadataReadable) {
            try (TikaInputStream tis =
                         TikaInputStream.get(zipFile.getInputStream(metadataEntry))) {
                meta.parse(tis, new DefaultHandler(), metadata, context);
            }
        }
    }

    /**
     * Finds the path of the package document (OPF) inside the zip.
     *
     * <p>When {@code META-INF/container.xml} exists, the path it declares is
     * returned, which may be null. Otherwise the first readable entry whose name
     * ends with {@code .opf} (case-insensitively) is returned.
     *
     * @param zipFile the open zip container
     * @param context the parse context used for the XML parse
     * @return the package document's entry name, or null if none was found
     */
    private String getRoot(ZipFile zipFile, ParseContext context)
            throws IOException, TikaException, SAXException {
        ZipArchiveEntry container = zipFile.getEntry("META-INF/container.xml");
        if (container == null) {
            // no container descriptor: fall back to scanning the entry names
            for (ZipArchiveEntry candidate : Collections.list(zipFile.getEntries())) {
                String lowerName = candidate.getName().toLowerCase(Locale.US);
                if (lowerName.endsWith(".opf") && zipFile.canReadEntryData(candidate)) {
                    return candidate.getName();
                }
            }
            return null;
        }

        RootFinder rootFinder = new RootFinder();
        try (InputStream is = zipFile.getInputStream(container)) {
            XMLReaderUtils.parseSAX(is, rootFinder, context);
        }
        return rootFinder.root;
    }

    /**
     * Parses the container by reading zip entries sequentially, in the order they
     * appear in the stream. Should only be used as a last resort on a truncated zip.
     *
     * <p>Entries are dispatched by name: {@code mimetype} sets the content type,
     * the encryption descriptor is checked for encrypted items, {@code metadata.xml}
     * and {@code .opf} entries feed the metadata, and {@code .htm}, {@code .html},
     * {@code .xhtml} and {@code .xml} entries are parsed as content.
     *
     * @param stream      the zip stream; not closed by this method
     * @param bodyHandler handler that receives each content document
     * @param metadata    metadata to populate
     * @param context     the parse context
     * @return always an empty set, because an encryption exception is thrown as
     *         soon as the encryption descriptor lists anything
     * @throws SAXException the first non-fatal SAX failure from a content document,
     *                      rethrown after all entries are read; a write-limit
     *                      exception is rethrown immediately
     */
    private Set<String> streamingParse(InputStream stream, ContentHandler bodyHandler,
                                       Metadata metadata,
                                       ParseContext context)
            throws IOException, TikaException, SAXException {
        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false);

        ZipArchiveEntry entry = zip.getNextEntry();
        SAXException sax = null;
        while (entry != null) {
            String entryName = entry.getName();
            if (entryName.equals("mimetype")) {
                updateMimeType(zip, metadata);
            } else if (entryName.equals(META_INF_ENCRYPTION)) {
                //when streaming, throw an encryption exception if anything is encrypted.
                //The zip stream is shielded like in the other XML branches, so a parser
                //that closes its input cannot end the iteration over later entries.
                try (InputStream shielded = CloseShieldInputStream.wrap(zip)) {
                    checkForDRM(shielded, context);
                }
            } else if (entryName.equals("metadata.xml")) {
                try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) {
                    meta.parse(tisZip, new DefaultHandler(), metadata, context);
                }
            } else if (entryName.endsWith(".opf")) {
                try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) {
                    opf.parse(tisZip, new DefaultHandler(), metadata, context);
                }
            } else if (entryName.endsWith(".htm") || entryName.endsWith(".html") ||
                    entryName.endsWith(".xhtml") || entryName.endsWith(".xml")) {
                try {
                    try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) {
                        content.parse(tisZip, bodyHandler, metadata, context);
                    }
                } catch (SAXException e) {
                    if (WriteLimitReachedException.isWriteLimitReached(e)) {
                        throw e;
                    }
                    // remember only the first failure and keep reading entries
                    if (sax == null) {
                        sax = e;
                    }
                }
            }
            entry = zip.getNextEntry();
        }
        if (sax != null) {
            throw sax;
        }
        //always empty -- we throw an encryption exception
        //as soon as checkForDRM hits an encrypted item
        return Collections.emptySet();
    }


    /**
     * SAX handler that extracts the package document path from
     * {@code META-INF/container.xml}.
     *
     * <p>The first {@code rootfile} whose media type is
     * {@code application/oebps-package+xml} wins. If no rootfile declares that
     * media type, the first rootfile that has a {@code full-path} is used. A
     * rootfile without a {@code full-path} never replaces a path already found.
     */
    private static class RootFinder extends DefaultHandler {
        private static final String PACKAGE_MEDIA_TYPE = "application/oebps-package+xml";

        // path of the chosen package document; null until a usable rootfile is seen
        String root = null;
        // true once a rootfile with the package media type has been chosen
        private boolean packageFound = false;

        @Override
        public void startElement(String uri, String localName, String name, Attributes atts)
                throws SAXException {
            if (packageFound || !"rootfile".equalsIgnoreCase(localName)) {
                return;
            }
            String fullPath = XMLReaderUtils.getAttrValue("full-path", atts);
            if (fullPath == null) {
                return;
            }
            String mediaType = XMLReaderUtils.getAttrValue("media-type", atts);
            if (mediaType != null && PACKAGE_MEDIA_TYPE.equalsIgnoreCase(mediaType.trim())) {
                root = fullPath;
                packageFound = true;
            } else if (root == null) {
                // provisional choice; a later rootfile of the package type overrides it
                root = fullPath;
            }
        }
    }

    /**
     * SAX handler that scrapes a package document (OPF) for its manifest and spine.
     *
     * <p>{@code locationMap} maps each manifest item id to its decoded href and
     * media type. {@code contentItems} lists the spine's {@code idref} values in
     * document order.
     */
    private static class ContentOrderScraper extends DefaultHandler {

        Map<String, HRefMediaPair> locationMap = new HashMap<>();
        List<String> contentItems = new ArrayList<>();
        boolean inManifest = false;
        boolean inSpine = false;

        @Override
        public void startElement(String uri, String localName, String name, Attributes atts)
                throws SAXException {
            if ("manifest".equalsIgnoreCase(localName)) {
                inManifest = true;
            } else if ("spine".equalsIgnoreCase(localName)) {
                inSpine = true;
            }
            if (inManifest) {
                if ("item".equalsIgnoreCase(localName)) {
                    String id = XMLReaderUtils.getAttrValue("id", atts);
                    String href = XMLReaderUtils.getAttrValue("href", atts);
                    String mime = XMLReaderUtils.getAttrValue("media-type", atts);
                    if (id != null && href != null) {
                        locationMap.put(id, new HRefMediaPair(decodeHref(href), mime));
                    }
                }
            }
            if (inSpine) {
                if ("itemRef".equalsIgnoreCase(localName)) {
                    String id = XMLReaderUtils.getAttrValue("idref", atts);
                    if (id != null) {
                        contentItems.add(id);
                    }
                }
            }
        }

        /**
         * Percent-decodes a manifest href as UTF-8.
         *
         * <p>{@link URLDecoder} implements form decoding, which turns {@code +}
         * into a space. In a path a plus sign is literal, so it is escaped before
         * decoding. If the href contains a malformed escape sequence, the href is
         * returned undecoded instead of aborting the parse.
         */
        private static String decodeHref(String href) {
            try {
                return URLDecoder.decode(href.replace("+", "%2B"), UTF_8.name());
            } catch (UnsupportedEncodingException | IllegalArgumentException e) {
                // UTF-8 is always supported; a bad escape leaves the raw href in place
                return href;
            }
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if ("manifest".equalsIgnoreCase(localName)) {
                inManifest = false;
            } else if ("spine".equalsIgnoreCase(localName)) {
                inSpine = false;
            }
        }
    }

    /**
     * Immutable pair of a manifest item's href and its declared media type.
     * Either value may be null.
     */
    private static class HRefMediaPair {
        private final String href;
        private final String media;

        HRefMediaPair(String href, String media) {
            this.href = href;
            this.media = media;
        }

        @Override
        public String toString() {
            return String.format("HRefMediaPair{href='%s', media='%s'}", href, media);
        }
    }


    /**
     * SAX handler that collects the {@code URI} attribute of every
     * {@code CipherReference} element in an encryption descriptor.
     */
    private static class EncryptionHandler extends DefaultHandler {
        Set<String> encryptedItems = new HashSet<>();

        /**
         * Parses {@code is} with a fresh handler and returns the URIs it collected.
         */
        private static Set<String> parse(InputStream is, ParseContext parseContext)
                throws TikaException, IOException, SAXException {
            EncryptionHandler collector = new EncryptionHandler();
            XMLReaderUtils.parseSAX(is, collector, parseContext);
            return collector.getEncryptedItems();
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if (!"CipherReference".equals(localName)) {
                return;
            }
            encryptedItems.add(XMLReaderUtils.getAttrValue("URI", attributes));
        }

        public Set<String> getEncryptedItems() {
            return encryptedItems;
        }
    }

    /**
     * Signals any problem with parsing an epub file when it is read as a zip
     * file. It is thrown when the package document is missing or unreadable, when
     * the spine is empty, or when the strict spine check fails. Because it extends
     * {@link IOException}, callers that do not catch it specifically see it as an
     * ordinary I/O failure.
     */
    private static class EpubZipException extends IOException {

    }

    /**
     * Decorator that forwards every element with its local name as the qualified
     * name, to avoid namespace conflicts in the content handler. When any
     * attribute has a qualified name that differs from its local name, all
     * attributes of that element are rewritten with an empty namespace URI and
     * their local names.
     */
    private static class EpubNormalizingHandler extends ContentHandlerDecorator {
        public EpubNormalizingHandler(ContentHandler contentHandler) {
            super(contentHandler);
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes atts)
                throws SAXException {
            //some atts may have namespaces that were not included in the header
            Attributes forwarded = hasQualifiedAttribute(atts) ? toLocalNames(atts) : atts;
            super.startElement(uri, localName, localName, forwarded);
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            super.endElement(uri, localName, localName);
        }

        /** Reports whether any attribute's qualified name differs from its local name. */
        private static boolean hasQualifiedAttribute(Attributes atts) {
            for (int i = 0; i < atts.getLength(); i++) {
                String qualified = atts.getQName(i);
                if (qualified != null && !qualified.equals(atts.getLocalName(i))) {
                    return true;
                }
            }
            return false;
        }

        /** Copies the attributes, keeping type and value but dropping namespaces. */
        private static Attributes toLocalNames(Attributes atts) {
            AttributesImpl simplified = new AttributesImpl();
            for (int i = 0; i < atts.getLength(); i++) {
                String local = atts.getLocalName(i);
                simplified.addAttribute("", local, local, atts.getType(i), atts.getValue(i));
            }
            return simplified;
        }
    }
}