OPFParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.epub;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.metadata.Epub;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Parser for EPUB {@code .opf} package documents.
 * <p>
 * On top of the content handler supplied by {@link DcXMLParser}, this parser
 * attaches a second handler that records two EPUB-specific metadata values:
 * <ul>
 *   <li>{@link Epub#VERSION} -- taken from the {@code version} attribute of the
 *       {@code package} element, when it is not blank;</li>
 *   <li>{@link Epub#RENDITION_LAYOUT} -- defaults to {@code "reflowable"} and is
 *       switched to {@code "pre-paginated"} if either a
 *       {@code <meta property="rendition:layout">} element has that text, or any
 *       {@code itemref} carries the {@code rendition:layout-pre-paginated}
 *       property.</li>
 * </ul>
 */
public class OPFParser extends DcXMLParser {

    /**
     * Wraps the parent's content handler in a {@link TeeContentHandler} so that
     * the same SAX events are also delivered to an {@link OPFHandler}.
     *
     * @param handler  the downstream content handler
     * @param metadata the metadata object to populate
     * @param context  the parse context
     * @return a handler that feeds both the parent's handler and the OPF handler
     */
    @Override
    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
                                               ParseContext context) {
        //set default.  This will be overwritten if it is pre-paginated
        metadata.set(Epub.RENDITION_LAYOUT, OPFHandler.REFLOWABLE);
        return new TeeContentHandler(super.getContentHandler(handler, metadata, context),
                new OPFHandler(metadata));
    }

    /**
     * SAX handler that extracts the EPUB version and rendition layout from the
     * package document and writes them to the supplied {@link Metadata}.
     */
    private static class OPFHandler extends DefaultHandler {
        private static final String REFLOWABLE = "reflowable";
        private static final String PRE_PAGINATED = "pre-paginated";
        private static final String RENDITION_LAYOUT_PROPERTY = "rendition:layout";
        private static final String ITEMREF_PRE_PAGINATED = "rendition:layout-pre-paginated";

        private final Metadata metadata;
        //text accumulated inside the current <meta property="rendition:layout"> element
        private final StringBuilder sb = new StringBuilder();
        //true only between the start and end of a rendition:layout meta element
        private boolean inRenditionLayout = false;

        public OPFHandler(Metadata metadata) {
            this.metadata = metadata;
        }

        /**
         * Inspects {@code itemref}, {@code meta} and {@code package} elements
         * for layout and version information.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            //check each item on spine for pre-paginated
            if ("itemref".equals(localName)) {
                String val = XMLReaderUtils.getAttrValue("properties", attributes);
                if (val != null && val.contains(ITEMREF_PRE_PAGINATED)) {
                    metadata.set(Epub.RENDITION_LAYOUT, PRE_PAGINATED);
                }
            } else if ("meta".equals(localName)) {
                String prop = XMLReaderUtils.getAttrValue("property", attributes);
                if (RENDITION_LAYOUT_PROPERTY.equals(prop)) {
                    //start from a clean buffer for this element's text
                    sb.setLength(0);
                    inRenditionLayout = true;
                }
            } else if ("package".equals(localName)) {
                String v = XMLReaderUtils.getAttrValue("version", attributes);
                if (!StringUtils.isBlank(v)) {
                    metadata.set(Epub.VERSION, v);
                }
            }
        }

        /**
         * On the close of a {@code rendition:layout} meta element, compares its
         * text content against {@code "pre-paginated"}.
         */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (inRenditionLayout && "meta".equals(localName)) {
                //characters() also delivers surrounding whitespace/newlines from
                //pretty-printed documents; strip it before comparing
                String layout = sb.toString().trim();
                if (PRE_PAGINATED.equals(layout)) {
                    metadata.set(Epub.RENDITION_LAYOUT, PRE_PAGINATED);
                }
                inRenditionLayout = false;
                sb.setLength(0);
            }
        }

        /**
         * Buffers character data while inside a {@code rendition:layout} meta
         * element; the parser may deliver the text in several chunks.
         */
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (inRenditionLayout) {
                sb.append(ch, start, length);
            }
        }
    }
}