FlatOpenDocumentParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * Parser for flat OpenDocument files.
 * <p>
 * The whole input stream is read as a single XML document with SAX. Events
 * belonging to the {@code meta}, {@code body} and (when macro extraction is
 * enabled) {@code scripts} sections are routed to dedicated handlers; all
 * other events are discarded. If the root {@code document} element carries a
 * recognised {@code mimetype} attribute, {@link Metadata#CONTENT_TYPE} is
 * updated to the matching flat text, presentation or spreadsheet type after
 * the parse completes.
 */
@TikaComponent
public class FlatOpenDocumentParser implements Parser {

    /**
     * Configuration class for JSON deserialization.
     */
    public static class Config {
        /** Whether the {@code scripts} section is handed to the macro handler. */
        public boolean extractMacros = false;
    }

    /** Generic flat OpenDocument type; one of the supported types. */
    static final MediaType FLAT_OD =
            MediaType.application("vnd.oasis.opendocument.tika.flat.document");
    /** Flat text document type, reported when the root mimetype equals {@link #ODT}. */
    static final MediaType FLAT_ODT = MediaType.application("vnd.oasis.opendocument.flat.text");
    /** Flat presentation type, reported when the root mimetype equals {@link #ODP}. */
    static final MediaType FLAT_ODP =
            MediaType.application("vnd.oasis.opendocument.flat.presentation");
    /** Flat spreadsheet type, reported when the root mimetype equals {@link #ODS}. */
    static final MediaType FLAT_ODS =
            MediaType.application("vnd.oasis.opendocument.flat.spreadsheet");
    /** Value of the root {@code mimetype} attribute that maps to {@link #FLAT_ODT}. */
    static final MediaType ODT = MediaType.application("vnd.oasis.opendocument.text");
    /** Value of the root {@code mimetype} attribute that maps to {@link #FLAT_ODP}. */
    static final MediaType ODP = MediaType.application("vnd.oasis.opendocument.presentation");
    /** Value of the root {@code mimetype} attribute that maps to {@link #FLAT_ODS}. */
    static final MediaType ODS = MediaType.application("vnd.oasis.opendocument.spreadsheet");

    private static final long serialVersionUID = -8739250869531737584L;

    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .unmodifiableSet(new HashSet<>(Arrays.asList(FLAT_OD, FLAT_ODT, FLAT_ODP, FLAT_ODS)));

    private boolean extractMacros = false;

    /**
     * Creates a parser with macro extraction disabled.
     */
    public FlatOpenDocumentParser() {
    }

    /**
     * Constructor with explicit Config object.
     *
     * @param config the configuration; its {@code extractMacros} value is copied
     */
    public FlatOpenDocumentParser(Config config) {
        this.extractMacros = config.extractMacros;
    }

    /**
     * Constructor for JSON configuration.
     * Requires Jackson on the classpath.
     *
     * @param jsonConfig JSON configuration
     */
    public FlatOpenDocumentParser(JsonConfig jsonConfig) {
        this(ConfigDeserializer.buildConfig(jsonConfig, Config.class));
    }

    /**
     * Returns the flat OpenDocument media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return an unmodifiable set of the generic, text, presentation and
     *         spreadsheet flat types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses the stream as XML, emitting XHTML to {@code handler} and updating
     * {@code metadata}.
     *
     * @param tis      the document stream; a close shield is set for the duration
     *                 of the parse and removed afterwards
     * @param handler  receiver of the generated XHTML events
     * @param metadata metadata to populate; {@link Metadata#CONTENT_TYPE} is
     *                 overwritten when a subtype is detected
     * @param context  the parse context passed on to the SAX utilities and handlers
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if a handler or the XML parse fails
     * @throws TikaException if parsing fails
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();
        // NOTE(review): presumably keeps the SAX parse from closing the caller's
        // stream -- confirm against TikaInputStream.
        tis.setCloseShield();
        try {
            FlatOpenDocumentParserHandler fodHandler = getContentHandler(xhtml, metadata, context);
            XMLReaderUtils.parseSAX(tis, new EmbeddedContentHandler(fodHandler), context);
            // The subtype (text/presentation/spreadsheet) is only known once the
            // root element has been seen, so the content type is refined here.
            MediaType detected = fodHandler.getDetectedType();
            if (detected != null) {
                metadata.set(Metadata.CONTENT_TYPE, detected.toString());
            }
        } finally {
            tis.removeCloseShield();
            xhtml.endDocument();
        }
    }

    /**
     * Enables or disables routing of the {@code scripts} section to the macro handler.
     *
     * @param extractMacros {@code true} to extract macros
     */
    public void setExtractMacros(boolean extractMacros) {
        this.extractMacros = extractMacros;
    }

    /**
     * @return whether macro extraction is enabled
     */
    public boolean isExtractMacros() {
        return extractMacros;
    }

    /**
     * Builds the routing handler for one parse, using the current
     * {@code extractMacros} setting.
     */
    private FlatOpenDocumentParserHandler getContentHandler(ContentHandler handler,
                                                            Metadata metadata,
                                                            ParseContext context) {
        return new FlatOpenDocumentParserHandler(handler, metadata, context, extractMacros);
    }

    /**
     * Routes element and character events to a per-section handler.
     * <p>
     * A section starts at an element whose local name is {@code meta},
     * {@code body} or (with macro extraction on) {@code scripts}, and it ends
     * only when that same element closes. Everything nested inside a section
     * stays with the section's handler, even if a descendant reuses one of the
     * section names (for example a {@code text:meta} span, or the sections of
     * a document embedded inline in the body). Events outside any section go
     * to a no-op handler.
     */
    private static class FlatOpenDocumentParserHandler extends ContentHandlerDecorator {
        private static final String META = "meta";
        private static final String BODY = "body";
        private static final String SCRIPTS = "scripts";
        private static final String DOCUMENT = "document";
        /** Marker for "no section is currently open". */
        private static final int NO_SECTION = -1;

        private final ContentHandler defaultHandler = new DefaultHandler();
        private final ContentHandler bodyHandler;
        private final ContentHandler metadataHandler;
        /** {@code null} unless macro extraction is enabled. */
        private final ContentHandler macroHandler;
        private final boolean extractMacros;

        private ContentHandler currentHandler = defaultHandler;
        private MediaType detectedType = null;
        /** Number of elements currently open; 0 before the root element starts. */
        private int depth = 0;
        /** Depth at which the active section element was opened, or {@link #NO_SECTION}. */
        private int sectionDepth = NO_SECTION;

        private FlatOpenDocumentParserHandler(ContentHandler baseHandler, Metadata metadata,
                                              ParseContext parseContext, boolean extractMacros) {
            this.extractMacros = extractMacros;
            this.bodyHandler = new OpenDocumentBodyHandler(
                    new NSNormalizerContentHandler(baseHandler), parseContext);
            this.metadataHandler = new NSNormalizerContentHandler(
                    OpenDocumentMetaParser.getContentHandler(metadata, parseContext));
            this.macroHandler = extractMacros
                    ? new FlatOpenDocumentMacroHandler(baseHandler, parseContext)
                    : null;
        }

        /**
         * @return the flat subtype derived from the root element's
         *         {@code mimetype} attribute, or {@code null} if the attribute was
         *         absent or not one of the recognised values
         */
        MediaType getDetectedType() {
            return detectedType;
        }

        @Override
        public void startElement(String namespaceURI, String localName, String qName,
                                 Attributes attrs) throws SAXException {
            // Only open a section when none is active, so that same-named
            // descendants cannot steal events from the enclosing section.
            if (sectionDepth == NO_SECTION) {
                ContentHandler section = sectionHandlerFor(localName);
                if (section != null) {
                    currentHandler = section;
                    sectionDepth = depth;
                }
            }
            // Trust the mimetype attribute for the subtype, but only on the root
            // element: a nested document must not override the container's type.
            if (depth == 0 && DOCUMENT.equals(localName)) {
                detectSubtype(attrs);
            }
            depth++;
            currentHandler.startElement(namespaceURI, localName, qName, attrs);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            currentHandler.characters(ch, start, length);
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            depth--;
            // Leave the section only when the element that opened it closes. As
            // before, the section's own closing event goes to the no-op handler.
            if (sectionDepth != NO_SECTION && depth == sectionDepth) {
                currentHandler = defaultHandler;
                sectionDepth = NO_SECTION;
            }
            currentHandler.endElement(namespaceURI, localName, qName);
        }

        /**
         * Maps a section element's local name to its handler.
         *
         * @return the handler for the section, or {@code null} if the name does
         *         not start a section
         */
        private ContentHandler sectionHandlerFor(String localName) {
            if (META.equals(localName)) {
                return metadataHandler;
            }
            if (BODY.equals(localName)) {
                return bodyHandler;
            }
            if (extractMacros && SCRIPTS.equals(localName)) {
                return macroHandler;
            }
            return null;
        }

        /** Records the flat subtype matching the {@code mimetype} attribute, if any. */
        private void detectSubtype(Attributes attrs) {
            String mime = XMLReaderUtils.getAttrValue("mimetype", attrs);
            if (mime == null) {
                return;
            }
            if (mime.equals(ODT.toString())) {
                detectedType = FLAT_ODT;
            } else if (mime.equals(ODP.toString())) {
                detectedType = FLAT_ODP;
            } else if (mime.equals(ODS.toString())) {
                detectedType = FLAT_ODS;
            }
        }
    }
}