OOXMLTikaBodyPartHandler.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
public class OOXMLTikaBodyPartHandler
implements XWPFBodyContentsHandler {
/** Element name used for a top-level paragraph unless a style maps it to another tag. */
private static final String P = "p";
/** Single line feed written around paragraphs that do not get an element of their own. */
private static final char[] NEWLINE = {'\n'};

/** Destination of every XHTML event this handler generates. */
private final XHTMLContentHandler xhtml;
/** Source of formatted list numbers for numbered paragraphs. */
private final XWPFListManager listManager;
private final boolean includeDeletedText;
private final boolean includeMoveFromText;
/** Maps paragraph style IDs to style names. */
private final XWPFStylesShim styles;
/** Receives document-level flags (linked OLE objects, hyperlink kinds); may be null. */
private final Metadata metadata;
private int pDepth; //paragraph depth
private int tableDepth; //table depth
private int sdtDepth; //SDT depth: incremented by startSDT, decremented by endSDT
private final FormattingTagManager formattingTags;
//TODO: fix this
//pWithinCell should be an array/stack of given cell depths
//so that when you get to the end of an embedded table, e.g.,
//you know what your paragraph count was in the parent cell.
//<tc><p/><p/><table><tr><tc></p></p></tc></tr></table>...
private int tableCellDepth;
private int pWithinCell;
//will need to replace this with a stack
//if we're marking more than the first level <p/> element
private String paragraphTag;
private OOXMLInlineBodyPartMap inlinePartMap = OOXMLInlineBodyPartMap.EMPTY;
private ParseContext parseContext;
//comment IDs queued by commentReference and drained when a paragraph ends
private final java.util.List<String> pendingCommentIds = new java.util.ArrayList<>();
//comment IDs whose content has already been written inline
private final java.util.Set<String> emittedCommentIds = new java.util.HashSet<>();
//keyed by relationship ID; filled by embeddedOLERef
private final Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap = new HashMap<>();
/**
* Creates a handler without document metadata by delegating to the
* two-argument constructor with {@code null} metadata.
*
* @param xhtml handler that receives the generated XHTML events
*/
public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) {
this(xhtml, null);
}
/**
 * Creates a handler that uses {@link XWPFStylesShim#EMPTY_STYLES} and
 * {@link XWPFListManager#EMPTY_LIST}, and that reports deleted and
 * moved-from text as not included.
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata on which document-level flags are set; may be {@code null}
 */
public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata metadata) {
    // output side
    this.xhtml = xhtml;
    this.formattingTags = new FormattingTagManager(xhtml);
    this.metadata = metadata;
    // no style or numbering information is available in this configuration
    this.styles = XWPFStylesShim.EMPTY_STYLES;
    this.listManager = XWPFListManager.EMPTY_LIST;
    // tracked-change text is reported as excluded
    this.includeMoveFromText = false;
    this.includeDeletedText = false;
}
/**
* Creates a fully configured handler without document metadata by delegating
* to the five-argument constructor with {@code null} metadata.
*
* @param xhtml        handler that receives the generated XHTML events
* @param styles       style lookup used to resolve paragraph style IDs
* @param listManager  source of formatted list numbers
* @param parserConfig configuration read for the deleted / moved-from text settings
*/
public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles,
XWPFListManager listManager,
OfficeParserConfig parserConfig) {
this(xhtml, styles, listManager, parserConfig, null);
}
/**
 * Creates a fully configured handler.
 *
 * @param xhtml        handler that receives the generated XHTML events
 * @param styles       style lookup used to resolve paragraph style IDs; a
 *                     {@code null} value is tolerated by {@code startParagraph}
 * @param listManager  source of formatted list numbers; a {@code null} value
 *                     disables paragraph numbering
 * @param parserConfig configuration read for the deleted / moved-from text
 *                     settings; must not be {@code null}
 * @param metadata     metadata on which document-level flags are set; may be {@code null}
 */
public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles,
                                XWPFListManager listManager,
                                OfficeParserConfig parserConfig, Metadata metadata) {
    // output side
    this.xhtml = xhtml;
    this.formattingTags = new FormattingTagManager(xhtml);
    this.metadata = metadata;
    // document-derived lookups
    this.styles = styles;
    this.listManager = listManager;
    // tracked-change settings come straight from the parser configuration
    this.includeDeletedText = parserConfig.isIncludeDeletedContent();
    this.includeMoveFromText = parserConfig.isIncludeMoveFromContent();
}
/**
 * Supplies the pre-parsed inline body part content (footnotes, endnotes,
 * comments) so that references met while parsing the main document can be
 * resolved in place.
 *
 * @param inlinePartMap pre-parsed parts; {@code null} is replaced by
 *                      {@link OOXMLInlineBodyPartMap#EMPTY}
 * @param parseContext  context passed to the XML parser when a part is inlined
 */
public void setInlineBodyPartMap(OOXMLInlineBodyPartMap inlinePartMap,
                                 ParseContext parseContext) {
    if (inlinePartMap == null) {
        this.inlinePartMap = OOXMLInlineBodyPartMap.EMPTY;
    } else {
        this.inlinePartMap = inlinePartMap;
    }
    this.parseContext = parseContext;
}
/**
* Writes the text of one run: the run's formatting is applied through the
* formatting tag manager first, then the text is emitted as characters.
*
* @param runProperties formatting of the run
* @param contents      text of the run
* @throws SAXException if the underlying handler fails
*/
@Override
public void run(RunProperties runProperties, String contents) throws SAXException {
formattingTags.applyFormatting(runProperties);
xhtml.characters(contents);
}
/**
* Opens a hyperlink by delegating to the formatting tag manager.
*
* @param link target of the hyperlink
* @throws SAXException if the underlying handler fails
*/
@Override
public void hyperlinkStart(String link) throws SAXException {
formattingTags.openHyperlink(link);
}
/**
* Closes the current hyperlink by delegating to the formatting tag manager.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void hyperlinkEnd() throws SAXException {
formattingTags.closeHyperlink();
}
/**
 * Starts a paragraph.
 * <p>
 * A paragraph element is opened only for a paragraph that is not nested in
 * another paragraph, a table or an SDT. Its tag and optional {@code class}
 * attribute come from the paragraph's style name when one can be resolved,
 * otherwise the default paragraph tag is used. Every paragraph, nested or
 * not, gets its list number written and increases the paragraph depth.
 *
 * @param paragraphProperties properties of the paragraph being started
 * @throws SAXException if the underlying handler fails
 */
@Override
public void startParagraph(ParagraphProperties paragraphProperties) throws SAXException {
    //if you're in a table cell and you're after the first paragraph
    //make sure to prepend a \n
    boolean laterParagraphInCell = tableCellDepth > 0 && pWithinCell > 0;
    if (laterParagraphInCell) {
        xhtml.characters(NEWLINE, 0, 1);
    }

    boolean opensElement = pDepth == 0 && tableDepth == 0 && sdtDepth == 0;
    if (opensElement) {
        paragraphTag = P;
        String cssClass = null;
        String styleId = paragraphProperties.getStyleID();
        //TIKA-2144 check that styles is not null
        String styleName = null;
        if (styleId != null && styles != null) {
            styleName = styles.getStyleName(styleId);
        }
        if (styleName != null) {
            WordExtractor.TagAndStyle tagAndStyle =
                    WordExtractor.buildParagraphTagAndStyle(styleName, false);
            paragraphTag = tagAndStyle.getTag();
            cssClass = tagAndStyle.getStyleClass();
        }
        if (cssClass != null) {
            xhtml.startElement(paragraphTag, "class", cssClass);
        } else {
            xhtml.startElement(paragraphTag);
        }
    }

    writeParagraphNumber(paragraphProperties.getNumId(), paragraphProperties.getIlvl(),
            listManager, xhtml);
    pDepth++;
}
/**
 * Ends a paragraph.
 * <p>
 * All open formatting tags are closed first. The paragraph element is then
 * closed only if {@code startParagraph} opened one, i.e. for a paragraph that
 * is not nested in another paragraph, a table or an SDT. Any other paragraph
 * is terminated with a newline, except the first paragraph of a table cell,
 * which gets none. Comments referenced inside the paragraph are written
 * after it.
 *
 * @throws SAXException if the underlying handler fails
 */
@Override
public void endParagraph() throws SAXException {
    formattingTags.closeAll();
    // Must mirror the condition under which startParagraph opens the element.
    // Without the sdtDepth check a paragraph inside a block-level SDT would
    // emit an end tag that has no matching start tag (and a null tag name if
    // no top-level paragraph had been seen yet).
    boolean closesElement = pDepth == 1 && tableDepth == 0 && sdtDepth == 0;
    if (closesElement) {
        xhtml.endElement(paragraphTag);
    } else if (tableCellDepth == 0 || (tableCellDepth > 0 && pWithinCell > 0)) {
        xhtml.characters(NEWLINE, 0, 1);
    }
    // Emit any pending comment content after the paragraph closes
    // (matching the DOM parser's behavior of appending comments after paragraphs)
    emitPendingComments();
    if (tableCellDepth > 0) {
        pWithinCell++;
    }
    pDepth--;
}
/**
 * Writes the content of every queued comment reference inline and empties
 * the queue. IDs with no pre-parsed content are dropped without output; IDs
 * that were written are remembered in the emitted-comment set.
 *
 * @throws SAXException if the underlying handler fails
 */
private void emitPendingComments() throws SAXException {
    for (String commentId : pendingCommentIds) {
        byte[] commentXml = inlinePartMap.getComment(commentId);
        if (commentXml == null) {
            continue;
        }
        inlineNoteContent(commentXml, "comment");
        emittedCommentIds.add(commentId);
    }
    pendingCommentIds.clear();
}
/**
* Returns the set of comment IDs that were inlined during parsing.
* Used by the decorator to skip these when dumping remaining comments.
* <p>
* The returned set is the handler's own live set, not a copy.
*
* @return IDs of the comments already written inline
*/
public java.util.Set<String> getEmittedCommentIds() {
return emittedCommentIds;
}
/**
* Opens a {@code table} element and increases the table depth.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void startTable() throws SAXException {
xhtml.startElement("table");
tableDepth++;
}
/**
* Closes the {@code table} element and decreases the table depth.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void endTable() throws SAXException {
xhtml.endElement("table");
tableDepth--;
}
/**
* Opens a {@code tr} element.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void startTableRow() throws SAXException {
xhtml.startElement("tr");
}
/**
* Closes the {@code tr} element.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void endTableRow() throws SAXException {
xhtml.endElement("tr");
}
/**
* Opens a {@code td} element and increases the table cell depth.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void startTableCell() throws SAXException {
xhtml.startElement("td");
tableCellDepth++;
}
/**
* Closes the {@code td} element, resets the per-cell paragraph counter and
* decreases the table cell depth.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void endTableCell() throws SAXException {
xhtml.endElement("td");
// the counter is a single value, so it is reset even when the cell that
// just ended was nested inside another cell (see the TODO on pWithinCell)
pWithinCell = 0;
tableCellDepth--;
}
/**
* Marks the start of an SDT: asks the formatting tag manager to close all
* of its tags and increases the SDT depth.
*
* @throws SAXException if the underlying handler fails
*/
@Override
public void startSDT() throws SAXException {
formattingTags.closeAll();
sdtDepth++;
}
/**
* Marks the end of an SDT by decreasing the SDT depth.
*/
@Override
public void endSDT() {
sdtDepth--;
}
/**
* Does nothing; this handler produces no output for the start of an
* edited section.
*
* @param editor   ignored
* @param date     ignored
* @param editType ignored
*/
@Override
public void startEditedSection(String editor, Date date,
EditType editType) {
//no-op
}
/**
* Does nothing; this handler produces no output for the end of an
* edited section.
*/
@Override
public void endEditedSection() {
//no-op
}
/**
* Returns the deleted-text setting fixed at construction time.
*
* @return {@code true} if deleted text is to be included
*/
@Override
public boolean isIncludeDeletedText() {
return includeDeletedText;
}
/**
 * Handles a footnote reference. If pre-parsed content exists for the ID it
 * is written inline; otherwise the ID is written in square brackets.
 * A {@code null} ID is ignored.
 *
 * @param id footnote ID, may be {@code null}
 * @throws SAXException if the underlying handler fails
 */
@Override
public void footnoteReference(String id) throws SAXException {
    if (id == null) {
        return;
    }
    byte[] noteXml = inlinePartMap.getFootnote(id);
    if (noteXml == null) {
        // no content available: fall back to a bracketed reference marker
        xhtml.characters("[");
        xhtml.characters(id);
        xhtml.characters("]");
        return;
    }
    inlineNoteContent(noteXml, "footnote");
}
/**
 * Handles an endnote reference. If pre-parsed content exists for the ID it
 * is written inline; otherwise the ID is written in square brackets.
 * A {@code null} ID is ignored.
 *
 * @param id endnote ID, may be {@code null}
 * @throws SAXException if the underlying handler fails
 */
@Override
public void endnoteReference(String id) throws SAXException {
    if (id == null) {
        return;
    }
    byte[] noteXml = inlinePartMap.getEndnote(id);
    if (noteXml == null) {
        // no content available: fall back to a bracketed reference marker
        xhtml.characters("[");
        xhtml.characters(id);
        xhtml.characters("]");
        return;
    }
    inlineNoteContent(noteXml, "endnote");
}
/**
 * Queues a comment reference; the comment content is written when the
 * current paragraph ends. A {@code null} ID is ignored.
 *
 * @param id comment ID, may be {@code null}
 * @throws SAXException declared by the interface; not thrown here
 */
@Override
public void commentReference(String id) throws SAXException {
    if (id == null) {
        return;
    }
    pendingCommentIds.add(id);
}
/**
 * Parses a pre-extracted XML fragment (footnote, endnote or comment) and
 * writes its content inline, wrapped in a {@code div} whose {@code class}
 * attribute is {@code cssClass}.
 * <p>
 * The fragment is handled by a fresh nested handler that writes to the same
 * XHTML output and shares this handler's metadata. The nested handler is
 * created with empty styles, an empty list manager and the default
 * deleted / moved-from text settings.
 * <p>
 * A {@link TikaException} or {@link IOException} raised while parsing is not
 * propagated; a {@code [<cssClass> parse error]} marker is written instead
 * and the {@code div} is still closed.
 *
 * @param xml      bytes of the XML fragment to parse
 * @param cssClass value of the wrapping element's {@code class} attribute
 * @throws SAXException if the underlying handler fails
 */
private void inlineNoteContent(byte[] xml, String cssClass) throws SAXException {
    // Use the inline part map's relationship map which includes relationships
    // from the footnote/endnote parts (needed for picture resolution)
    Map<String, String> noteRelationships = inlinePartMap.getLinkedRelationships();
    xhtml.startElement("div", "class", cssClass);
    // Pass the metadata along so that document-level flags (linked OLE
    // objects, field/hover/VML hyperlinks) found inside the fragment are
    // recorded rather than dropped by a metadata-less nested handler.
    OOXMLTikaBodyPartHandler noteHandler = new OOXMLTikaBodyPartHandler(xhtml, metadata);
    try {
        XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml),
                new EmbeddedContentHandler(
                        new OOXMLWordAndPowerPointTextHandler(noteHandler, noteRelationships)),
                parseContext);
    } catch (TikaException | IOException e) {
        // best effort: a broken fragment must not abort the main document
        xhtml.characters("[" + cssClass + " parse error]");
    }
    xhtml.endElement("div");
}
/**
* Returns the moved-from-text setting fixed at construction time.
*
* @return {@code true} if moved-from text is to be included
*/
@Override
public boolean isIncludeMoveFromText() {
return includeMoveFromText;
}
/**
 * Handles a reference to an embedded OLE object. When a non-empty prog ID
 * or EMF image relationship ID is supplied, an {@link EmbeddedPartMetadata}
 * entry is stored under {@code relId}. In every case an empty
 * {@code <div class="embedded" id="relId">} placeholder is written.
 * A {@code null} {@code relId} is ignored.
 *
 * @param relId       relationship ID of the embedded object, may be {@code null}
 * @param progId      prog ID of the object, may be {@code null} or empty
 * @param emfImageRId relationship ID of the EMF image, may be {@code null} or empty
 * @throws SAXException if the underlying handler fails
 */
@Override
public void embeddedOLERef(String relId, String progId, String emfImageRId)
        throws SAXException {
    if (relId == null) {
        return;
    }

    boolean hasProgId = progId != null && !progId.isEmpty();
    boolean hasEmfImage = emfImageRId != null && !emfImageRId.isEmpty();
    if (hasProgId || hasEmfImage) {
        EmbeddedPartMetadata partMetadata = new EmbeddedPartMetadata(emfImageRId);
        if (hasProgId) {
            partMetadata.setProgId(progId);
        }
        embeddedPartMetadataMap.put(relId, partMetadata);
    }

    AttributesImpl placeholderAttrs = new AttributesImpl();
    placeholderAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
    placeholderAttrs.addAttribute("", "id", "id", "CDATA", relId);
    xhtml.startElement("div", placeholderAttrs);
    xhtml.endElement("div");
}
/**
* Returns the metadata collected for embedded OLE objects, keyed by
* relationship ID. The returned map is the handler's own live map, not a copy.
*
* @return embedded part metadata by relationship ID
*/
public Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() {
return embeddedPartMetadataMap;
}
/**
 * Handles a reference to a linked OLE object: flags the metadata (when
 * present) with {@link Office#HAS_LINKED_OLE_OBJECTS} and writes an empty
 * {@code <a class="external-ref-linkedOle" id="relId">} anchor.
 * A {@code null} {@code relId} is ignored.
 *
 * @param relId relationship ID of the linked object, may be {@code null}
 * @throws SAXException if the underlying handler fails
 */
@Override
public void linkedOLERef(String relId) throws SAXException {
    if (relId == null) {
        return;
    }

    if (metadata != null) {
        metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true);
    }

    // Emit as an external reference anchor - linked OLE objects reference external files
    AttributesImpl anchorAttrs = new AttributesImpl();
    anchorAttrs.addAttribute("", "class", "class", "CDATA", "external-ref-linkedOle");
    anchorAttrs.addAttribute("", "id", "id", "CDATA", relId);
    xhtml.startElement("a", anchorAttrs);
    xhtml.endElement("a");
}
/**
 * Writes an empty {@code img} element for an embedded picture. The
 * {@code src} attribute ({@code "embedded:" + picFileName}) and the
 * {@code alt} attribute are each added only when the corresponding argument
 * is not {@code null}.
 *
 * @param picFileName    file name of the picture, may be {@code null}
 * @param picDescription description of the picture, may be {@code null}
 * @throws SAXException if the underlying handler fails
 */
@Override
public void embeddedPicRef(String picFileName, String picDescription) throws SAXException {
    AttributesImpl imgAttrs = new AttributesImpl();
    if (picFileName != null) {
        String src = "embedded:" + picFileName;
        imgAttrs.addAttribute("", "src", "src", "CDATA", src);
    }
    if (picDescription != null) {
        imgAttrs.addAttribute("", "alt", "alt", "CDATA", picDescription);
    }

    xhtml.startElement("img", imgAttrs);
    xhtml.endElement("img");
}
/**
* Starts a hyperlink that comes from a field code: flags the metadata (when
* present) with {@link Office#HAS_FIELD_HYPERLINKS}, then opens the link
* through {@link #hyperlinkStart(String)}.
*
* @param link target of the hyperlink
* @throws SAXException if the underlying handler fails
*/
@Override
public void fieldCodeHyperlinkStart(String link) throws SAXException {
if (metadata != null) {
metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
}
hyperlinkStart(link);
}
/**
 * Writes an empty anchor for an external reference, with
 * {@code class="external-ref-" + fieldType} and {@code href=url}, and flags
 * the metadata (when present) according to the field type:
 * {@code "hlinkHover"} sets {@link Office#HAS_HOVER_HYPERLINKS},
 * {@code "vml-shape-href"} sets {@link Office#HAS_VML_HYPERLINKS}, anything
 * else sets {@link Office#HAS_FIELD_HYPERLINKS}.
 * A {@code null} or empty URL is ignored.
 *
 * @param fieldType kind of reference, used in the class attribute
 * @param url       target of the reference, may be {@code null} or empty
 * @throws SAXException if the underlying handler fails
 */
@Override
public void externalRef(String fieldType, String url) throws SAXException {
    boolean hasUrl = url != null && !url.isEmpty();
    if (!hasUrl) {
        return;
    }

    if (metadata != null) {
        if ("hlinkHover".equals(fieldType)) {
            metadata.set(Office.HAS_HOVER_HYPERLINKS, true);
        } else if ("vml-shape-href".equals(fieldType)) {
            metadata.set(Office.HAS_VML_HYPERLINKS, true);
        } else {
            metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
        }
    }

    String cssClass = "external-ref-" + fieldType;
    AttributesImpl anchorAttrs = new AttributesImpl();
    anchorAttrs.addAttribute("", "class", "class", "CDATA", cssClass);
    anchorAttrs.addAttribute("", "href", "href", "CDATA", url);
    xhtml.startElement("a", anchorAttrs);
    xhtml.endElement("a");
}
/**
 * Writes an empty {@code <a name="name">} anchor for a bookmark. Nothing is
 * written when the name is {@code null} or while a hyperlink is active.
 *
 * @param id   bookmark ID; not used here
 * @param name bookmark name, may be {@code null}
 * @throws SAXException if the underlying handler fails
 */
@Override
public void startBookmark(String id, String name) throws SAXException {
    //skip bookmarks within hyperlinks
    if (name == null || formattingTags.isHyperlinkActive()) {
        return;
    }
    xhtml.startElement("a", "name", name);
    xhtml.endElement("a");
}
/**
* Does nothing; the bookmark anchor is written and closed entirely by
* {@code startBookmark}.
*
* @param id ignored
*/
@Override
public void endBookmark(String id) {
//no-op
}
/**
 * Writes the formatted list number of a paragraph, if it has one. Nothing
 * is written when either index is negative, when there is no list manager,
 * or when the list manager returns no number.
 *
 * @param numId       numbering ID of the paragraph; negative means none
 * @param ilvl        list level of the paragraph; negative means none
 * @param listManager source of the formatted number, may be {@code null}
 * @param xhtml       handler the number is written to
 * @throws SAXException if the handler fails
 */
private void writeParagraphNumber(int numId, int ilvl, XWPFListManager listManager,
                                  XHTMLContentHandler xhtml) throws SAXException {
    boolean numbered = ilvl >= 0 && numId >= 0 && listManager != null;
    if (!numbered) {
        return;
    }
    BigInteger listId = BigInteger.valueOf(numId);
    String formatted = listManager.getFormattedNumber(listId, ilvl);
    if (formatted == null) {
        return;
    }
    xhtml.characters(formatted);
}
}