XSSFExcelExtractorDecorator.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.model.Comments;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
// Relationship types for external data sources
private static final String EXTERNAL_LINK_RELATION =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/externalLink";
private static final String CONNECTIONS_RELATION =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/connections";
private static final String QUERY_TABLE_RELATION =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/queryTable";
private static final String PIVOT_CACHE_DEFINITION_RELATION =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheDefinition";
// Power Query stores data in customData parts
private static final String POWER_QUERY_CONTENT_TYPE =
"application/vnd.ms-excel.customDataProperties+xml";
/**
* Allows access to headers/footers from raw xml strings
*/
protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
protected final DataFormatter formatter;
protected final List<PackagePart> sheetParts = new ArrayList<>();
protected final Map<String, String> drawingHyperlinks = new HashMap<>();
protected Metadata metadata;
protected ParseContext parseContext;
public XSSFExcelExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor,
Locale locale) {
super(context, extractor);
this.parseContext = context;
this.extractor = (XSSFEventBasedExcelExtractor) extractor;
configureExtractor(this.extractor, locale);
if (locale == null) {
formatter = new TikaExcelDataFormatter();
} else {
formatter = new TikaExcelDataFormatter(locale);
}
OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
if (officeParserConfig != null) {
((TikaExcelDataFormatter) formatter)
.setDateFormatOverride(officeParserConfig.getDateFormatOverride());
}
}
protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
((XSSFEventBasedExcelExtractor) extractor)
.setIncludeTextBoxes(config.isIncludeShapeBasedContent());
((XSSFEventBasedExcelExtractor) extractor).setFormulasNotResults(false);
((XSSFEventBasedExcelExtractor) extractor).setLocale(locale);
//given that we load our own shared strings table, setting:
//((XSSFEventBasedExcelExtractor)extractor).setConcatenatePhoneticRuns();
//does no good here.
}
@Override
public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
throws SAXException, XmlException, IOException, TikaException {
this.metadata = metadata;
this.parseContext = context;
metadata.set(Office.PROTECTED_WORKSHEET, "false");
super.getXHTML(handler, metadata, context);
}
/**
* @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
*/
@Override
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
OPCPackage container = extractor.getPackage();
ReadOnlySharedStringsTable strings;
XSSFReader.SheetIterator iter;
XSSFReader xssfReader;
StylesTable styles;
try {
xssfReader = new XSSFReader(container);
styles = xssfReader.getStylesTable();
iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
strings = new ReadOnlySharedStringsTable(container, config.isConcatenatePhoneticRuns());
} catch (OpenXML4JException e) {
throw new XmlException(e);
}
while (iter.hasNext()) {
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml);
PackagePart sheetPart = null;
try (InputStream stream = iter.next()) {
sheetPart = iter.getSheetPart();
addDrawingHyperLinks(sheetPart);
sheetParts.add(sheetPart);
Comments comments = iter.getSheetComments();
if (comments != null && comments.getNumberOfComments() > 0) {
metadata.set(Office.HAS_COMMENTS, true);
}
// Start, and output the sheet name
xhtml.startElement("div", "class", "sheet");
xhtml.element("h1", iter.getSheetName());
// Extract the main sheet contents
xhtml.startElement("table");
xhtml.startElement("tbody");
processSheet(sheetExtractor, comments, styles, strings, stream);
try {
getThreadedComments(container, sheetPart, xhtml);
} catch (InvalidFormatException | TikaException | IOException e) {
//swallow
}
xhtml.endElement("tbody");
xhtml.endElement("table");
}
// Output any headers and footers
// (Need to process the sheet to get them, so we can't
// do the headers before the contents)
for (String header : sheetExtractor.headers) {
extractHeaderFooter(header, xhtml);
}
for (String footer : sheetExtractor.footers) {
extractHeaderFooter(footer, xhtml);
}
// Do text held in shapes, if required
if (config.isIncludeShapeBasedContent()) {
List<XSSFShape> shapes = iter.getShapes();
processShapes(shapes, xhtml);
}
//for now dump sheet hyperlinks at bottom of page
//consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
//step 1: extract hyperlink info from bottom of page
//step 2: process as we do now, but with cached hyperlink relationship info
extractHyperLinks(sheetPart, xhtml);
// All done with this sheet
xhtml.endElement("div");
}
//consider adding this back to POI
try (InputStream wbData = xssfReader.getWorkbookData()) {
XMLReaderUtils
.parseSAX(wbData, new WorkbookMetadataHandler(),
parseContext);
} catch (InvalidFormatException | TikaException e) {
//swallow
}
try {
getPersons(container, metadata);
} catch (InvalidFormatException | TikaException | IOException | SAXException e) {
//swallow
}
// Extract external data sources (HIGH security risk - can hide malicious URLs)
try {
extractExternalDataSources(container, xhtml);
} catch (InvalidFormatException | TikaException | IOException | SAXException e) {
//swallow
}
}
/**
* Extracts external data sources from the workbook including:
* - External workbook links
* - Data connections (database, web queries)
* - Query tables
*/
private void extractExternalDataSources(OPCPackage container, XHTMLContentHandler xhtml)
throws InvalidFormatException, TikaException, IOException, SAXException {
PackageRelationship coreDocRelationship = container.getRelationshipsByType(
PackageRelationshipTypes.CORE_DOCUMENT).getRelationship(0);
if (coreDocRelationship == null) {
return;
}
PackagePart workbookPart = container.getPart(coreDocRelationship);
if (workbookPart == null) {
return;
}
// Extract external workbook links
extractExternalLinks(workbookPart, xhtml);
// Extract connections (database, ODBC, web queries)
extractConnections(workbookPart, xhtml);
// Extract query tables from each sheet
for (PackagePart sheetPart : sheetParts) {
extractQueryTables(sheetPart, xhtml);
}
// Detect pivot cache with external data sources
extractPivotCacheExternalData(workbookPart, xhtml);
// Detect Power Query / Data Mashup
detectPowerQuery(container);
}
/**
* Detects pivot cache definitions with external data sources (OLAP, databases).
*/
private void extractPivotCacheExternalData(PackagePart workbookPart, XHTMLContentHandler xhtml)
throws InvalidFormatException {
PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(PIVOT_CACHE_DEFINITION_RELATION);
if (coll == null || coll.isEmpty()) {
return;
}
for (PackageRelationship rel : coll) {
try {
PackagePart pivotCachePart = workbookPart.getRelatedPart(rel);
if (pivotCachePart != null) {
PivotCacheHandler handler = new PivotCacheHandler(xhtml);
try (InputStream is = pivotCachePart.getInputStream()) {
XMLReaderUtils.parseSAX(is, handler, parseContext);
}
if (handler.hasExternalData()) {
metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
}
}
} catch (IOException | TikaException | SAXException e) {
// swallow
}
}
}
/**
* Detects Power Query / Data Mashup presence.
*/
private void detectPowerQuery(OPCPackage container) {
// Power Query data is stored in customData parts with specific content type
// or in xl/customData/ folder
try {
List<PackagePart> customDataParts = container.getPartsByContentType(POWER_QUERY_CONTENT_TYPE);
if (customDataParts != null && !customDataParts.isEmpty()) {
metadata.set(Office.HAS_POWER_QUERY, true);
}
// Also check for customData folder parts
for (PackagePart part : container.getParts()) {
String partName = part.getPartName().getName();
if (partName.contains("/customData/") || partName.contains("/dataMashup")) {
metadata.set(Office.HAS_POWER_QUERY, true);
break;
}
}
} catch (InvalidFormatException e) {
// swallow
}
}
/**
* Extracts external workbook links from externalLink parts.
*/
private void extractExternalLinks(PackagePart workbookPart, XHTMLContentHandler xhtml)
throws InvalidFormatException, SAXException {
PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(EXTERNAL_LINK_RELATION);
if (coll == null || coll.isEmpty()) {
return;
}
// If we have any external link relationships, set the metadata flag
if (coll.size() > 0) {
metadata.set(Office.HAS_EXTERNAL_LINKS, true);
}
for (PackageRelationship rel : coll) {
if (rel.getTargetMode() == TargetMode.EXTERNAL) {
// Direct external reference
emitExternalRef(xhtml, "externalLink", rel.getTargetURI().toString());
} else {
// Internal part that contains external reference - parse it
try {
PackagePart externalLinkPart = workbookPart.getRelatedPart(rel);
if (externalLinkPart != null) {
ExternalLinkHandler handler = new ExternalLinkHandler(xhtml);
try (InputStream is = externalLinkPart.getInputStream()) {
XMLReaderUtils.parseSAX(is, handler, parseContext);
}
if (handler.hasDdeLink()) {
metadata.set(Office.HAS_DDE_LINKS, true);
}
}
} catch (IOException | TikaException e) {
// swallow
}
}
}
}
/**
* Extracts data connections from connections.xml.
*/
private void extractConnections(PackagePart workbookPart, XHTMLContentHandler xhtml)
throws InvalidFormatException, SAXException {
PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(CONNECTIONS_RELATION);
if (coll == null || coll.isEmpty()) {
return;
}
for (PackageRelationship rel : coll) {
try {
PackagePart connectionsPart = workbookPart.getRelatedPart(rel);
if (connectionsPart != null) {
ConnectionsHandler handler = new ConnectionsHandler(xhtml);
try (InputStream is = connectionsPart.getInputStream()) {
XMLReaderUtils.parseSAX(is, handler, parseContext);
}
if (handler.hasConnections()) {
metadata.set(Office.HAS_DATA_CONNECTIONS, true);
}
if (handler.hasWebQueries()) {
metadata.set(Office.HAS_WEB_QUERIES, true);
}
}
} catch (IOException | TikaException e) {
// swallow
}
}
}
/**
* Extracts query table external sources.
*/
private void extractQueryTables(PackagePart sheetPart, XHTMLContentHandler xhtml)
throws InvalidFormatException, SAXException {
PackageRelationshipCollection coll = sheetPart.getRelationshipsByType(QUERY_TABLE_RELATION);
if (coll == null || coll.isEmpty()) {
return;
}
for (PackageRelationship rel : coll) {
try {
PackagePart queryTablePart = sheetPart.getRelatedPart(rel);
if (queryTablePart != null) {
try (InputStream is = queryTablePart.getInputStream()) {
XMLReaderUtils.parseSAX(is, new QueryTableHandler(xhtml), parseContext);
}
}
} catch (IOException | TikaException e) {
// swallow
}
}
}
/**
* Emits an external reference as an anchor element with appropriate class.
*/
private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
throws SAXException {
if (url == null || url.isEmpty()) {
return;
}
org.xml.sax.helpers.AttributesImpl attrs = new org.xml.sax.helpers.AttributesImpl();
attrs.addAttribute("", "class", "class", "CDATA", "external-ref-" + refType);
attrs.addAttribute("", "href", "href", "CDATA", url);
xhtml.startElement("a", attrs);
xhtml.endElement("a");
}
/**
* Handler for parsing externalLink XML to extract external workbook references.
*/
private class ExternalLinkHandler extends DefaultHandler {
private final XHTMLContentHandler xhtml;
private boolean foundDdeLink = false;
ExternalLinkHandler(XHTMLContentHandler xhtml) {
this.xhtml = xhtml;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException {
// Look for externalBook element with r:id attribute
if ("externalBook".equals(localName)) {
String rId = atts.getValue("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id");
// The actual URL is in the relationship, not directly in the XML
// For now, we note that there's an external book reference
}
// Look for file element with href attribute (older format)
if ("file".equals(localName)) {
String href = atts.getValue("href");
if (href != null && !href.isEmpty()) {
emitExternalRef(xhtml, "externalWorkbook", href);
}
}
// Look for oleLink with r:id (OLE links to external files)
if ("oleLink".equals(localName)) {
String rId = atts.getValue("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id");
if (rId != null) {
emitExternalRef(xhtml, "oleLink", "relationship:" + rId);
}
}
// DDE links - security risk: can execute commands
if ("ddeLink".equals(localName)) {
foundDdeLink = true;
String ddeService = atts.getValue("ddeService");
String ddeTopic = atts.getValue("ddeTopic");
if (ddeService != null || ddeTopic != null) {
String ddeRef = (ddeService != null ? ddeService : "") + "|" +
(ddeTopic != null ? ddeTopic : "");
emitExternalRef(xhtml, "ddeLink", ddeRef);
}
}
}
boolean hasDdeLink() {
return foundDdeLink;
}
}
/**
* Handler for parsing connections.xml to extract external data connections.
*/
private class ConnectionsHandler extends DefaultHandler {
private final XHTMLContentHandler xhtml;
private boolean foundConnection = false;
private boolean foundWebQuery = false;
ConnectionsHandler(XHTMLContentHandler xhtml) {
this.xhtml = xhtml;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException {
if ("connection".equals(localName)) {
foundConnection = true;
}
// Database connection string
if ("dbPr".equals(localName)) {
String connection = atts.getValue("connection");
if (connection != null && !connection.isEmpty()) {
emitExternalRef(xhtml, "dbConnection", connection);
}
}
// Web query
if ("webPr".equals(localName)) {
foundWebQuery = true;
String url = atts.getValue("url");
if (url != null && !url.isEmpty()) {
emitExternalRef(xhtml, "webQuery", url);
}
}
// ODBC connection
if ("olapPr".equals(localName)) {
String connection = atts.getValue("connection");
if (connection != null && !connection.isEmpty()) {
emitExternalRef(xhtml, "olapConnection", connection);
}
}
// Text file import
if ("textPr".equals(localName)) {
String sourceFile = atts.getValue("sourceFile");
if (sourceFile != null && !sourceFile.isEmpty()) {
emitExternalRef(xhtml, "textFileImport", sourceFile);
}
}
}
boolean hasConnections() {
return foundConnection;
}
boolean hasWebQueries() {
return foundWebQuery;
}
}
/**
* Handler for parsing queryTable XML to extract web query sources.
*/
private class QueryTableHandler extends DefaultHandler {
private final XHTMLContentHandler xhtml;
QueryTableHandler(XHTMLContentHandler xhtml) {
this.xhtml = xhtml;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException {
if ("queryTable".equals(localName)) {
String connectionId = atts.getValue("connectionId");
// Connection details are in connections.xml
}
// Web query table refresh
if ("queryTableRefresh".equals(localName)) {
// Contains refresh settings
}
}
}
/**
* Handler for parsing pivotCacheDefinition XML to detect external data sources.
*/
private class PivotCacheHandler extends DefaultHandler {
private final XHTMLContentHandler xhtml;
private boolean hasExternalData = false;
PivotCacheHandler(XHTMLContentHandler xhtml) {
this.xhtml = xhtml;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException {
// cacheSource with type="external" indicates external data
if ("cacheSource".equals(localName)) {
String type = atts.getValue("type");
if ("external".equals(type) || "consolidation".equals(type)) {
hasExternalData = true;
}
}
// worksheetSource can have external references
if ("worksheetSource".equals(localName)) {
String ref = atts.getValue("ref");
String sheet = atts.getValue("sheet");
String rId = atts.getValue("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id");
// If there's a relationship ID, it likely points to external workbook
if (rId != null) {
hasExternalData = true;
}
}
// consolidation source (multiple ranges, possibly external)
if ("consolidation".equals(localName) || "rangeSets".equals(localName)) {
hasExternalData = true;
}
}
boolean hasExternalData() {
return hasExternalData;
}
}
private void getThreadedComments(OPCPackage container, PackagePart sheetPart, XHTMLContentHandler xhtml) throws TikaException,
InvalidFormatException, SAXException, IOException {
//consider caching the person id -> person names in getPersons and injecting that into the xhtml per comment?
PackageRelationshipCollection coll = sheetPart.getRelationshipsByType(OPCPackageWrapper.THREADED_COMMENT_RELATION);
if (coll == null || coll.isEmpty()) {
return;
}
for (PackageRelationship rel : coll) {
PackagePart threadedCommentPart = sheetPart.getRelatedPart(rel);
if (threadedCommentPart == null) {
continue;
}
try (InputStream is = threadedCommentPart.getInputStream()) {
XMLReaderUtils.parseSAX(is, new ThreadedCommentHandler(xhtml), parseContext);
}
}
}
private void getPersons(OPCPackage container, Metadata metadata) throws TikaException, InvalidFormatException,
IOException, SAXException {
PackageRelationship coreDocRelationship = container.getRelationshipsByType(
PackageRelationshipTypes.CORE_DOCUMENT).getRelationship(0);
if (coreDocRelationship == null) {
return;
}
// Get the part that holds the workbook
PackagePart workbookPart = container.getPart(coreDocRelationship);
if (workbookPart == null) {
return;
}
PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(OPCPackageWrapper.PERSON_RELATION);
if (coll == null) {
return;
}
for (PackageRelationship rel : coll) {
PackagePart personsPart = workbookPart.getRelatedPart(rel);
if (personsPart == null) {
continue;
}
try (InputStream is = personsPart.getInputStream()) {
XMLReaderUtils.parseSAX(is, new CommentPersonHandler(metadata), parseContext);
}
}
}
protected void addDrawingHyperLinks(PackagePart sheetPart) {
try {
for (PackageRelationship rel : sheetPart
.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
PackagePart part = rel.getPackage().getPart(relName);
//parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
if (part == null) {
continue;
}
for (PackageRelationship drawRel : part
.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
drawingHyperlinks.put(drawRel.getId(), drawRel.getTargetURI().toString());
}
}
}
} catch (InvalidFormatException e) {
//swallow
//an exception trying to extract
//hyperlinks on drawings should not cause a parse failure
}
}
protected void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml)
throws SAXException {
try {
for (PackageRelationship rel : sheetPart
.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
xhtml.startElement("a", "href", rel.getTargetURI().toString());
xhtml.characters(rel.getTargetURI().toString());
xhtml.endElement("a");
}
} catch (InvalidFormatException e) {
//swallow
}
}
protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml) throws SAXException {
String content = ExcelExtractor._extractHeaderFooter(new HeaderFooterFromString(hf));
if (content.length() > 0) {
xhtml.element("p", content);
}
}
protected void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml)
throws SAXException {
if (shapes == null) {
return;
}
//We don't currently have an obvious way to get drawings
//directly from sheetIter. Therefore, we grab the shapes and process those.
//To get the diagrams and charts, we need to get the parent drawing for each
//shape, and we need to make sure that we only process each parent shape once!
//SEE TIKA-2703 TODO: add unit test
Set<String> seenParentDrawings = new HashSet<>();
for (XSSFShape shape : shapes) {
if (shape instanceof XSSFSimpleShape) {
String sText = ((XSSFSimpleShape) shape).getText();
if (sText != null && sText.length() > 0) {
xhtml.element("p", sText);
}
extractHyperLinksFromShape(((XSSFSimpleShape) shape).getCTShape(), xhtml);
}
XSSFDrawing parentDrawing = shape.getDrawing();
if (parentDrawing != null) {
if (!seenParentDrawings
.contains(parentDrawing.getPackagePart().getPartName().toString())) {
//dump diagram data
handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
"diagram-data", parentDrawing.getPackagePart(), metadata,
new OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml),
new HashMap<>()//empty
));
//dump chart data
handleGeneralTextContainingPart(XSSFRelation.CHART.getRelation(), "chart",
parentDrawing.getPackagePart(), metadata,
new OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml),
new HashMap<>()//empty
));
}
seenParentDrawings.add(parentDrawing.getPackagePart().getPartName().toString());
}
}
}
private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml)
throws SAXException {
if (ctShape == null) {
return;
}
CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
if (nvSpPR == null) {
return;
}
CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
if (cNvPr == null) {
return;
}
CTHyperlink ctHyperlink = cNvPr.getHlinkClick();
if (ctHyperlink == null) {
return;
}
String url = drawingHyperlinks.get(ctHyperlink.getId());
if (url != null) {
xhtml.startElement("a", "href", url);
xhtml.characters(url);
xhtml.endElement("a");
}
CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover();
if (ctHoverHyperlink == null) {
return;
}
url = drawingHyperlinks.get(ctHoverHyperlink.getId());
if (url != null) {
xhtml.startElement("a", "href", url);
xhtml.characters(url);
xhtml.endElement("a");
}
}
public void processSheet(SheetContentsHandler sheetContentsHandler, Comments comments,
StylesTable styles, ReadOnlySharedStringsTable strings,
InputStream sheetInputStream) throws IOException, SAXException {
try {
XSSFSheetInterestingPartsCapturer handler = new XSSFSheetInterestingPartsCapturer(
new XSSFSheetXMLHandler(styles, comments, strings, sheetContentsHandler,
formatter, false));
XMLReaderUtils.parseSAX(sheetInputStream, handler, parseContext);
sheetInputStream.close();
if (handler.hasProtection) {
metadata.set(Office.PROTECTED_WORKSHEET, true);
}
if (handler.hasHiddenColumn) {
metadata.set(Office.HAS_HIDDEN_COLUMNS, true);
}
if (handler.hasHiddenRow) {
metadata.set(Office.HAS_HIDDEN_ROWS, true);
}
} catch (TikaException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
/**
* In Excel files, sheets have things embedded in them,
* and sheet drawings which have the images
*/
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
List<PackagePart> parts = new ArrayList<>();
for (PackagePart part : sheetParts) {
// Add the sheet
parts.add(part);
// If it has drawings, return those too
try {
for (PackageRelationship rel : part
.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
parts.add(rel.getPackage().getPart(relName));
}
}
for (PackageRelationship rel : part
.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
parts.add(rel.getPackage().getPart(relName));
}
}
} catch (InvalidFormatException e) {
throw new TikaException("Broken OOXML file", e);
}
}
//add main document so that macros can be extracted
//by AbstractOOXMLExtractor
parts.addAll(extractor.getPackage()
.getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT));
return parts;
}
/**
* Turns formatted sheet events into HTML
*/
protected static class SheetTextAsHTML implements SheetContentsHandler {
private final boolean includeHeadersFooters;
private final boolean includeMissingRows;
protected List<String> headers;
protected List<String> footers;
private XHTMLContentHandler xhtml;
private int lastSeenRow = -1;
private int lastSeenCol = -1;
protected SheetTextAsHTML(OfficeParserConfig config, XHTMLContentHandler xhtml) {
this.includeHeadersFooters = config.isIncludeHeadersAndFooters();
this.includeMissingRows = config.isIncludeMissingRows();
this.xhtml = xhtml;
headers = new ArrayList<>();
footers = new ArrayList<>();
}
public void startRow(int rowNum) {
try {
// Missing rows, if desired, with a single empty row
if (includeMissingRows && rowNum > (lastSeenRow + 1)) {
for (int rn = lastSeenRow + 1; rn < rowNum; rn++) {
xhtml.startElement("tr");
xhtml.startElement("td");
xhtml.endElement("td");
xhtml.endElement("tr");
}
}
// Start the new row
xhtml.startElement("tr");
lastSeenCol = -1;
} catch (SAXException e) {
//swallow
throw new RuntimeSAXException(e);
}
}
public void endRow(int rowNum) {
try {
xhtml.endElement("tr");
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
}
public void cell(String cellRef, String formattedValue, XSSFComment comment) {
try {
// Handle any missing cells
int colNum =
(cellRef == null) ? lastSeenCol + 1 : (new CellReference(cellRef)).getCol();
for (int cn = lastSeenCol + 1; cn < colNum; cn++) {
xhtml.startElement("td");
xhtml.endElement("td");
}
lastSeenCol = colNum;
// Start this cell
xhtml.startElement("td");
// Main cell contents
if (formattedValue != null) {
xhtml.characters(formattedValue);
}
// Comments
if (comment != null) {
xhtml.startElement("br");
xhtml.endElement("br");
xhtml.characters(comment.getAuthor());
xhtml.characters(": ");
xhtml.characters(comment.getString().getString());
}
xhtml.endElement("td");
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
}
public void headerFooter(String text, boolean isHeader, String tagName) {
if (!includeHeadersFooters) {
return;
}
if (isHeader) {
headers.add(text);
} else {
footers.add(text);
}
}
}
protected static class HeaderFooterFromString implements HeaderFooter {
private String text;
protected HeaderFooterFromString(String text) {
this.text = text;
}
public String getCenter() {
return hfHelper.getCenterSection(text);
}
public void setCenter(String paramString) {
}
public String getLeft() {
return hfHelper.getLeftSection(text);
}
public void setLeft(String paramString) {
}
public String getRight() {
return hfHelper.getRightSection(text);
}
public void setRight(String paramString) {
}
}
/**
* Captures information on interesting tags, whilst
* delegating the main work to the formatting handler
*/
protected static class XSSFSheetInterestingPartsCapturer extends DefaultHandler {
private ContentHandler delegate;
private boolean hasProtection = false;
private boolean hasHiddenRow = false;
private boolean hasHiddenColumn = false;
protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
this.delegate = delegate;
}
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException {
if ("sheetProtection".equals(qName)) {
hasProtection = true;
}
if (! hasHiddenRow && "row".equals(localName)) {
String v = atts.getValue("hidden");
if ("true".equals(v) || "1".equals(v)) {
hasHiddenRow = true;
}
}
if (! hasHiddenColumn && "col".equals(localName)) {
String v = atts.getValue("hidden");
if ("true".equals(v) || "1".equals(v)) {
hasHiddenColumn = true;
}
}
delegate.startElement(uri, localName, qName, atts);
}
public void characters(char[] ch, int start, int length) throws SAXException {
delegate.characters(ch, start, length);
}
public void endDocument() throws SAXException {
delegate.endDocument();
}
public void endElement(String uri, String localName, String qName) throws SAXException {
delegate.endElement(uri, localName, qName);
}
public void endPrefixMapping(String prefix) throws SAXException {
delegate.endPrefixMapping(prefix);
}
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
delegate.ignorableWhitespace(ch, start, length);
}
public void processingInstruction(String target, String data) throws SAXException {
delegate.processingInstruction(target, data);
}
public void setDocumentLocator(Locator locator) {
delegate.setDocumentLocator(locator);
}
public void skippedEntity(String name) throws SAXException {
delegate.skippedEntity(name);
}
public void startDocument() throws SAXException {
delegate.startDocument();
}
public void startPrefixMapping(String prefix, String uri) throws SAXException {
delegate.startPrefixMapping(prefix, uri);
}
}
private class WorkbookMetadataHandler extends DefaultHandler {
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException {
//require x15ac //http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac ???
if ("absPath".equals(localName)) {
for (int i = 0; i < atts.getLength(); i++) {
String n = atts.getLocalName(i);
if ("url".equals(n)) {
String url = atts.getValue(i);
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, url);
return;
}
}
} else if ("sheet".equals(localName)) {
String n = XMLReaderUtils.getAttrValue("name", atts);
String state = XMLReaderUtils.getAttrValue("state", atts);
if ("hidden".equals(state)) {
metadata.set(Office.HAS_HIDDEN_SHEETS, true);
metadata.add(Office.HIDDEN_SHEET_NAMES, n);
} else if ("veryHidden".equals(state)) {
metadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
metadata.set(Office.VERY_HIDDEN_SHEET_NAMES, n);
}
} else if ("workbookPr".equals(localName)) {
String codeName = XMLReaderUtils.getAttrValue("codeName", atts);
if (!StringUtils.isBlank(codeName)) {
metadata.set(Office.WORKBOOK_CODENAME, codeName);
}
}
// file version? <fileVersion appName="xl" lastEdited="7" lowestEdited="7" rupBuild="28526"/>
}
}
private static class ThreadedCommentHandler extends DefaultHandler {
private final XHTMLContentHandler xhtml;
StringBuilder sb = new StringBuilder();
boolean inText = false;
public ThreadedCommentHandler(XHTMLContentHandler xhtml) {
this.xhtml = xhtml;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
if ("text".equals(localName)) {
inText = true;
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if ("text".equals(localName)) {
xhtml.startElement("div", "class", "threaded-comment");
xhtml.startElement("p");
xhtml.characters(sb.toString());
xhtml.endElement("p");
xhtml.endElement("div");
sb.setLength(0);
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (inText) {
sb.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
if (inText) {
sb.append(ch, start, length);
}
}
}
}