// XSSFExcelExtractorDecorator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;


import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.model.Comments;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;

public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {

    // Relationship types for external data sources.
    // Looked up on the workbook part to find externalLink parts.
    private static final String EXTERNAL_LINK_RELATION =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/externalLink";
    // Looked up on the workbook part to find the connections part.
    private static final String CONNECTIONS_RELATION =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/connections";
    // Looked up on each sheet part to find query table parts.
    private static final String QUERY_TABLE_RELATION =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/queryTable";
    // Looked up on the workbook part to find pivot cache definitions.
    private static final String PIVOT_CACHE_DEFINITION_RELATION =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheDefinition";
    // Power Query stores data in customData parts; this content type is used to
    // look such parts up in the package.
    // NOTE(review): content type string not verified against the file format spec -- confirm.
    private static final String POWER_QUERY_CONTENT_TYPE =
            "application/vnd.ms-excel.customDataProperties+xml";

    /**
     * Allows access to headers/footers from raw xml strings
     */
    protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
    /** Cell value formatter; created in the constructor and handed to the sheet XML handler. */
    protected final DataFormatter formatter;
    /** Sheet parts in the order they were visited; re-used later to look up query tables. */
    protected final List<PackagePart> sheetParts = new ArrayList<>();
    /** Relationship id -> target URI for hyperlinks declared on drawing parts. */
    protected final Map<String, String> drawingHyperlinks = new HashMap<>();
    /** Metadata of the document being parsed; assigned in getXHTML. */
    protected Metadata metadata;
    /** Parse context; assigned in the constructor and again in getXHTML. */
    protected ParseContext parseContext;

    public XSSFExcelExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor,
                                       Locale locale) {
        super(context, extractor);

        this.parseContext = context;
        this.extractor = (XSSFEventBasedExcelExtractor) extractor;
        configureExtractor(this.extractor, locale);

        if (locale == null) {
            formatter = new TikaExcelDataFormatter();
        } else {
            formatter = new TikaExcelDataFormatter(locale);
        }
        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
        if (officeParserConfig != null) {
            ((TikaExcelDataFormatter) formatter)
                    .setDateFormatOverride(officeParserConfig.getDateFormatOverride());
        }
    }

    /**
     * Applies this decorator's settings to the wrapped POI extractor: text boxes follow
     * the shape-based-content setting, formula results (not formulas) are reported, and
     * the given locale is used.
     *
     * @param extractor the extractor to configure; cast to {@link XSSFEventBasedExcelExtractor}
     * @param locale    locale passed through to the extractor
     */
    protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
        XSSFEventBasedExcelExtractor eventExtractor = (XSSFEventBasedExcelExtractor) extractor;
        eventExtractor.setIncludeTextBoxes(config.isIncludeShapeBasedContent());
        eventExtractor.setFormulasNotResults(false);
        eventExtractor.setLocale(locale);
        // Because this class loads its own shared strings table, calling
        // setConcatenatePhoneticRuns() on the extractor would have no effect here.
    }

    /**
     * Remembers the metadata and parse context for the helper methods of this class,
     * initialises the protected-worksheet flag to {@code "false"} and then delegates to
     * the superclass implementation.
     */
    @Override
    public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
            throws SAXException, XmlException, IOException, TikaException {
        this.parseContext = context;
        this.metadata = metadata;

        // Default value; sheet processing switches it to true when protection is found.
        metadata.set(Office.PROTECTED_WORKSHEET, "false");

        super.getXHTML(handler, metadata, context);
    }

    /**
     * Streams every sheet of the workbook into {@code xhtml} and then collects
     * workbook-level information.
     * <p>
     * For each sheet this emits a {@code div} of class {@code sheet} containing the sheet
     * name ({@code h1}) and a table with the sheet contents, followed by header/footer
     * text, shape text (only when shape-based content is enabled) and the sheet's
     * hyperlinks. After the last sheet, workbook metadata, comment authors and external
     * data sources are processed; failures in those steps are deliberately ignored.
     *
     * @param xhtml handler receiving the generated XHTML
     * @throws SAXException if emitting XHTML or SAX-parsing a part fails
     * @throws XmlException if the POI reader, styles, sheet iterator or shared strings
     *                      cannot be set up (wraps the {@link OpenXML4JException})
     * @throws IOException  if a sheet or the workbook stream cannot be read
     * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
        OPCPackage container = extractor.getPackage();

        ReadOnlySharedStringsTable strings;
        XSSFReader.SheetIterator iter;
        XSSFReader xssfReader;
        StylesTable styles;
        try {
            xssfReader = new XSSFReader(container);
            styles = xssfReader.getStylesTable();

            iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
            strings = new ReadOnlySharedStringsTable(container, config.isConcatenatePhoneticRuns());
        } catch (OpenXML4JException e) {
            // Surface package-level failures through the exception type this method declares
            throw new XmlException(e);
        }
        while (iter.hasNext()) {
            // Collects header/footer strings while the sheet is being parsed
            SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml);
            PackagePart sheetPart = null;
            // The sheet stream is closed by try-with-resources once the table is written
            try (InputStream stream = iter.next()) {
                sheetPart = iter.getSheetPart();

                // Cache drawing hyperlink targets before shapes are processed below
                addDrawingHyperLinks(sheetPart);
                // Remembered so query tables can be looked up per sheet later on
                sheetParts.add(sheetPart);

                Comments comments = iter.getSheetComments();
                if (comments != null && comments.getNumberOfComments() > 0) {
                    metadata.set(Office.HAS_COMMENTS, true);
                }

                // Start, and output the sheet name
                xhtml.startElement("div", "class", "sheet");
                xhtml.element("h1", iter.getSheetName());

                // Extract the main sheet contents
                xhtml.startElement("table");
                xhtml.startElement("tbody");

                processSheet(sheetExtractor, comments, styles, strings, stream);
                try {
                    // Parsed while the table body is still open
                    getThreadedComments(container, sheetPart, xhtml);
                } catch (InvalidFormatException | TikaException | IOException e) {
                    // Best effort: threaded comments must not fail the sheet.
                    // SAXException is intentionally not caught here and propagates.
                }
                xhtml.endElement("tbody");
                xhtml.endElement("table");
            }

            // Output any headers and footers
            // (Need to process the sheet to get them, so we can't
            //  do the headers before the contents)
            for (String header : sheetExtractor.headers) {
                extractHeaderFooter(header, xhtml);
            }
            for (String footer : sheetExtractor.footers) {
                extractHeaderFooter(footer, xhtml);
            }

            // Do text held in shapes, if required
            if (config.isIncludeShapeBasedContent()) {
                List<XSSFShape> shapes = iter.getShapes();
                processShapes(shapes, xhtml);
            }

            //for now dump sheet hyperlinks at bottom of page
            //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
            //step 1: extract hyperlink info from bottom of page
            //step 2: process as we do now, but with cached hyperlink relationship info
            extractHyperLinks(sheetPart, xhtml);
            // All done with this sheet
            xhtml.endElement("div");
        }

        //consider adding this back to POI
        // Workbook-level metadata is read by SAX-parsing the workbook part itself
        try (InputStream wbData = xssfReader.getWorkbookData()) {
            XMLReaderUtils
                    .parseSAX(wbData, new WorkbookMetadataHandler(),
                            parseContext);
        } catch (InvalidFormatException | TikaException e) {
            // Best effort: ignored; IOException and SAXException still propagate
        }
        try {
            getPersons(container, metadata);
        } catch (InvalidFormatException | TikaException | IOException | SAXException e) {
            // Best effort: comment authors are optional
        }

        // Extract external data sources (HIGH security risk - can hide malicious URLs)
        try {
            extractExternalDataSources(container, xhtml);
        } catch (InvalidFormatException | TikaException | IOException | SAXException e) {
            // Best effort: a failure here must not fail the whole parse
        }

    }

    /**
     * Runs all external-data checks against the workbook, in this order:
     * <ol>
     *   <li>external workbook links</li>
     *   <li>data connections (database, web queries, ...)</li>
     *   <li>query tables of every sheet part visited so far</li>
     *   <li>pivot cache definitions backed by external data</li>
     *   <li>Power Query / Data Mashup parts</li>
     * </ol>
     * Returns silently when the package has no core document relationship or the
     * workbook part cannot be resolved.
     *
     * @param container the package being parsed
     * @param xhtml     handler that receives the emitted references
     */
    private void extractExternalDataSources(OPCPackage container, XHTMLContentHandler xhtml)
            throws InvalidFormatException, TikaException, IOException, SAXException {

        PackageRelationshipCollection coreDocRels =
                container.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
        PackageRelationship coreDocRel = coreDocRels.getRelationship(0);
        PackagePart workbookPart = (coreDocRel == null) ? null : container.getPart(coreDocRel);
        if (workbookPart == null) {
            return;
        }

        extractExternalLinks(workbookPart, xhtml);
        extractConnections(workbookPart, xhtml);
        for (PackagePart visitedSheet : sheetParts) {
            extractQueryTables(visitedSheet, xhtml);
        }
        extractPivotCacheExternalData(workbookPart, xhtml);
        detectPowerQuery(container);
    }

    /**
     * Parses every pivot cache definition related to the workbook part and sets
     * {@code Office.HAS_EXTERNAL_PIVOT_DATA} when a definition reports external data.
     * I/O, Tika and SAX failures for an individual definition are ignored so the
     * remaining definitions are still examined.
     *
     * @param workbookPart the workbook part whose relationships are inspected
     * @param xhtml        handed to the handler
     * @throws InvalidFormatException if the relationships or a related part cannot be resolved
     */
    private void extractPivotCacheExternalData(PackagePart workbookPart, XHTMLContentHandler xhtml)
            throws InvalidFormatException {
        PackageRelationshipCollection cacheRels =
                workbookPart.getRelationshipsByType(PIVOT_CACHE_DEFINITION_RELATION);
        if (cacheRels == null || cacheRels.isEmpty()) {
            return;
        }
        for (PackageRelationship cacheRel : cacheRels) {
            try {
                PackagePart cachePart = workbookPart.getRelatedPart(cacheRel);
                if (cachePart == null) {
                    continue;
                }
                PivotCacheHandler cacheHandler = new PivotCacheHandler(xhtml);
                try (InputStream cacheStream = cachePart.getInputStream()) {
                    XMLReaderUtils.parseSAX(cacheStream, cacheHandler, parseContext);
                }
                if (cacheHandler.hasExternalData()) {
                    metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
                }
            } catch (IOException | TikaException | SAXException e) {
                // best effort: skip this definition and continue with the next one
            }
        }
    }

    /**
     * Sets {@code Office.HAS_POWER_QUERY} when the package looks like it contains
     * Power Query / Data Mashup data. Two signals are checked: parts with the custom
     * data properties content type, and parts whose name contains
     * {@code /customData/} or {@code /dataMashup}.
     *
     * @param container the package being parsed
     */
    private void detectPowerQuery(OPCPackage container) {
        try {
            // Signal 1: parts registered under the custom data content type
            List<PackagePart> typedParts =
                    container.getPartsByContentType(POWER_QUERY_CONTENT_TYPE);
            boolean hasTypedParts = typedParts != null && !typedParts.isEmpty();
            if (hasTypedParts) {
                metadata.set(Office.HAS_POWER_QUERY, true);
            }
            // Signal 2: part names that point into the custom data / mashup area
            for (PackagePart candidate : container.getParts()) {
                String candidateName = candidate.getPartName().getName();
                boolean nameMatches = candidateName.contains("/customData/")
                        || candidateName.contains("/dataMashup");
                if (!nameMatches) {
                    continue;
                }
                metadata.set(Office.HAS_POWER_QUERY, true);
                break;
            }
        } catch (InvalidFormatException e) {
            // best effort: detection is optional
        }
    }

    /**
     * Reports the external workbook links related to the workbook part.
     * <p>
     * The presence of any external-link relationship sets
     * {@code Office.HAS_EXTERNAL_LINKS}. A relationship whose target is external is
     * emitted directly; otherwise the related part is SAX-parsed for embedded
     * references, and {@code Office.HAS_DDE_LINKS} is set if it contains a DDE link.
     * I/O and Tika failures for an individual part are ignored.
     *
     * @param workbookPart the workbook part whose relationships are inspected
     * @param xhtml        handler that receives the emitted references
     */
    private void extractExternalLinks(PackagePart workbookPart, XHTMLContentHandler xhtml)
            throws InvalidFormatException, SAXException {
        PackageRelationshipCollection linkRels =
                workbookPart.getRelationshipsByType(EXTERNAL_LINK_RELATION);
        if (linkRels == null || linkRels.isEmpty()) {
            return;
        }
        // The collection is known to be non-empty at this point
        metadata.set(Office.HAS_EXTERNAL_LINKS, true);

        for (PackageRelationship linkRel : linkRels) {
            if (linkRel.getTargetMode() == TargetMode.EXTERNAL) {
                // The relationship itself points outside the package
                emitExternalRef(xhtml, "externalLink", linkRel.getTargetURI().toString());
                continue;
            }
            // Internal part describing the external reference: parse it
            try {
                PackagePart linkPart = workbookPart.getRelatedPart(linkRel);
                if (linkPart == null) {
                    continue;
                }
                ExternalLinkHandler linkHandler = new ExternalLinkHandler(xhtml);
                try (InputStream linkStream = linkPart.getInputStream()) {
                    XMLReaderUtils.parseSAX(linkStream, linkHandler, parseContext);
                }
                if (linkHandler.hasDdeLink()) {
                    metadata.set(Office.HAS_DDE_LINKS, true);
                }
            } catch (IOException | TikaException e) {
                // best effort: skip this part and continue with the next one
            }
        }
    }

    /**
     * SAX-parses every connections part related to the workbook part, emitting the
     * references it finds and setting {@code Office.HAS_DATA_CONNECTIONS} and
     * {@code Office.HAS_WEB_QUERIES} accordingly. I/O and Tika failures for an
     * individual part are ignored.
     *
     * @param workbookPart the workbook part whose relationships are inspected
     * @param xhtml        handler that receives the emitted references
     */
    private void extractConnections(PackagePart workbookPart, XHTMLContentHandler xhtml)
            throws InvalidFormatException, SAXException {
        PackageRelationshipCollection connectionRels =
                workbookPart.getRelationshipsByType(CONNECTIONS_RELATION);
        if (connectionRels == null || connectionRels.isEmpty()) {
            return;
        }
        for (PackageRelationship connectionRel : connectionRels) {
            try {
                PackagePart connectionsPart = workbookPart.getRelatedPart(connectionRel);
                if (connectionsPart == null) {
                    continue;
                }
                ConnectionsHandler connectionsHandler = new ConnectionsHandler(xhtml);
                try (InputStream connectionsStream = connectionsPart.getInputStream()) {
                    XMLReaderUtils.parseSAX(connectionsStream, connectionsHandler, parseContext);
                }
                if (connectionsHandler.hasConnections()) {
                    metadata.set(Office.HAS_DATA_CONNECTIONS, true);
                }
                if (connectionsHandler.hasWebQueries()) {
                    metadata.set(Office.HAS_WEB_QUERIES, true);
                }
            } catch (IOException | TikaException e) {
                // best effort: skip this part and continue with the next one
            }
        }
    }

    /**
     * SAX-parses every query table part related to the given sheet part. I/O and Tika
     * failures for an individual part are ignored.
     *
     * @param sheetPart the sheet part whose relationships are inspected
     * @param xhtml     handed to the handler
     */
    private void extractQueryTables(PackagePart sheetPart, XHTMLContentHandler xhtml)
            throws InvalidFormatException, SAXException {
        PackageRelationshipCollection queryTableRels =
                sheetPart.getRelationshipsByType(QUERY_TABLE_RELATION);
        if (queryTableRels == null || queryTableRels.isEmpty()) {
            return;
        }
        for (PackageRelationship queryTableRel : queryTableRels) {
            try {
                PackagePart queryTablePart = sheetPart.getRelatedPart(queryTableRel);
                if (queryTablePart == null) {
                    continue;
                }
                QueryTableHandler queryTableHandler = new QueryTableHandler(xhtml);
                try (InputStream queryTableStream = queryTablePart.getInputStream()) {
                    XMLReaderUtils.parseSAX(queryTableStream, queryTableHandler, parseContext);
                }
            } catch (IOException | TikaException e) {
                // best effort: skip this part and continue with the next one
            }
        }
    }

    /**
     * Writes an empty anchor element describing an external reference:
     * {@code class} is {@code "external-ref-" + refType} and {@code href} is the given
     * url. Nothing is written when {@code url} is {@code null} or empty.
     *
     * @param xhtml   handler to write to
     * @param refType suffix for the anchor's class attribute
     * @param url     value for the anchor's href attribute
     * @throws SAXException if the handler fails
     */
    private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
            throws SAXException {
        boolean hasUrl = url != null && !url.isEmpty();
        if (!hasUrl) {
            return;
        }
        org.xml.sax.helpers.AttributesImpl anchorAttrs = new org.xml.sax.helpers.AttributesImpl();
        anchorAttrs.addAttribute("", "class", "class", "CDATA", "external-ref-" + refType);
        anchorAttrs.addAttribute("", "href", "href", "CDATA", url);
        // The anchor carries no text; the reference lives in its attributes
        xhtml.startElement("a", anchorAttrs);
        xhtml.endElement("a");
    }

    /**
     * SAX handler for an externalLink part. It emits a reference for {@code file}
     * elements carrying an {@code href}, for {@code oleLink} elements carrying a
     * relationship id, and for {@code ddeLink} elements, and records whether any DDE
     * link was seen. {@code externalBook} elements are currently not reported: their
     * target lives in a relationship that this handler does not resolve.
     */
    private class ExternalLinkHandler extends DefaultHandler {
        private final XHTMLContentHandler xhtml;
        private boolean foundDdeLink = false;

        ExternalLinkHandler(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            if ("file".equals(localName)) {
                // file element with href attribute (older format)
                String href = atts.getValue("href");
                if (href != null && !href.isEmpty()) {
                    emitExternalRef(xhtml, "externalWorkbook", href);
                }
            } else if ("oleLink".equals(localName)) {
                // OLE link to an external file; only the relationship id is reported
                String relationshipId = atts.getValue(
                        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                        "id");
                if (relationshipId != null) {
                    emitExternalRef(xhtml, "oleLink", "relationship:" + relationshipId);
                }
            } else if ("ddeLink".equals(localName)) {
                // DDE links - security risk: can execute commands
                foundDdeLink = true;
                String service = atts.getValue("ddeService");
                String topic = atts.getValue("ddeTopic");
                if (service != null || topic != null) {
                    // Reported as "service|topic", with a missing side left empty
                    String ddeRef = (service != null ? service : "") + "|" +
                            (topic != null ? topic : "");
                    emitExternalRef(xhtml, "ddeLink", ddeRef);
                }
            }
        }

        /** @return {@code true} once a {@code ddeLink} element has been seen */
        boolean hasDdeLink() {
            return foundDdeLink;
        }
    }

    /**
     * SAX handler for a connections part. It emits a reference for the connection
     * string, URL or source file found on {@code dbPr}, {@code webPr}, {@code olapPr}
     * and {@code textPr} elements, and records whether any {@code connection} or
     * {@code webPr} element was seen.
     */
    private class ConnectionsHandler extends DefaultHandler {
        private final XHTMLContentHandler xhtml;
        private boolean foundConnection = false;
        private boolean foundWebQuery = false;

        ConnectionsHandler(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            if ("connection".equals(localName)) {
                foundConnection = true;
            } else if ("dbPr".equals(localName)) {
                // Database connection string
                emitAttribute(atts, "connection", "dbConnection");
            } else if ("webPr".equals(localName)) {
                // Web query; flagged even when it carries no url
                foundWebQuery = true;
                emitAttribute(atts, "url", "webQuery");
            } else if ("olapPr".equals(localName)) {
                // OLAP connection
                emitAttribute(atts, "connection", "olapConnection");
            } else if ("textPr".equals(localName)) {
                // Text file import
                emitAttribute(atts, "sourceFile", "textFileImport");
            }
        }

        /** Emits the named attribute as an external reference when it is present and non-empty. */
        private void emitAttribute(Attributes atts, String attributeName, String refType)
                throws SAXException {
            String value = atts.getValue(attributeName);
            if (value != null && !value.isEmpty()) {
                emitExternalRef(xhtml, refType, value);
            }
        }

        /** @return {@code true} once a {@code connection} element has been seen */
        boolean hasConnections() {
            return foundConnection;
        }

        /** @return {@code true} once a {@code webPr} element has been seen */
        boolean hasWebQueries() {
            return foundWebQuery;
        }
    }

    /**
     * SAX handler for a queryTable part. It currently recognises the
     * {@code queryTable} and {@code queryTableRefresh} elements but extracts nothing
     * from them; the part is only parsed.
     */
    private class QueryTableHandler extends DefaultHandler {
        private final XHTMLContentHandler xhtml;

        QueryTableHandler(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            if ("queryTable".equals(localName)) {
                // The connectionId attribute refers to an entry in connections.xml,
                // where the connection details are stored; nothing is reported here.
            } else if ("queryTableRefresh".equals(localName)) {
                // Web query table refresh: contains refresh settings only
            }
        }
    }

    /**
     * SAX handler for a pivotCacheDefinition part. It flags external data when it sees
     * a {@code cacheSource} of type {@code external} or {@code consolidation}, a
     * {@code worksheetSource} carrying a relationship id, or a {@code consolidation} /
     * {@code rangeSets} element. Nothing is written to the XHTML handler.
     */
    private class PivotCacheHandler extends DefaultHandler {
        // Kept for constructor symmetry with the other handlers; not written to.
        private final XHTMLContentHandler xhtml;
        private boolean hasExternalData = false;

        PivotCacheHandler(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            if ("cacheSource".equals(localName)) {
                // cacheSource with type="external" indicates external data
                String sourceType = atts.getValue("type");
                if ("external".equals(sourceType) || "consolidation".equals(sourceType)) {
                    hasExternalData = true;
                }
            } else if ("worksheetSource".equals(localName)) {
                // A relationship id here likely points to an external workbook
                String relationshipId = atts.getValue(
                        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                        "id");
                if (relationshipId != null) {
                    hasExternalData = true;
                }
            } else if ("consolidation".equals(localName) || "rangeSets".equals(localName)) {
                // consolidation source (multiple ranges, possibly external)
                hasExternalData = true;
            }
        }

        /** @return {@code true} once any of the external-data indicators has been seen */
        boolean hasExternalData() {
            return hasExternalData;
        }
    }

    /**
     * SAX-parses every threaded-comment part related to the given sheet part, using a
     * handler that is given {@code xhtml}. Relationships whose part cannot be resolved
     * are skipped.
     *
     * @param container the package being parsed (not used by this method)
     * @param sheetPart the sheet part whose relationships are inspected
     * @param xhtml     handed to the comment handler
     */
    private void getThreadedComments(OPCPackage container, PackagePart sheetPart,
                                     XHTMLContentHandler xhtml)
            throws TikaException, InvalidFormatException, SAXException, IOException {
        //consider caching the person id -> person names in getPersons and injecting that
        //into the xhtml per comment?
        PackageRelationshipCollection commentRels =
                sheetPart.getRelationshipsByType(OPCPackageWrapper.THREADED_COMMENT_RELATION);
        if (commentRels == null || commentRels.isEmpty()) {
            return;
        }
        for (PackageRelationship commentRel : commentRels) {
            PackagePart commentPart = sheetPart.getRelatedPart(commentRel);
            if (commentPart != null) {
                try (InputStream commentStream = commentPart.getInputStream()) {
                    XMLReaderUtils.parseSAX(commentStream, new ThreadedCommentHandler(xhtml),
                            parseContext);
                }
            }
        }
    }

    /**
     * SAX-parses every persons part related to the workbook part with a handler that
     * is given {@code metadata}. Returns silently when the core document relationship,
     * the workbook part or the person relationships are missing.
     *
     * @param container the package being parsed
     * @param metadata  handed to the person handler
     */
    private void getPersons(OPCPackage container, Metadata metadata)
            throws TikaException, InvalidFormatException, IOException, SAXException {
        PackageRelationshipCollection coreDocRels =
                container.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
        PackageRelationship coreDocRel = coreDocRels.getRelationship(0);
        // The workbook part is the target of the core document relationship
        PackagePart workbookPart = (coreDocRel == null) ? null : container.getPart(coreDocRel);
        if (workbookPart == null) {
            return;
        }
        PackageRelationshipCollection personRels =
                workbookPart.getRelationshipsByType(OPCPackageWrapper.PERSON_RELATION);
        if (personRels == null) {
            return;
        }
        for (PackageRelationship personRel : personRels) {
            PackagePart personsPart = workbookPart.getRelatedPart(personRel);
            if (personsPart != null) {
                try (InputStream personsStream = personsPart.getInputStream()) {
                    XMLReaderUtils.parseSAX(personsStream, new CommentPersonHandler(metadata),
                            parseContext);
                }
            }
        }
    }

    /**
     * Caches the hyperlink targets declared on the drawing parts of a sheet. For every
     * internal drawing relationship of {@code sheetPart}, the hyperlink relationships
     * of the drawing part are stored in {@code drawingHyperlinks} as
     * relationship id -> target URI.
     *
     * @param sheetPart the sheet part whose drawings are inspected
     */
    protected void addDrawingHyperLinks(PackagePart sheetPart) {
        try {
            for (PackageRelationship drawingRel : sheetPart
                    .getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
                if (drawingRel.getTargetMode() != TargetMode.INTERNAL) {
                    continue;
                }
                PackagePartName drawingName =
                        PackagingURIHelper.createPartName(drawingRel.getTargetURI());
                PackagePart drawingPart = drawingRel.getPackage().getPart(drawingName);
                //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
                if (drawingPart == null) {
                    continue;
                }
                for (PackageRelationship linkRel : drawingPart
                        .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                    drawingHyperlinks.put(linkRel.getId(), linkRel.getTargetURI().toString());
                }
            }
        } catch (InvalidFormatException e) {
            //swallow: a failure while collecting drawing hyperlinks
            //should not cause a parse failure
        }

    }


    /**
     * Writes one anchor per hyperlink relationship of the sheet part; the target URI
     * is used both as the {@code href} and as the anchor text. An
     * {@link InvalidFormatException} while reading the relationships is ignored.
     *
     * @param sheetPart the sheet part whose hyperlink relationships are written
     * @param xhtml     handler to write to
     * @throws SAXException if the handler fails
     */
    protected void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml)
            throws SAXException {
        try {
            for (PackageRelationship linkRel : sheetPart
                    .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                String target = linkRel.getTargetURI().toString();
                xhtml.startElement("a", "href", target);
                xhtml.characters(target);
                xhtml.endElement("a");
            }
        } catch (InvalidFormatException e) {
            //swallow: hyperlinks are optional
        }
    }

    /**
     * Converts a raw header or footer string to text and, when the result is
     * non-empty, writes it as a paragraph.
     *
     * @param hf    raw header/footer string
     * @param xhtml handler to write to
     * @throws SAXException if the handler fails
     */
    protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml) throws SAXException {
        HeaderFooterFromString rawHeaderFooter = new HeaderFooterFromString(hf);
        String text = ExcelExtractor._extractHeaderFooter(rawHeaderFooter);
        if (!text.isEmpty()) {
            xhtml.element("p", text);
        }
    }

    /**
     * Writes the text and hyperlinks of simple shapes, and dumps the diagram and chart
     * data of each shape's parent drawing. A parent drawing is handled only the first
     * time it is encountered, identified by the name of its package part.
     *
     * @param shapes the shapes of one sheet; {@code null} is treated as "no shapes"
     * @param xhtml  handler to write to
     * @throws SAXException if the handler fails
     */
    protected void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml)
            throws SAXException {
        if (shapes == null) {
            return;
        }
        //We don't currently have an obvious way to get drawings
        //directly from sheetIter. Therefore, we grab the shapes and process those.
        //To get the diagrams and charts, we need to get the parent drawing for each
        //shape, and we need to make sure that we only process each parent shape once!
        //SEE TIKA-2703 TODO: add unit test
        Set<String> seenParentDrawings = new HashSet<>();
        for (XSSFShape shape : shapes) {
            if (shape instanceof XSSFSimpleShape) {
                XSSFSimpleShape simpleShape = (XSSFSimpleShape) shape;
                String shapeText = simpleShape.getText();
                if (shapeText != null && !shapeText.isEmpty()) {
                    xhtml.element("p", shapeText);
                }
                extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
            }

            XSSFDrawing parentDrawing = shape.getDrawing();
            if (parentDrawing == null) {
                continue;
            }
            PackagePart drawingPart = parentDrawing.getPackagePart();
            // add() returns false when this drawing was already handled for an earlier shape
            if (!seenParentDrawings.add(drawingPart.getPartName().toString())) {
                continue;
            }
            //dump diagram data
            handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
                    "diagram-data", drawingPart, metadata,
                    new OOXMLWordAndPowerPointTextHandler(
                            new OOXMLTikaBodyPartHandler(xhtml),
                            new HashMap<>()//empty
                    ));
            //dump chart data
            handleGeneralTextContainingPart(XSSFRelation.CHART.getRelation(), "chart",
                    drawingPart, metadata,
                    new OOXMLWordAndPowerPointTextHandler(
                            new OOXMLTikaBodyPartHandler(xhtml),
                            new HashMap<>()//empty
                    ));
        }
    }

    /**
     * Writes the click and hover hyperlinks attached to a shape, if any.
     * <p>
     * The shape's non-visual drawing properties may carry a click hyperlink, a hover
     * hyperlink, both or neither. Each one is handled independently: its relationship
     * id is looked up in {@code drawingHyperlinks} and, when a target is known, an
     * anchor with the target as both {@code href} and text is written. The click
     * hyperlink is written before the hover hyperlink.
     * <p>
     * Previously a missing click hyperlink ended the method early, so a shape that had
     * only a hover hyperlink lost its link.
     *
     * @param ctShape the shape's XML bean; {@code null} is ignored
     * @param xhtml   handler to write to
     * @throws SAXException if the handler fails
     */
    private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml)
            throws SAXException {

        if (ctShape == null) {
            return;
        }

        CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
        if (nvSpPR == null) {
            return;
        }

        CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
        if (cNvPr == null) {
            return;
        }

        // Click first, then hover; either may be absent without affecting the other.
        CTHyperlink[] shapeHyperlinks = {cNvPr.getHlinkClick(), cNvPr.getHlinkHover()};
        for (CTHyperlink shapeHyperlink : shapeHyperlinks) {
            if (shapeHyperlink == null) {
                continue;
            }
            // Targets were cached per relationship id by addDrawingHyperLinks
            String url = drawingHyperlinks.get(shapeHyperlink.getId());
            if (url != null) {
                xhtml.startElement("a", "href", url);
                xhtml.characters(url);
                xhtml.endElement("a");
            }
        }

    }

    /**
     * Parses the XML of a single sheet, forwarding sheet events to the given
     * {@code sheetContentsHandler}, and records in {@code metadata} whether the
     * sheet is protected or contains hidden rows or hidden columns.
     * <p>
     * The supplied stream is closed before this method returns, whether parsing
     * succeeds or fails.
     *
     * @param sheetContentsHandler receiver of the row/cell events
     * @param comments             comments passed to the sheet XML handler
     * @param styles               styles passed to the sheet XML handler
     * @param strings              shared strings passed to the sheet XML handler
     * @param sheetInputStream     the sheet XML; closed by this method
     * @throws IOException      if reading or closing the stream fails
     * @throws SAXException     if the sheet XML cannot be parsed
     * @throws RuntimeException wrapping a {@link TikaException} raised by the parser utilities
     */
    public void processSheet(SheetContentsHandler sheetContentsHandler, Comments comments,
                             StylesTable styles, ReadOnlySharedStringsTable strings,
                             InputStream sheetInputStream) throws IOException, SAXException {
        // try-with-resources: the stream is released on the failure paths too
        try (InputStream in = sheetInputStream) {

            XSSFSheetInterestingPartsCapturer handler = new XSSFSheetInterestingPartsCapturer(
                    new XSSFSheetXMLHandler(styles, comments, strings, sheetContentsHandler,
                            formatter, false));
            XMLReaderUtils.parseSAX(in, handler, parseContext);

            if (handler.hasProtection) {
                metadata.set(Office.PROTECTED_WORKSHEET, true);
            }
            if (handler.hasHiddenColumn) {
                metadata.set(Office.HAS_HIDDEN_COLUMNS, true);
            }
            if (handler.hasHiddenRow) {
                metadata.set(Office.HAS_HIDDEN_ROWS, true);
            }
        } catch (TikaException e) {
            // keep the original exception as the cause so the stack trace is not lost
            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
        }
    }

    /**
     * Collects the parts that make up the main document of a workbook: every
     * sheet part, followed by the internal targets of that sheet's drawing and
     * VML drawing relationships, and finally the core document part(s) of the
     * package so that macros can be extracted by AbstractOOXMLExtractor.
     *
     * @return the sheet, drawing and core document parts, in that order per sheet
     * @throws TikaException if a relationship or part name in the package is invalid
     */
    @Override
    protected List<PackagePart> getMainDocumentParts() throws TikaException {
        List<PackagePart> collected = new ArrayList<>();

        for (PackagePart sheetPart : sheetParts) {
            // the sheet itself always comes first
            collected.add(sheetPart);

            try {
                // regular drawings first, then VML drawings
                XSSFRelation[] drawingKinds = {XSSFRelation.DRAWINGS, XSSFRelation.VML_DRAWINGS};
                for (XSSFRelation drawingKind : drawingKinds) {
                    PackageRelationshipCollection relationships =
                            sheetPart.getRelationshipsByType(drawingKind.getRelation());
                    for (PackageRelationship relationship : relationships) {
                        if (relationship.getTargetMode() != TargetMode.INTERNAL) {
                            // only parts that live inside the package can be fetched
                            continue;
                        }
                        PackagePartName targetName =
                                PackagingURIHelper.createPartName(relationship.getTargetURI());
                        collected.add(relationship.getPackage().getPart(targetName));
                    }
                }
            } catch (InvalidFormatException e) {
                throw new TikaException("Broken OOXML file", e);
            }
        }

        // the main document is appended last so that macros can be
        // picked up by AbstractOOXMLExtractor
        OPCPackage opcPackage = extractor.getPackage();
        collected.addAll(
                opcPackage.getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT));

        return collected;
    }

    /**
     * Turns formatted sheet events into HTML
     */
    protected static class SheetTextAsHTML implements SheetContentsHandler {
        private final boolean includeHeadersFooters;
        private final boolean includeMissingRows;
        protected List<String> headers;
        protected List<String> footers;
        private XHTMLContentHandler xhtml;
        private int lastSeenRow = -1;
        private int lastSeenCol = -1;

        protected SheetTextAsHTML(OfficeParserConfig config, XHTMLContentHandler xhtml) {
            this.includeHeadersFooters = config.isIncludeHeadersAndFooters();
            this.includeMissingRows = config.isIncludeMissingRows();
            this.xhtml = xhtml;
            headers = new ArrayList<>();
            footers = new ArrayList<>();
        }

        public void startRow(int rowNum) {
            try {
                // Missing rows, if desired, with a single empty row
                if (includeMissingRows && rowNum > (lastSeenRow + 1)) {
                    for (int rn = lastSeenRow + 1; rn < rowNum; rn++) {
                        xhtml.startElement("tr");
                        xhtml.startElement("td");
                        xhtml.endElement("td");
                        xhtml.endElement("tr");
                    }
                }

                // Start the new row
                xhtml.startElement("tr");
                lastSeenCol = -1;
            } catch (SAXException e) {
                //swallow
                throw new RuntimeSAXException(e);
            }

        }

        public void endRow(int rowNum) {
            try {
                xhtml.endElement("tr");
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
        }

        public void cell(String cellRef, String formattedValue, XSSFComment comment) {
            try {
                // Handle any missing cells
                int colNum =
                        (cellRef == null) ? lastSeenCol + 1 : (new CellReference(cellRef)).getCol();
                for (int cn = lastSeenCol + 1; cn < colNum; cn++) {
                    xhtml.startElement("td");
                    xhtml.endElement("td");
                }
                lastSeenCol = colNum;

                // Start this cell
                xhtml.startElement("td");

                // Main cell contents
                if (formattedValue != null) {
                    xhtml.characters(formattedValue);
                }

                // Comments
                if (comment != null) {
                    xhtml.startElement("br");
                    xhtml.endElement("br");
                    xhtml.characters(comment.getAuthor());
                    xhtml.characters(": ");
                    xhtml.characters(comment.getString().getString());
                }

                xhtml.endElement("td");
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
        }

        public void headerFooter(String text, boolean isHeader, String tagName) {
            if (!includeHeadersFooters) {
                return;
            }
            if (isHeader) {
                headers.add(text);
            } else {
                footers.add(text);
            }
        }
    }

    protected static class HeaderFooterFromString implements HeaderFooter {
        private String text;

        protected HeaderFooterFromString(String text) {
            this.text = text;
        }

        public String getCenter() {
            return hfHelper.getCenterSection(text);
        }

        public void setCenter(String paramString) {
        }

        public String getLeft() {
            return hfHelper.getLeftSection(text);
        }

        public void setLeft(String paramString) {
        }

        public String getRight() {
            return hfHelper.getRightSection(text);
        }

        public void setRight(String paramString) {
        }
    }

    /**
     * Captures information on interesting tags, whilst
     * delegating the main work to the formatting handler.
     * <p>
     * While every SAX event is forwarded to the delegate, this handler notes
     * whether a {@code sheetProtection} element was seen and whether any
     * {@code row} or {@code col} element carried a {@code hidden} attribute of
     * {@code "true"} or {@code "1"}.
     */
    protected static class XSSFSheetInterestingPartsCapturer extends DefaultHandler {
        private ContentHandler delegate;
        private boolean hasProtection = false;
        private boolean hasHiddenRow = false;
        private boolean hasHiddenColumn = false;

        protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
            this.delegate = delegate;
        }

        /**
         * Tells whether the attributes carry {@code hidden="true"} or {@code hidden="1"}.
         */
        private static boolean isHidden(Attributes atts) {
            String v = atts.getValue("hidden");
            return "true".equals(v) || "1".equals(v);
        }

        /**
         * Updates the captured flags from the element and then forwards the
         * event to the delegate.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            // match on the local name as well, like row/col below, so that a
            // prefixed element (e.g. x:sheetProtection) is recognised too
            if ("sheetProtection".equals(localName) || "sheetProtection".equals(qName)) {
                hasProtection = true;
            }
            // once a flag is set there is no need to inspect further rows/columns
            if (!hasHiddenRow && "row".equals(localName) && isHidden(atts)) {
                hasHiddenRow = true;
            }
            if (!hasHiddenColumn && "col".equals(localName) && isHidden(atts)) {
                hasHiddenColumn = true;
            }
            delegate.startElement(uri, localName, qName, atts);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            delegate.characters(ch, start, length);
        }

        @Override
        public void endDocument() throws SAXException {
            delegate.endDocument();
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            delegate.endElement(uri, localName, qName);
        }

        @Override
        public void endPrefixMapping(String prefix) throws SAXException {
            delegate.endPrefixMapping(prefix);
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            delegate.ignorableWhitespace(ch, start, length);
        }

        @Override
        public void processingInstruction(String target, String data) throws SAXException {
            delegate.processingInstruction(target, data);
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            delegate.setDocumentLocator(locator);
        }

        @Override
        public void skippedEntity(String name) throws SAXException {
            delegate.skippedEntity(name);
        }

        @Override
        public void startDocument() throws SAXException {
            delegate.startDocument();
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) throws SAXException {
            delegate.startPrefixMapping(prefix, uri);
        }
    }

    /**
     * SAX handler for the workbook part that copies selected workbook-level
     * facts into the enclosing extractor's {@code metadata}: the {@code url}
     * attribute of {@code absPath}, the names of hidden and very hidden sheets,
     * and the workbook code name.
     */
    private class WorkbookMetadataHandler extends DefaultHandler {
        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            //require x15ac //http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac ???
            if ("absPath".equals(localName)) {
                // look the attribute up by local name, whatever its prefix
                for (int i = 0; i < atts.getLength(); i++) {
                    String n = atts.getLocalName(i);
                    if ("url".equals(n)) {
                        String url = atts.getValue(i);
                        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, url);
                        return;
                    }
                }
            } else if ("sheet".equals(localName)) {
                String n = XMLReaderUtils.getAttrValue("name", atts);
                String state = XMLReaderUtils.getAttrValue("state", atts);
                if ("hidden".equals(state)) {
                    metadata.set(Office.HAS_HIDDEN_SHEETS, true);
                    metadata.add(Office.HIDDEN_SHEET_NAMES, n);
                } else if ("veryHidden".equals(state)) {
                    metadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
                    // add, not set: a workbook may contain several very hidden
                    // sheets and each name must be kept, as for hidden sheets
                    metadata.add(Office.VERY_HIDDEN_SHEET_NAMES, n);
                }
            } else if ("workbookPr".equals(localName)) {
                String codeName = XMLReaderUtils.getAttrValue("codeName", atts);
                if (!StringUtils.isBlank(codeName)) {
                    metadata.set(Office.WORKBOOK_CODENAME, codeName);
                }
            }
            // file version? <fileVersion appName="xl" lastEdited="7" lowestEdited="7" rupBuild="28526"/>
        }
    }

    /**
     * SAX handler that writes the content of every {@code text} element to the
     * XHTML output as {@code <div class="threaded-comment"><p>...</p></div>}.
     * Character data outside of {@code text} elements is ignored.
     */
    private static class ThreadedCommentHandler extends DefaultHandler {
        private final XHTMLContentHandler xhtml;
        // accumulates the character data of the text element currently open
        StringBuilder sb = new StringBuilder();
        // true only between the start and the end of a text element
        boolean inText = false;
        public ThreadedCommentHandler(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
            if ("text".equals(localName)) {
                inText = true;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if ("text".equals(localName)) {
                // stop collecting, otherwise whatever follows this element would
                // be prepended to the next comment
                inText = false;
                xhtml.startElement("div", "class", "threaded-comment");
                xhtml.startElement("p");
                xhtml.characters(sb.toString());
                xhtml.endElement("p");
                xhtml.endElement("div");
                sb.setLength(0);
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (inText) {
                sb.append(ch, start, length);
            }
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            if (inText) {
                sb.append(ch, start, length);
            }
        }
    }
}