ExcelExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.awt.Point;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.extractor.OldExcelExtractor;
import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
import org.apache.poi.hssf.record.ColumnInfoRecord;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FooterRecord;
import org.apache.poi.hssf.record.FormatRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.HeaderRecord;
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.ProtectRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.RowRecord;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.record.TextObjectRecord;
import org.apache.poi.hssf.record.chart.SeriesTextRecord;
import org.apache.poi.hssf.record.common.UnicodeString;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.DateUtil;
import org.xml.sax.SAXException;

import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;

/**
 * Excel parser implementation which uses POI's Event API
 * to handle the contents of a Workbook.
 * <p/>
 * The Event API uses a much smaller memory footprint than
 * <code>HSSFWorkbook</code> when processing excel files
 * but at the cost of more complexity.
 * <p/>
 * With the Event API a <i>listener</i> is registered for
 * specific record types and those records are created,
 * fired off to the listener and then discarded as the stream
 * is being processed.
 *
 * @see org.apache.poi.hssf.eventusermodel.HSSFListener
 * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
 * POI Event API How To</a>
 */
public class ExcelExtractor extends AbstractPOIFSExtractor {

    private static final String BOOK_ENTRY = "Book";
    /**
     * <code>true</code> if the HSSFListener should be registered
     * to listen for all records or <code>false</code> (the default)
     * if the listener should be configured to only receive specified
     * records.
     */
    private boolean listenForAllRecords = false;

    public ExcelExtractor(ParseContext context, Metadata metadata) {
        super(context, metadata);
    }

    /**
     * Looks for one of the variant names for the workbook entry;
     * returns null if not found.
     *
     * @param root directory root to search
     * @return workbook entry or null
     */
    private static String findWorkbookEntry(DirectoryNode root) {
        for (String workbookDirEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
            if (root.hasEntry(workbookDirEntryName)) {
                return workbookDirEntryName;
            }
        }
        return null;
    }

    /**
     * Returns <code>true</code> if this parser is configured to listen
     * for all records instead of just the specified few.
     */
    public boolean isListenForAllRecords() {
        return listenForAllRecords;
    }

    /**
     * Specifies whether this parser should to listen for all
     * records or just for the specified few.
     * <p/>
     * <strong>Note:</strong> Under normal operation this setting should
     * be <code>false</code> (the default), but you can experiment with
     * this setting for testing and debugging purposes.
     *
     * @param listenForAllRecords <code>true</code> if the HSSFListener
     *                            should be registered to listen for all records or
     *                            <code>false</code>
     *                            if the listener should be configured to only receive specified
     *                            records.
     */
    public void setListenForAllRecords(boolean listenForAllRecords) {
        this.listenForAllRecords = listenForAllRecords;
    }

    /**
     * Extracts text from an Excel Workbook writing the extracted content
     * to the specified {@link Appendable}.
     *
     * @param filesystem POI file system
     * @throws IOException if an error occurs processing the workbook
     *                     or writing the extracted content
     */
    protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml, Locale locale)
            throws IOException, SAXException, TikaException {
        parse(filesystem.getRoot(), xhtml, locale);
    }

    protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale)
            throws IOException, SAXException, TikaException {
        String workbookEntryName = findWorkbookEntry(root);
        if (workbookEntryName == null) {
            if (root.hasEntry(BOOK_ENTRY)) {
                // Excel 5 / Excel 95 file
                // Records are in a different structure so needs a
                //  different parser to process them
                OldExcelExtractor extractor = new OldExcelExtractor(root);
                OldExcelParser.parse(extractor, xhtml);
                return;
            } else {
                // Corrupt file / very old file
                throw new TikaException("Couldn't find workbook entry");
            }
        }

        // If a password was supplied, use it, otherwise the default
        Biff8EncryptionKey.setCurrentUserPassword(getPassword());

        // Have the file processed in event mode
        TikaHSSFListener listener =
                new TikaHSSFListener(workbookEntryName, xhtml, locale, this, officeParserConfig);
        listener.processFile(root, isListenForAllRecords());
        listener.throwStoredException();
        updateMetadata(listener);

        for (Entry entry : root) {
            if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
                try {
                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true);
                } catch (TikaException e) {
                    // ignore parse errors from embedded documents
                }
            }
        }
    }

    private void updateMetadata(TikaHSSFListener listener) {
        if (listener.hasProtectedSheet) {
            parentMetadata.set(Office.PROTECTED_WORKSHEET, true);
        }
        if (listener.hasHiddenColumn) {
            parentMetadata.set(Office.HAS_HIDDEN_COLUMNS, true);
        }
        if (listener.hasHiddenRow) {
            parentMetadata.set(Office.HAS_HIDDEN_ROWS, true);
        }
        if (! listener.commentAuthors.isEmpty()) {
            for (String author : listener.commentAuthors) {
                parentMetadata.add(Office.COMMENT_PERSONS, author);
            }
            parentMetadata.set(Office.HAS_COMMENTS, true);
        }
        if (! listener.hiddenSheets.isEmpty()) {
            for (String sheetName : listener.hiddenSheets) {
                parentMetadata.add(Office.HIDDEN_SHEET_NAMES, sheetName);
            }
            parentMetadata.set(Office.HAS_HIDDEN_SHEETS, true);
        }
        if (! listener.veryHiddenSheets.isEmpty()) {
            for (String sheetName : listener.veryHiddenSheets) {
                parentMetadata.add(Office.VERY_HIDDEN_SHEET_NAMES, sheetName);
            }
            parentMetadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
        }
    }

    // ======================================================================

    /**
     * HSSF Listener implementation which processes the HSSF records.
     */
    private static class TikaHSSFListener implements HSSFListener {

        /**
         * XHTML content handler to which the document content is rendered.
         */
        private final XHTMLContentHandler handler;

        /**
         * The POIFS Extractor, used for embeded resources.
         */
        private final AbstractPOIFSExtractor extractor;
        /**
         * Format for rendering numbers in the worksheet. Currently we just
         * use the platform default formatting.
         *
         * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
         */
        private final NumberFormat format;

        private final OfficeParserConfig officeParserConfig;
        private final TikaExcelDataFormatter tikaExcelDataFormatter;
        private final String workbookEntryName;
        /**
         * Potential exception thrown by the content handler. When set to
         * non-<code>null</code>, causes all subsequent HSSF records to be
         * ignored and the stored exception to be thrown when
         * {@link #throwStoredException()} is invoked.
         */
        private Exception exception = null;
        private SSTRecord sstRecord;
        private FormulaRecord stringFormulaRecord;
        private short previousSid;
        /**
         * Internal <code>FormatTrackingHSSFListener</code> to handle cell
         * formatting within the extraction.
         */
        private FormatTrackingHSSFListener formatListener;
        /**
         * List of worksheet names.
         */
        private List<String> sheetNames = new ArrayList<>();
        /**
         * Index of the current worksheet within the workbook.
         * Used to find the worksheet name in the {@link #sheetNames} list.
         */
        private short currentSheetIndex;
        /**
         * Content of the current worksheet, or <code>null</code> if no
         * worksheet is currently active.
         */
        private SortedMap<Point, Cell> currentSheet = null;
        /**
         * Extra text or cells that crops up, typically as part of a
         * worksheet but not always.
         */
        private List<Cell> extraTextCells = new ArrayList<>();
        /**
         * These aren't complete when we first see them, as the
         * depend on continue records that aren't always
         * contiguous. Collect them for later processing.
         */
        private final List<DrawingGroupRecord> drawingGroups = new ArrayList<>();

        private final List<String> hiddenSheets = new ArrayList<>();
        private final List<String> veryHiddenSheets = new ArrayList<>();
        private final Set<String> commentAuthors = new TreeSet<>();
        private boolean hasHiddenColumn = false;
        private boolean hasHiddenRow = false;
        private boolean hasProtectedSheet = false;

        /**
         * Construct a new listener instance outputting parsed data to
         * the specified XHTML content handler.
         *
         * @param handler Destination to write the parsed output to
         */
        private TikaHSSFListener(String workbookEntryName, XHTMLContentHandler handler,
                                 Locale locale, AbstractPOIFSExtractor extractor,
                                 OfficeParserConfig officeParserConfig) {
            this.workbookEntryName = workbookEntryName;
            this.handler = handler;
            this.extractor = extractor;
            this.format = NumberFormat.getInstance(locale);
            this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
            this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
            this.officeParserConfig = officeParserConfig;

            this.tikaExcelDataFormatter
                    .setDateFormatOverride(officeParserConfig.getDateFormatOverride());
        }

        /**
         * Entry point to listener to start the processing of a file.
         *
         * @param filesystem          POI file system.
         * @param listenForAllRecords sets whether the listener is configured to listen
         *                            for all records types or not.
         * @throws IOException  on any IO errors.
         * @throws SAXException on any SAX parsing errors.
         */
        public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
                throws IOException, SAXException, TikaException {
            processFile(filesystem.getRoot(), listenForAllRecords);
        }

        public void processFile(DirectoryNode root, boolean listenForAllRecords)
                throws IOException, SAXException, TikaException {
            // Set up listener and register the records we want to process
            HSSFRequest hssfRequest = new HSSFRequest();
            if (listenForAllRecords) {
                hssfRequest.addListenerForAllRecords(formatListener);
            } else {
                hssfRequest.addListener(formatListener, BOFRecord.sid);
                hssfRequest.addListener(formatListener, EOFRecord.sid);
                hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
                hssfRequest.addListener(formatListener, CountryRecord.sid);
                hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
                hssfRequest.addListener(formatListener, SSTRecord.sid);
                hssfRequest.addListener(formatListener, FormulaRecord.sid);
                hssfRequest.addListener(formatListener, LabelRecord.sid);
                hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
                hssfRequest.addListener(formatListener, NumberRecord.sid);
                hssfRequest.addListener(formatListener, RKRecord.sid);
                hssfRequest.addListener(formatListener, StringRecord.sid);
                hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
                hssfRequest.addListener(formatListener, TextObjectRecord.sid);
                hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
                hssfRequest.addListener(formatListener, FormatRecord.sid);
                hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
                hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
                hssfRequest.addListener(formatListener, ProtectRecord.sid);
                hssfRequest.addListener(formatListener, ColumnInfoRecord.sid);
                hssfRequest.addListener(formatListener, RowRecord.sid);
                hssfRequest.addListener(formatListener, NoteRecord.sid);
                if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
                    hssfRequest.addListener(formatListener, HeaderRecord.sid);
                    hssfRequest.addListener(formatListener, FooterRecord.sid);
                }
            }

            // Create event factory and process Workbook (fire events)
            DocumentInputStream documentInputStream =
                    root.createDocumentInputStream(workbookEntryName);
            HSSFEventFactory eventFactory = new HSSFEventFactory();
            try {
                eventFactory.processEvents(hssfRequest, documentInputStream);
            } catch (org.apache.poi.EncryptedDocumentException e) {
                throw new EncryptedDocumentException(e);
            }

            // Output any extra text that came after all the sheets
            processExtraText();

            // Look for embeded images, now that the drawing records
            //  have been fully matched with their continue data
            for (DrawingGroupRecord dgr : drawingGroups) {
                dgr.decode();
                findPictures(dgr.getEscherRecords());
            }
        }

        /**
         * Process a HSSF record.
         *
         * @param record HSSF Record
         */
        public void processRecord(Record record) {
            if (exception == null) {
                try {
                    internalProcessRecord(record);
                } catch (TikaException | IOException | SAXException e) {
                    exception = e;
                }
            }
        }

        public void throwStoredException() throws TikaException, SAXException, IOException {
            if (exception != null) {
                if (exception instanceof IOException) {
                    throw (IOException) exception;
                }
                if (exception instanceof SAXException) {
                    throw (SAXException) exception;
                }
                if (exception instanceof TikaException) {
                    throw (TikaException) exception;
                }
                throw new TikaException(exception.getMessage());
            }
        }

        private void internalProcessRecord(Record record)
                throws SAXException, TikaException, IOException {
            switch (record.getSid()) {
                case BOFRecord.sid: // start of workbook, worksheet etc. records
                    BOFRecord bof = (BOFRecord) record;
                    if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
                        currentSheetIndex = -1;
                    } else if (bof.getType() == BOFRecord.TYPE_CHART) {
                        if (previousSid == EOFRecord.sid) {
                            // This is a sheet which contains only a chart
                            newSheet();
                        } else {
                            // This is a chart within a normal sheet
                            // Handling of this is a bit hacky...
                            if (currentSheet != null) {
                                processSheet();
                                currentSheetIndex--;
                                newSheet();
                            }
                        }
                    } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
                        newSheet();
                    }
                    break;

                case EOFRecord.sid: // end of workbook, worksheet etc. records
                    if (currentSheet != null) {
                        processSheet();
                    }
                    currentSheet = null;
                    break;

                case BoundSheetRecord.sid: // Worksheet index record
                    BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
                    if (boundSheetRecord.isHidden()) {
                        hiddenSheets.add(boundSheetRecord.getSheetname());
                    }
                    if (boundSheetRecord.isVeryHidden()) {
                        veryHiddenSheets.add(boundSheetRecord.getSheetname());
                    }
                    sheetNames.add(boundSheetRecord.getSheetname());
                    break;

                case SSTRecord.sid: // holds all the strings for LabelSSTRecords
                    sstRecord = (SSTRecord) record;
                    break;

                case FormulaRecord.sid: // Cell value from a formula
                    FormulaRecord formula = (FormulaRecord) record;
                    if (formula.hasCachedResultString()) {
                        // The String itself should be the next record
                        stringFormulaRecord = formula;
                    } else {
                        addTextCell(record, formatListener.formatNumberDateCell(formula));
                    }
                    break;

                case StringRecord.sid:
                    if (previousSid == FormulaRecord.sid) {
                        // Cached string value of a string formula
                        StringRecord sr = (StringRecord) record;
                        addTextCell(stringFormulaRecord, sr.getString());
                    } else {
                        // Some other string not associated with a cell, skip
                    }
                    break;

                case LabelRecord.sid: // strings stored directly in the cell
                    LabelRecord label = (LabelRecord) record;
                    addTextCell(record, label.getValue());
                    break;

                case LabelSSTRecord.sid: // Ref. a string in the shared string table
                    if (sstRecord == null) {
                        throw new TikaException("sstRecord should have been initialized before a ref to the shared string table");
                    }
                    LabelSSTRecord sst = (LabelSSTRecord) record;
                    UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
                    String cellString = null;
                    if (officeParserConfig.isConcatenatePhoneticRuns()) {
                        String phonetic = (unicode != null && unicode.getExtendedRst() != null &&
                                unicode.getExtendedRst().getPhoneticText() != null &&
                                !unicode.getExtendedRst().getPhoneticText().isBlank()) ?
                                unicode.getExtendedRst().getPhoneticText() : "";
                        cellString = unicode.getString() + " " + phonetic;
                    } else {
                        cellString = unicode.getString();
                    }
                    addTextCell(record, cellString);
                    break;

                case NumberRecord.sid: // Contains a numeric cell value
                    NumberRecord number = (NumberRecord) record;
                    addTextCell(record, formatListener.formatNumberDateCell(number));
                    break;

                case RKRecord.sid: // Excel internal number record
                    RKRecord rk = (RKRecord) record;
                    addCell(record, new NumberCell(rk.getRKNumber(), format));
                    break;

                case HyperlinkRecord.sid: // holds a URL associated with a cell
                    if (currentSheet != null) {
                        HyperlinkRecord link = (HyperlinkRecord) record;
                        Point point = new Point(link.getFirstColumn(), link.getFirstRow());
                        Cell cell = currentSheet.get(point);
                        if (cell != null) {
                            String address = link.getAddress();
                            if (address != null) {
                                addCell(record, new LinkedCell(cell, address));
                            } else {
                                addCell(record, cell);
                            }
                        }
                    }
                    break;

                case TextObjectRecord.sid:
                    if (extractor.officeParserConfig.isIncludeShapeBasedContent()) {
                        TextObjectRecord tor = (TextObjectRecord) record;
                        addTextCell(record, tor.getStr().getString());
                    }
                    break;

                case SeriesTextRecord.sid: // Chart label or title
                    SeriesTextRecord str = (SeriesTextRecord) record;
                    addTextCell(record, str.getText());
                    break;

                case DrawingGroupRecord.sid:
                    // Collect this now, we'll process later when all
                    //  the continue records are in
                    drawingGroups.add((DrawingGroupRecord) record);
                    break;

                case HeaderRecord.sid:
                    if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
                        HeaderRecord headerRecord = (HeaderRecord) record;
                        addTextCell(record, headerRecord.getText());
                    }
                    break;

                case FooterRecord.sid:
                    if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
                        FooterRecord footerRecord = (FooterRecord) record;
                        addTextCell(record, footerRecord.getText());
                    }
                    break;
                case ProtectRecord.sid:
                    if (((ProtectRecord)record).getProtect()) {
                        //TODO -- associate this worksheet name
                        hasProtectedSheet = true;
                    }
                    break;
                case ColumnInfoRecord.sid:
                    if (((ColumnInfoRecord)record).getHidden()) {
                        hasHiddenColumn = true;
                    }
                    break;
                case NoteRecord.sid:
                    String author = ((NoteRecord)record).getAuthor();
                    if (!StringUtils.isBlank(author)) {
                        commentAuthors.add(author);
                    }
                    break;
                case RowRecord.sid:
                    if (((RowRecord)record).getZeroHeight()) {
                        hasHiddenRow = true;
                    }
                    break;
            }

            previousSid = record.getSid();

            if (stringFormulaRecord != record) {
                stringFormulaRecord = null;
            }
        }

        private void processExtraText() throws SAXException {
            if (extraTextCells.size() > 0) {
                for (Cell cell : extraTextCells) {
                    handler.startElement("div", "class", "outside");
                    cell.render(handler);
                    handler.endElement("div");
                }

                // Reset
                extraTextCells.clear();
            }
        }

        /**
         * Adds the given cell (unless <code>null</code>) to the current
         * worksheet (if any) at the position (if any) of the given record.
         *
         * @param record record that holds the cell value
         * @param cell   cell value (or <code>null</code>)
         */
        private void addCell(Record record, Cell cell) throws SAXException {
            if (cell == null) {
                // Ignore empty cells
            } else if (currentSheet != null && record instanceof CellValueRecordInterface) {
                // Normal cell inside a worksheet
                CellValueRecordInterface value = (CellValueRecordInterface) record;
                Point point = new Point(value.getColumn(), value.getRow());
                if (currentSheet.containsKey(point)) {
                    //avoid overwriting content
                    //for now, add to extraTextCells
                    //TODO: consider allowing multiple text pieces
                    //per x,y to keep the text together
                    extraTextCells.add(cell);
                } else {
                    currentSheet.put(point, cell);
                }

            } else {
                // Cell outside the worksheets
                extraTextCells.add(cell);
            }
        }

        /**
         * Adds a text cell with the given text comment. The given text
         * is trimmed, and ignored if <code>null</code> or empty.
         *
         * @param record record that holds the text value
         * @param text   text content, may be <code>null</code>
         * @throws SAXException
         */
        private void addTextCell(Record record, String text) throws SAXException {
            if (text != null) {
                text = text.trim();
                if (text.length() > 0) {
                    addCell(record, new TextCell(text));
                }
            }
        }

        private void newSheet() {
            currentSheetIndex++;
            currentSheet = new TreeMap<>(new PointComparator());
        }

        /**
         * Process an excel sheet.
         *
         * @throws SAXException if an error occurs
         */
        private void processSheet() throws SAXException {
            // Sheet Start
            handler.startElement("div", "class", "page");
            if (currentSheetIndex < sheetNames.size()) {
                handler.element("h1", sheetNames.get(currentSheetIndex));
            }
            handler.startElement("table");
            handler.startElement("tbody");

            // Process Rows
            int currentRow = 0;
            int currentColumn = 0;
            handler.startElement("tr");
            handler.startElement("td");
            for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
                if (currentRow != entry.getKey().y) {
                    // We've moved onto a new row, possibly skipping some
                    do {
                        handler.endElement("td");
                        handler.endElement("tr");
                        handler.startElement("tr");
                        handler.startElement("td");
                        currentRow++;
                    } while (officeParserConfig.isIncludeMissingRows() &&
                            currentRow < entry.getKey().y);
                    currentRow = entry.getKey().y;
                    currentColumn = 0;
                }

                while (currentColumn < entry.getKey().x) {
                    handler.endElement("td");
                    handler.startElement("td");
                    currentColumn++;
                }

                entry.getValue().render(handler);
            }
            handler.endElement("td");
            handler.endElement("tr");

            // Sheet End
            handler.endElement("tbody");
            handler.endElement("table");

            // Finish up
            processExtraText();
            handler.endElement("div");
        }

        private void findPictures(List<EscherRecord> records)
                throws IOException, SAXException, TikaException {
            for (EscherRecord escherRecord : records) {
                if (escherRecord instanceof EscherBSERecord) {
                    EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
                    if (blip != null) {
                        HSSFPictureData picture = new HSSFPictureData(blip);
                        String mimeType = picture.getMimeType();
                        TikaInputStream tis = TikaInputStream.get(picture.getData());

                        // Handle the embeded resource
                        extractor.handleEmbeddedResource(tis, null, null, mimeType, handler,
                                true);
                    }
                }

                // Recursive call.
                findPictures(escherRecord.getChildRecords());
            }
        }

        private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
            public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
                super(childListener, locale);
            }

            @Override
            public void processRecord(Record record) {
                super.processRecord(record);
            }

            @Override
            public String formatNumberDateCell(CellValueRecordInterface cell) {
                String formatString = this.getFormatString(cell);

                double value;
                if (cell instanceof NumberRecord) {
                    value = ((NumberRecord) cell).getValue();
                } else {
                    if (!(cell instanceof FormulaRecord)) {
                        throw new IllegalArgumentException(
                                "Unsupported CellValue Record passed in " + cell);
                    }
                    value = ((FormulaRecord) cell).getValue();
                }
                if (DateUtil.isADateFormat(getFormatIndex(cell), formatString)) {
                    return tikaExcelDataFormatter
                            .formatRawCellContents(value, getFormatIndex(cell), formatString,
                                    false);
                } else if ("general".equalsIgnoreCase(formatString)) {
                    return tikaExcelDataFormatter
                            .formatRawCellContents(value, getFormatIndex(cell), formatString,
                                    false);
                }
                return super.formatNumberDateCell(cell);
            }
        }
    }

    /**
     * Utility comparator for points.
     */
    private static class PointComparator implements Comparator<Point> {

        public int compare(Point a, Point b) {
            int diff = a.y - b.y;
            if (diff == 0) {
                diff = a.x - b.x;
            }
            return diff;
        }

    }
}