TikaSheetXMLHandler.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Sheet XML handler for XLSX event-based parsing that uses {@link XSSFStylesShim}
* and {@link XSSFCommentsShim} instead of POI's XMLBeans-dependent
* {@code StylesTable} and {@code CommentsTable}.
* <p>
* Adapted from Apache POI's {@code XSSFSheetXMLHandler} (Apache 2.0 license).
*/
class TikaSheetXMLHandler extends DefaultHandler {

    private static final Logger LOG = LoggerFactory.getLogger(TikaSheetXMLHandler.class);

    /** Namespace of the elements this handler reacts to; other namespaces are skipped. */
    private static final String NS_SPREADSHEETML =
            "http://schemas.openxmlformats.org/spreadsheetml/2006/main";

    /**
     * How the text collected for the current cell is interpreted when the cell is emitted.
     * Selected from the {@code t} attribute of the {@code c} element (and switched from
     * {@link #NUMBER} to {@link #FORMULA} when an {@code f} element is seen).
     */
    enum XssfDataType {
        BOOLEAN,
        ERROR,
        FORMULA,
        INLINE_STRING,
        SST_STRING,
        NUMBER,
    }

    /** Source of number-format index/string per cell style; may be {@code null}. */
    private final XSSFStylesShim stylesShim;
    /** Source of cell comments; {@code null} when comments are not extracted. */
    private final XSSFCommentsShim commentsShim;
    /** Shared strings table consulted for cells of type {@code s}. */
    private final XSSFSharedStringsShim sharedStringsShim;
    /** Receiver of the row, cell, header/footer and end-of-sheet callbacks. */
    private final TikaSheetContentsHandler output;
    /** Formats numeric cell values using the resolved format index and string. */
    private final DataFormatter formatter;
    /** When {@code true}, captured formula text is emitted instead of the cell value. */
    private final boolean formulasNotResults;

    /** {@code true} while inside an element whose characters are appended to {@link #value}. */
    private boolean vIsOpen;
    /** {@code true} while inside an {@code f} element whose characters are captured. */
    private boolean fIsOpen;
    /** {@code true} while inside an {@code is} (inline string) element. */
    private boolean isIsOpen;
    /** {@code true} while inside one of the header/footer elements. */
    private boolean hfIsOpen;

    /** Interpretation to apply to {@link #value} for the cell currently being read. */
    private XssfDataType nextDataType;
    /** Number-format index of the current cell, or {@code -1} when none was resolved. */
    private short formatIndex;
    /** Number-format string of the current cell, or {@code null} when none was resolved. */
    private String formatString;
    /** Zero-based number of the row currently being read. */
    private int rowNum;
    /** Row number assumed for the next {@code row} element that lacks an {@code r} attribute. */
    private int nextRowNum;
    /** Value of the {@code r} attribute of the most recent {@code c} element; may be null. */
    private String cellRef;

    private final StringBuilder value = new StringBuilder(64);
    private final StringBuilder formula = new StringBuilder(64);
    private final StringBuilder headerFooter = new StringBuilder(64);

    /**
     * Addresses of comments that have not been emitted yet, in the order returned by
     * {@link XSSFCommentsShim#getCellAddresses()}; {@code null} when there is no comments shim.
     * NOTE(review): the empty-cell logic below presumably relies on this order being
     * row-major - confirm against the shim.
     */
    private Queue<CellAddress> commentCellRefs;

    /**
     * Creates a handler that also reports cell comments.
     *
     * @param stylesShim           style lookup used to resolve number formats; may be null
     * @param commentsShim         comment lookup; may be null to disable comment handling
     * @param sharedStringsShim    shared strings lookup for cells of type {@code s}
     * @param sheetContentsHandler receiver of the extracted sheet content
     * @param dataFormatter        formatter applied to numeric values
     * @param formulasNotResults   emit formula text rather than cell values when true
     */
    TikaSheetXMLHandler(XSSFStylesShim stylesShim,
                        XSSFCommentsShim commentsShim,
                        XSSFSharedStringsShim sharedStringsShim,
                        TikaSheetContentsHandler sheetContentsHandler,
                        DataFormatter dataFormatter,
                        boolean formulasNotResults) {
        this.stylesShim = stylesShim;
        this.commentsShim = commentsShim;
        this.sharedStringsShim = sharedStringsShim;
        this.output = sheetContentsHandler;
        this.formatter = dataFormatter;
        this.formulasNotResults = formulasNotResults;
        this.nextDataType = XssfDataType.NUMBER;
        initComments(commentsShim);
    }

    /**
     * Creates a handler without comment support; equivalent to passing a {@code null}
     * comments shim to the six-argument constructor.
     *
     * @param stylesShim           style lookup used to resolve number formats; may be null
     * @param sharedStringsShim    shared strings lookup for cells of type {@code s}
     * @param sheetContentsHandler receiver of the extracted sheet content
     * @param dataFormatter        formatter applied to numeric values
     * @param formulasNotResults   emit formula text rather than cell values when true
     */
    TikaSheetXMLHandler(XSSFStylesShim stylesShim,
                        XSSFSharedStringsShim sharedStringsShim,
                        TikaSheetContentsHandler sheetContentsHandler,
                        DataFormatter dataFormatter,
                        boolean formulasNotResults) {
        this(stylesShim, null, sharedStringsShim, sheetContentsHandler, dataFormatter,
                formulasNotResults);
    }

    /** Copies every comment address of the shim into {@link #commentCellRefs}, if a shim is set. */
    private void initComments(XSSFCommentsShim commentsShim) {
        if (commentsShim != null) {
            commentCellRefs = new LinkedList<>();
            for (Iterator<CellAddress> iter = commentsShim.getCellAddresses();
                    iter.hasNext(); ) {
                commentCellRefs.add(iter.next());
            }
        }
    }

    /**
     * Tells whether the element carries cell text: {@code v}, {@code inlineStr}, or
     * {@code t} while an {@code is} element is open.
     */
    private boolean isTextTag(String name) {
        if ("v".equals(name)) {
            return true;
        }
        if ("inlineStr".equals(name)) {
            return true;
        }
        return "t".equals(name) && isIsOpen;
    }

    /** Tells whether the element is one of the three header elements. */
    private static boolean isHeaderTag(String name) {
        return "oddHeader".equals(name) || "evenHeader".equals(name)
                || "firstHeader".equals(name);
    }

    /** Tells whether the element is one of the three footer elements. */
    private static boolean isFooterTag(String name) {
        return "oddFooter".equals(name) || "evenFooter".equals(name)
                || "firstFooter".equals(name);
    }

    /**
     * Updates the parsing state for an opening element. Elements whose namespace URI is
     * non-null and differs from the SpreadsheetML main namespace are ignored.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
                             Attributes attributes) throws SAXException {
        if (uri != null && !uri.equals(NS_SPREADSHEETML)) {
            return;
        }
        if (isTextTag(localName)) {
            vIsOpen = true;
            // Inside <is> the buffer keeps accumulating across <t> elements; it is only
            // flushed when </is> is reached.
            if (!isIsOpen) {
                value.setLength(0);
            }
        } else if ("is".equals(localName)) {
            isIsOpen = true;
        } else if ("f".equals(localName)) {
            startFormula(attributes);
        } else if (isHeaderTag(localName) || isFooterTag(localName)) {
            hfIsOpen = true;
            headerFooter.setLength(0);
        } else if ("row".equals(localName)) {
            String rowNumStr = attributes.getValue("r");
            if (rowNumStr != null) {
                // The attribute is one-based; rows are reported zero-based.
                rowNum = Integer.parseInt(rowNumStr.trim()) - 1;
            } else {
                rowNum = nextRowNum;
            }
            output.startRow(rowNum);
        } else if ("c".equals(localName)) {
            startCell(attributes);
        }
    }

    /**
     * Handles an opening {@code f} element: resets the formula buffer, marks a so-far
     * numeric cell as a formula cell and decides whether the formula text is captured.
     */
    private void startFormula(Attributes attributes) {
        formula.setLength(0);
        if (nextDataType == XssfDataType.NUMBER) {
            nextDataType = XssfDataType.FORMULA;
        }
        if ("shared".equals(attributes.getValue("t"))) {
            if (attributes.getValue("ref") != null) {
                // A shared formula carrying a "ref" attribute has its text captured.
                fIsOpen = true;
            } else if (formulasNotResults) {
                // Shared formula without "ref": its text is not captured.
                LOG.warn("shared formulas not yet supported!");
            }
        } else {
            fIsOpen = true;
        }
    }

    /**
     * Handles an opening {@code c} element: resets the per-cell state, records the cell
     * reference and derives the data type from the {@code t} attribute. For cells that
     * fall through to {@link XssfDataType#NUMBER} the number format is resolved from the
     * cell style.
     */
    private void startCell(Attributes attributes) {
        formula.setLength(0);
        nextDataType = XssfDataType.NUMBER;
        formatIndex = -1;
        formatString = null;
        cellRef = attributes.getValue("r");
        String cellType = attributes.getValue("t");
        String cellStyleStr = attributes.getValue("s");
        if ("b".equals(cellType)) {
            nextDataType = XssfDataType.BOOLEAN;
        } else if ("e".equals(cellType)) {
            nextDataType = XssfDataType.ERROR;
        } else if ("inlineStr".equals(cellType)) {
            nextDataType = XssfDataType.INLINE_STRING;
        } else if ("s".equals(cellType)) {
            nextDataType = XssfDataType.SST_STRING;
        } else if ("str".equals(cellType)) {
            nextDataType = XssfDataType.FORMULA;
        } else {
            // Number - resolve format via our styles shim
            resolveNumberFormat(cellStyleStr);
        }
    }

    /**
     * Sets {@link #formatIndex} and {@link #formatString} from the styles shim. Without an
     * explicit style attribute, style 0 is used when the shim has any cell styles. If the
     * shim has no format string for the style, the built-in format for the index is used.
     */
    private void resolveNumberFormat(String cellStyleStr) {
        if (stylesShim == null) {
            return;
        }
        int styleIndex;
        if (cellStyleStr != null) {
            styleIndex = Integer.parseInt(cellStyleStr.trim());
        } else if (stylesShim.getNumCellStyles() > 0) {
            styleIndex = 0;
        } else {
            styleIndex = -1;
        }
        if (styleIndex >= 0) {
            formatIndex = stylesShim.getFormatIndex(styleIndex);
            formatString = stylesShim.getFormatString(styleIndex);
            if (formatString == null) {
                formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
            }
        }
    }

    /**
     * Emits content for a closing element: cells at the end of a text element or of
     * {@code is}, pending empty-cell comments and the row end at {@code row}, remaining
     * comments and the sheet end at {@code sheetData}, and header/footer text at the
     * header/footer elements. Elements from other namespaces are ignored.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (uri != null && !uri.equals(NS_SPREADSHEETML)) {
            return;
        }
        if (isTextTag(localName)) {
            vIsOpen = false;
            // Text inside <is> is emitted once, when </is> is reached.
            if (!isIsOpen) {
                outputCell();
                value.setLength(0);
            }
        } else if ("f".equals(localName)) {
            fIsOpen = false;
        } else if ("is".equals(localName)) {
            isIsOpen = false;
            outputCell();
            value.setLength(0);
        } else if ("row".equals(localName)) {
            checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW);
            output.endRow(rowNum);
            // Used as the row number if the next <row> has no "r" attribute.
            nextRowNum = rowNum + 1;
        } else if ("sheetData".equals(localName)) {
            checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA);
            output.endSheet();
        } else if (isHeaderTag(localName)) {
            hfIsOpen = false;
            output.headerFooter(headerFooter.toString(), true, localName);
        } else if (isFooterTag(localName)) {
            hfIsOpen = false;
            output.headerFooter(headerFooter.toString(), false, localName);
        }
    }

    /** Appends the characters to every buffer whose element is currently open. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (vIsOpen) {
            value.append(ch, start, length);
        }
        if (fIsOpen) {
            formula.append(ch, start, length);
        }
        if (hfIsOpen) {
            headerFooter.append(ch, start, length);
        }
    }

    /**
     * Converts the collected cell text according to {@link #nextDataType} and passes it,
     * together with the comment of the cell (if any), to the output handler. Comments on
     * cells that were skipped before this one are emitted first.
     */
    private void outputCell() {
        String thisStr = null;
        if (formulasNotResults && formula.length() > 0) {
            thisStr = formula.toString();
        } else {
            switch (nextDataType) {
                case BOOLEAN:
                    // An empty <v/> has no first character; leave the text null instead of
                    // failing with StringIndexOutOfBoundsException.
                    if (value.length() > 0) {
                        thisStr = value.charAt(0) == '0' ? "FALSE" : "TRUE";
                    }
                    break;
                case ERROR:
                    thisStr = "ERROR:" + value;
                    break;
                case FORMULA:
                    if (formulasNotResults) {
                        thisStr = formula.toString();
                    } else {
                        thisStr = formatNumericOrRaw(value.toString());
                    }
                    break;
                case INLINE_STRING:
                    thisStr = value.toString();
                    break;
                case SST_STRING:
                    String sstIndex = value.toString().trim();
                    if (!sstIndex.isEmpty()) {
                        try {
                            int idx = Integer.parseInt(sstIndex);
                            thisStr = sharedStringsShim.getItemAt(idx);
                        } catch (NumberFormatException ex) {
                            LOG.error("Failed to parse SST index '{}'", sstIndex, ex);
                        }
                    }
                    break;
                case NUMBER:
                    thisStr = formatNumericOrRaw(value.toString());
                    break;
                default:
                    thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
                    break;
            }
        }
        checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
        // A <c> element without an "r" attribute leaves cellRef null; no address can be
        // built for the comment lookup in that case.
        XSSFCommentsShim.CommentData comment = null;
        if (commentsShim != null && cellRef != null) {
            comment = commentsShim.findCellComment(new CellAddress(cellRef));
        }
        output.cell(cellRef, thisStr, comment);
    }

    /**
     * Formats {@code raw} as a number with the current cell format. The raw text is
     * returned unchanged when no format string is set, when the text is empty, or when
     * parsing or formatting fails (for example a formula with a string result).
     */
    private String formatNumericOrRaw(String raw) {
        if (formatString == null || raw.isEmpty()) {
            return raw;
        }
        try {
            return formatter.formatRawCellContents(
                    Double.parseDouble(raw.trim()), formatIndex, formatString);
        } catch (Exception e) {
            // Deliberate best effort: fall back to the unformatted text.
            return raw;
        }
    }

    /**
     * Emits comments attached to cells that have no cell element of their own.
     * <ul>
     * <li>{@code END_OF_SHEET_DATA}: every remaining comment is emitted.</li>
     * <li>{@code END_OF_ROW} with no cell seen so far: comments at the head of the queue
     * that belong to the current row are emitted.</li>
     * <li>{@code CELL}: a head-of-queue comment on the current cell is removed (it is
     * reported with the cell itself); comments ordered before the current cell and not
     * beyond the current row are emitted.</li>
     * <li>{@code END_OF_ROW} after a cell: comments ordered after the last seen cell and
     * not beyond the current row are emitted.</li>
     * </ul>
     *
     * @throws IllegalStateException if a cell without a reference is encountered while
     *                               comments are still pending
     */
    private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) {
        if (commentCellRefs == null || commentCellRefs.isEmpty()) {
            return;
        }
        if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) {
            while (!commentCellRefs.isEmpty()) {
                outputEmptyCellComment(commentCellRefs.remove());
            }
            return;
        }
        if (this.cellRef == null) {
            if (type == EmptyCellCommentsCheckType.END_OF_ROW) {
                while (!commentCellRefs.isEmpty()
                        && commentCellRefs.peek().getRow() == rowNum) {
                    outputEmptyCellComment(commentCellRefs.remove());
                }
                return;
            }
            throw new IllegalStateException(
                    "Cell ref should be null only if there are only empty " +
                            "cells in the row; rowNum: " + rowNum);
        }

        // The current cell does not change while the queue is drained.
        CellAddress cellAddr = new CellAddress(this.cellRef);
        CellAddress nextCommentCellRef;
        do {
            CellAddress peekCellRef = commentCellRefs.peek();
            if (type == EmptyCellCommentsCheckType.CELL &&
                    cellAddr.equals(peekCellRef)) {
                // The comment belongs to the current cell and is reported with it.
                commentCellRefs.remove();
                return;
            } else {
                int comparison = peekCellRef.compareTo(cellAddr);
                if (comparison > 0 &&
                        type == EmptyCellCommentsCheckType.END_OF_ROW &&
                        peekCellRef.getRow() <= rowNum) {
                    nextCommentCellRef = commentCellRefs.remove();
                    outputEmptyCellComment(nextCommentCellRef);
                } else if (comparison < 0 &&
                        type == EmptyCellCommentsCheckType.CELL &&
                        peekCellRef.getRow() <= rowNum) {
                    nextCommentCellRef = commentCellRefs.remove();
                    outputEmptyCellComment(nextCommentCellRef);
                } else {
                    // Head of the queue is not due yet; stop draining.
                    nextCommentCellRef = null;
                }
            }
        } while (nextCommentCellRef != null && !commentCellRefs.isEmpty());
    }

    /** Reports a comment for a cell that has no content, passing {@code null} as cell text. */
    private void outputEmptyCellComment(CellAddress address) {
        XSSFCommentsShim.CommentData comment = commentsShim.findCellComment(address);
        output.cell(address.formatAsString(), null, comment);
    }

    /** The point in the document at which pending empty-cell comments are checked. */
    private enum EmptyCellCommentsCheckType {
        CELL,
        END_OF_ROW,
        END_OF_SHEET_DATA
    }
}