XlsxReader.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package tech.tablesaw.io.xlsx;
import static org.apache.poi.ss.usermodel.CellType.FORMULA;
import static org.apache.poi.ss.usermodel.CellType.NUMERIC;
import static org.apache.poi.ss.usermodel.CellType.STRING;
import com.google.common.collect.Iterables;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javax.annotation.concurrent.Immutable;
import org.apache.poi.ss.format.CellDateFormatter;
import org.apache.poi.ss.format.CellGeneralFormatter;
import org.apache.poi.ss.format.CellNumberFormatter;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.LongColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.Column;
import tech.tablesaw.io.DataReader;
import tech.tablesaw.io.ReaderRegistry;
import tech.tablesaw.io.RuntimeIOException;
import tech.tablesaw.io.Source;
@Immutable
public class XlsxReader implements DataReader<XlsxReadOptions> {
private static final XlsxReader INSTANCE = new XlsxReader();
static {
register(Table.defaultReaderRegistry);
}
public static void register(ReaderRegistry registry) {
registry.registerExtension("xlsx", INSTANCE);
registry.registerMimeType(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", INSTANCE);
registry.registerOptions(XlsxReadOptions.class, INSTANCE);
}
@Override
public Table read(XlsxReadOptions options) {
List<Table> tables = null;
try {
tables = readMultiple(options, true);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
if (options.sheetIndex() != null) {
int index = options.sheetIndex();
if (index < 0 || index >= tables.size()) {
throw new IndexOutOfBoundsException(
String.format("Sheet index %d outside bounds. %d sheets found.", index, tables.size()));
}
Table table = tables.get(index);
if (table == null) {
throw new IllegalArgumentException(
String.format("No table found at sheet index %d.", index));
}
return table;
}
// since no specific sheetIndex asked, return first table
return tables.stream()
.filter(t -> t != null)
.findFirst()
.orElseThrow(() -> new IllegalArgumentException("No tables found."));
}
public List<Table> readMultiple(XlsxReadOptions options) throws IOException {
return readMultiple(options, false);
}
/**
* Read at most a table from every sheet.
*
* @param includeNulls include nulls for sheets without a table
* @return a list of tables, at most one for every sheet
*/
protected List<Table> readMultiple(XlsxReadOptions options, boolean includeNulls)
throws IOException {
byte[] bytes = null;
InputStream input = getInputStream(options, bytes);
List<Table> tables = new ArrayList<>();
try (XSSFWorkbook workbook = new XSSFWorkbook(input)) {
for (Sheet sheet : workbook) {
TableRange tableArea = findTableArea(sheet);
if (tableArea != null) {
Table table = createTable(sheet, tableArea, options);
tables.add(table);
} else if (includeNulls) {
tables.add(null);
}
}
return tables;
} finally {
if (options.source().reader() == null) {
// if we get a reader back from options it means the client opened it, so let
// the client close it
// if it's null, we close it here.
input.close();
}
}
}
private Boolean isBlank(Cell cell) {
switch (cell.getCellType()) {
case STRING:
if (cell.getRichStringCellValue().length() > 0) {
return false;
}
break;
case NUMERIC:
if (DateUtil.isCellDateFormatted(cell)
? cell.getDateCellValue() != null
: cell.getNumericCellValue() != 0) {
return false;
}
break;
case BOOLEAN:
if (cell.getBooleanCellValue()) {
return false;
}
break;
case BLANK:
return true;
default:
break;
}
return null;
}
private static class TableRange {
private int startRow, endRow, startColumn, endColumn;
TableRange(int startRow, int endRow, int startColumn, int endColumn) {
this.startRow = startRow;
this.endRow = endRow;
this.startColumn = startColumn;
this.endColumn = endColumn;
}
public int getColumnCount() {
return endColumn - startColumn + 1;
}
}
private TableRange findTableArea(Sheet sheet) {
// find first row and column with contents
int row1 = -1;
int row2 = -1;
TableRange lastRowArea = null;
for (Row row : sheet) {
TableRange rowArea = findRowArea(row);
if (lastRowArea == null && rowArea != null) {
if (row1 < 0) {
lastRowArea = rowArea;
row1 = row.getRowNum();
row2 = row1;
}
} else if (lastRowArea != null && rowArea == null) {
if (row2 > row1) {
break;
} else {
row1 = -1;
}
} else if (lastRowArea == null && rowArea == null) {
row1 = -1;
} else if (rowArea.startColumn < lastRowArea.startColumn
|| rowArea.endColumn > lastRowArea.endColumn) {
lastRowArea = null;
row2 = -1;
} else {
row2 = row.getRowNum();
}
}
return row1 >= 0 && lastRowArea != null
? new TableRange(row1, row2, lastRowArea.startColumn, lastRowArea.endColumn)
: null;
}
private TableRange findRowArea(Row row) {
int col1 = -1;
int col2 = -1;
for (Cell cell : row) {
Boolean blank = isBlank(cell);
if (col1 < 0 && Boolean.FALSE.equals(blank)) {
col1 = cell.getColumnIndex();
col2 = col1;
} else if (col1 >= 0 && col2 >= col1) {
if (Boolean.FALSE.equals(blank)) {
col2 = cell.getColumnIndex();
} else if (Boolean.TRUE.equals(blank)) {
break;
}
}
}
return col1 >= 0 && col2 >= col1 ? new TableRange(0, 0, col1, col2) : null;
}
private InputStream getInputStream(XlsxReadOptions options, byte[] bytes)
throws FileNotFoundException {
if (bytes != null) {
return new ByteArrayInputStream(bytes);
}
if (options.source().inputStream() != null) {
return options.source().inputStream();
}
return new FileInputStream(options.source().file());
}
private Table createTable(Sheet sheet, TableRange tableArea, XlsxReadOptions options) {
Optional<List<String>> optHeaderNames = getHeaderNames(sheet, tableArea);
optHeaderNames.ifPresent(h -> tableArea.startRow++);
List<String> headerNames = optHeaderNames.orElse(calculateDefaultColumnNames(tableArea));
Table table = Table.create(options.tableName() + "#" + sheet.getSheetName());
List<Column<?>> columns = new ArrayList<>(Collections.nCopies(headerNames.size(), null));
for (int rowNum = tableArea.startRow; rowNum <= tableArea.endRow; rowNum++) {
Row row = sheet.getRow(rowNum);
for (int colNum = 0; colNum < headerNames.size(); colNum++) {
int excelColNum = colNum + tableArea.startColumn;
Cell cell = row.getCell(excelColNum, MissingCellPolicy.RETURN_BLANK_AS_NULL);
Column<?> column = columns.get(colNum);
String columnName = headerNames.get(colNum);
if (cell != null) {
if (column == null) {
column = createColumn(colNum, columnName, sheet, excelColNum, tableArea, options);
columns.set(colNum, column);
while (column.size() < rowNum - tableArea.startRow) {
column.appendMissing();
}
}
Column<?> altColumn = appendValue(column, cell);
if (altColumn != null && altColumn != column) {
column = altColumn;
columns.set(colNum, column);
}
} else {
boolean hasCustomizedType =
options.columnTypeReadOptions().columnType(colNum, columnName).isPresent();
if (column == null && hasCustomizedType) {
ColumnType columnType =
options.columnTypeReadOptions().columnType(colNum, columnName).get();
column = columnType.create(columnName).appendMissing();
columns.set(colNum, column);
} else if (hasCustomizedType) {
column.appendMissing();
}
}
if (column != null) {
while (column.size() <= rowNum - tableArea.startRow) {
column.appendMissing();
}
}
}
}
columns.removeAll(Collections.singleton(null));
table.addColumns(columns.toArray(new Column<?>[columns.size()]));
return table;
}
private Optional<List<String>> getHeaderNames(Sheet sheet, TableRange tableArea) {
// assume header row if all cells are of type String
Row row = sheet.getRow(tableArea.startRow);
List<String> headerNames =
IntStream.range(tableArea.startColumn, tableArea.endColumn + 1)
.mapToObj(row::getCell)
.filter(cell -> cell.getCellType() == STRING)
.map(cell -> cell.getRichStringCellValue().getString())
.collect(Collectors.toList());
return headerNames.size() == tableArea.getColumnCount()
? Optional.of(headerNames)
: Optional.empty();
}
private List<String> calculateDefaultColumnNames(TableRange tableArea) {
return IntStream.range(tableArea.startColumn, tableArea.endColumn + 1)
.mapToObj(i -> "col" + i)
.collect(Collectors.toList());
}
@SuppressWarnings("unchecked")
private Column<?> appendValue(Column<?> column, Cell cell) {
CellType cellType =
cell.getCellType() == FORMULA ? cell.getCachedFormulaResultType() : cell.getCellType();
switch (cellType) {
case STRING:
column.appendCell(cell.getRichStringCellValue().getString());
return null;
case NUMERIC:
if (DateUtil.isCellDateFormatted(cell)) {
Date date = cell.getDateCellValue();
// This will return inconsistent results across time zones, but that matches Excel's
// behavior
LocalDateTime localDate =
date.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
if (column.type() == ColumnType.STRING) {
// If column has String type try to honor it and leave the value as an string as similar
// as posible as seen in Excel
String dataFormatStyle = cell.getCellStyle().getDataFormatString();
String val;
if ("general".equalsIgnoreCase(dataFormatStyle)) {
val = new CellGeneralFormatter().format(cell.getNumericCellValue());
} else {
val = new CellDateFormatter(dataFormatStyle).format(cell.getDateCellValue());
}
column.appendCell(val);
} else {
column.appendCell(localDate.toString());
}
return null;
} else {
double num = cell.getNumericCellValue();
if (column.type() == ColumnType.INTEGER) {
Column<Integer> intColumn = (Column<Integer>) column;
if ((int) num == num) {
intColumn.append((int) num);
return null;
} else if ((long) num == num) {
Column<Long> altColumn = LongColumn.create(column.name(), column.size());
altColumn = intColumn.mapInto(s -> (long) s, altColumn);
altColumn.append((long) num);
return altColumn;
} else {
Column<Double> altColumn = DoubleColumn.create(column.name(), column.size());
altColumn = intColumn.mapInto(s -> (double) s, altColumn);
altColumn.append(num);
return altColumn;
}
} else if (column.type() == ColumnType.LONG) {
Column<Long> longColumn = (Column<Long>) column;
if ((long) num == num) {
longColumn.append((long) num);
return null;
} else {
Column<Double> altColumn = DoubleColumn.create(column.name(), column.size());
altColumn = longColumn.mapInto(s -> (double) s, altColumn);
altColumn.append(num);
return altColumn;
}
} else if (column.type() == ColumnType.DOUBLE) {
Column<Double> doubleColumn = (Column<Double>) column;
doubleColumn.append(num);
return null;
} else if (column.type() == ColumnType.STRING) {
// If column has String type try to honor it and leave the value as an string as similar
// as posible as seen in Excel
Column<String> stringColumn = (Column<String>) column;
String dataFormatStyle = cell.getCellStyle().getDataFormatString();
String val;
if ("general".equalsIgnoreCase(dataFormatStyle)) {
val = new CellGeneralFormatter().format(cell.getNumericCellValue());
} else {
val = new CellNumberFormatter(dataFormatStyle).format(cell.getNumericCellValue());
}
stringColumn.append(val);
}
}
break;
case BOOLEAN:
if (column.type() == ColumnType.BOOLEAN) {
Column<Boolean> booleanColumn = (Column<Boolean>) column;
booleanColumn.append(cell.getBooleanCellValue());
return null;
} else if (column.type() == ColumnType.STRING) {
// If column has String type try to honor it and leave the value as an string as similar
// as posible as seen in Excel
Column<String> stringColumn = (Column<String>) column;
String val = new CellGeneralFormatter().format(cell.getBooleanCellValue());
stringColumn.append(val);
}
default:
break;
}
return null;
}
private Column<?> createColumn(
int colNum,
String name,
Sheet sheet,
int excelColNum,
TableRange tableRange,
XlsxReadOptions options) {
Column<?> column;
ColumnType columnType =
options
.columnTypeReadOptions()
.columnType(colNum, name)
.orElse(
calculateColumnTypeForColumn(sheet, excelColNum, tableRange)
.orElse(ColumnType.STRING));
column = columnType.create(name);
return column;
}
@Override
public Table read(Source source) {
return read(XlsxReadOptions.builder(source).build());
}
private Optional<ColumnType> calculateColumnTypeForColumn(
Sheet sheet, int col, TableRange tableRange) {
Set<CellType> cellTypes = getCellTypes(sheet, col, tableRange);
if (cellTypes.size() != 1) {
return Optional.empty();
}
CellType cellType = Iterables.get(cellTypes, 0);
switch (cellType) {
case STRING:
return Optional.of(ColumnType.STRING);
case NUMERIC:
return allNumericFieldsDateFormatted(sheet, col, tableRange)
? Optional.of(ColumnType.LOCAL_DATE_TIME)
: Optional.of(ColumnType.INTEGER);
case BOOLEAN:
return Optional.of(ColumnType.BOOLEAN);
default:
return Optional.empty();
}
}
private Set<CellType> getCellTypes(Sheet sheet, int col, TableRange tableRange) {
return IntStream.range(tableRange.startRow, tableRange.endRow + 1)
.mapToObj(sheet::getRow)
.filter(Objects::nonNull)
.map(row -> row.getCell(col))
.filter(Objects::nonNull)
.filter(cell -> !Optional.ofNullable(isBlank(cell)).orElse(false))
.map(
cell ->
cell.getCellType() == FORMULA
? cell.getCachedFormulaResultType()
: cell.getCellType())
.collect(Collectors.toSet());
}
private boolean allNumericFieldsDateFormatted(Sheet sheet, int col, TableRange tableRange) {
return IntStream.range(tableRange.startRow, tableRange.endRow + 1)
.mapToObj(sheet::getRow)
.filter(Objects::nonNull)
.map(row -> row.getCell(col))
.filter(Objects::nonNull)
.filter(
cell ->
cell.getCellType() == NUMERIC
|| (cell.getCellType() == FORMULA
&& cell.getCachedFormulaResultType() == NUMERIC))
.allMatch(DateUtil::isCellDateFormatted);
}
}