// TextAndCSVParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.csv;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Unless the {@link TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} is set,
* this parser tries to assess whether the file is a text file, csv or tsv.
* If the detector detects regularity in column numbers and/or encapsulated cells,
* this parser will apply the {@link org.apache.commons.csv.CSVParser};
* otherwise, it will treat the contents as text.
* <p>
* If there is a csv parse exception during detection, the parser sets
* the {@link Metadata#CONTENT_TYPE} to {@link MediaType#TEXT_PLAIN}
* and treats the file as {@link MediaType#TEXT_PLAIN}.
* </p>
* <p>
* If there is a csv parse exception during the parse, the parser
* writes what's left of the stream as if it were text and then throws
* an exception. As of this writing, the content that was buffered by the underlying
* {@link org.apache.commons.csv.CSVParser} is lost.
* </p>
*/
@TikaComponent(name = "text-and-csv-parser")
public class TextAndCSVParser extends AbstractEncodingDetectorParser {

    static final MediaType CSV = MediaType.text("csv");
    static final MediaType TSV = MediaType.text("tsv");

    // Prefix used to namespace the csv-specific metadata properties below.
    private static final String CSV_PREFIX = "csv";
    // Parameter names read from / written to the media type
    // (e.g. text/csv; charset=UTF-8; delimiter=comma).
    private static final String CHARSET = "charset";
    private static final String DELIMITER = "delimiter";

    /**
     * Human-readable name of the delimiter that was used (looked up in the
     * config's delimiter-to-name map), recorded in the metadata.
     */
    public static final Property DELIMITER_PROPERTY = Property.externalText(
            CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + DELIMITER);

    /**
     * If the file is detected as a csv/tsv, this is the number of columns in the first row.
     */
    public static final Property NUM_COLUMNS = Property.externalInteger(
            CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "num_columns");

    /**
     * If the file is detected as a csv/tsv, this is the number of rows if the file
     * is successfully read (e.g. no encapsulation exceptions, etc).
     */
    public static final Property NUM_ROWS = Property.externalInteger(
            CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "num_rows");

    // XHTML element names used when emitting the parsed rows as a table.
    private static final String TD = "td";
    private static final String TR = "tr";
    private static final String TABLE = "table";

    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .unmodifiableSet(new HashSet<>(Arrays.asList(CSV, TSV, MediaType.TEXT_PLAIN)));

    // Fallback configuration, used when the ParseContext does not carry
    // a TextAndCSVConfig of its own (see parseInternal).
    private final TextAndCSVConfig defaultTextAndCSVConfig;

    public TextAndCSVParser() {
        this.defaultTextAndCSVConfig = new TextAndCSVConfig();
    }

    public TextAndCSVParser(TextAndCSVConfig textAndCSVConfig) {
        this.defaultTextAndCSVConfig = textAndCSVConfig;
    }

    /**
     * This constructor is called by the JSON-based configuration
     * loader.
     */
    public TextAndCSVParser(JsonConfig jsonConfig) throws TikaConfigException {
        this(ConfigDeserializer.buildConfig(jsonConfig, TextAndCSVConfig.class));
    }

    public TextAndCSVParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
        this.defaultTextAndCSVConfig = new TextAndCSVConfig();
    }

    /**
     * Streams everything remaining in {@code reader} into a single
     * {@code <p>} element on the given handler.
     */
    private static void handleText(Reader reader, XHTMLContentHandler xhtml)
            throws SAXException, IOException {
        xhtml.startElement("p");
        char[] buffer = new char[4096];
        int n = reader.read(buffer);
        while (n != -1) {
            xhtml.characters(buffer, 0, n);
            n = reader.read(buffer);
        }
        xhtml.endElement("p");
    }

    /**
     * @return true if the base type of {@code mediaType} is text/csv or
     * text/tsv; false for null or any other type.
     */
    static boolean isCSVOrTSV(MediaType mediaType) {
        if (mediaType == null) {
            return false;
        }
        return mediaType.getBaseType().equals(TSV) || mediaType.getBaseType().equals(CSV);
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {
        // Shield the caller's stream so that readers/parsers created below
        // cannot close it; the shield is always removed afterwards.
        tis.setCloseShield();
        try {
            parseInternal(tis, handler, metadata, context);
        } finally {
            tis.removeCloseShield();
        }
    }

    /**
     * Main parse flow: resolve charset/media type/delimiter (from the user
     * override or by detection), update the metadata, then either stream the
     * content as plain text or parse it as a csv/tsv table.
     */
    private void parseInternal(TikaInputStream tis, ContentHandler handler, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {
        // A config supplied in the ParseContext wins over the parser's default.
        TextAndCSVConfig textAndCSVConfig = context.get(TextAndCSVConfig.class, defaultTextAndCSVConfig);
        // Charset/media type/delimiter from the user's content-type override, if any.
        CSVParams params = getOverride(metadata, textAndCSVConfig);
        Reader reader;
        Charset charset;
        if (!params.isComplete()) {
            // Something is still missing; detect() fills in what it can.
            reader = detect(params, textAndCSVConfig, tis, metadata, context);
            if (params.getCharset() != null) {
                charset = params.getCharset();
            } else {
                // detect() only leaves the params charset unset when it
                // returned an AutoDetectReader, so this cast is safe.
                charset = ((AutoDetectReader) reader).getCharset();
            }
        } else {
            reader = new BufferedReader(new InputStreamReader(tis, params.getCharset()));
            charset = params.getCharset();
        }
        updateMetadata(params, metadata, textAndCSVConfig);
        //if text or a non-csv/tsv category of text
        //treat this as text and be done
        //TODO -- if it was detected as a non-csv subtype of text
        if (!params.getMediaType().getBaseType().equals(CSV) &&
                !params.getMediaType().getBaseType().equals(TSV)) {
            handleText(reader, charset, handler, metadata, context);
            return;
        }
        CSVFormat csvFormat = CSVFormat.EXCEL.builder().setDelimiter(params.getDelimiter()).get();
        // NOTE(review): if the delimiter is not a key in the config's
        // delimiter-to-name map, this sets a null value -- confirm intended.
        metadata.set(DELIMITER_PROPERTY, textAndCSVConfig.getDelimiterToNameMap().get(csvFormat.getDelimiterString().charAt(0)));
        XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata, context);
        int totalRows = 0;
        try (CSVParser commonsParser = CSVParser.builder().setReader(reader).setFormat(csvFormat).get()) {
            xhtmlContentHandler.startDocument();
            xhtmlContentHandler.startElement(TABLE);
            int firstRowColCount = 0;
            try {
                for (CSVRecord row : commonsParser) {
                    xhtmlContentHandler.startElement(TR);
                    for (String cell : row) {
                        if (totalRows == 0) {
                            // Column count is taken from the first row only.
                            firstRowColCount++;
                        }
                        xhtmlContentHandler.startElement(TD);
                        xhtmlContentHandler.characters(cell);
                        xhtmlContentHandler.endElement(TD);
                    }
                    xhtmlContentHandler.endElement(TR);
                    if (totalRows == 0) {
                        metadata.set(NUM_COLUMNS, firstRowColCount);
                    }
                    totalRows++;
                }
                // Only set when every row was read without exception.
                metadata.set(NUM_ROWS, totalRows);
            } catch (UncheckedIOException e) {
                // commons-csv wraps IOExceptions thrown during iteration in
                // UncheckedIOException; an "encapsulated" cause message is
                // treated as a quoting/encapsulation problem in the data.
                // NOTE(review): matching on message text is fragile across
                // commons-csv versions -- confirm against the pinned version.
                if (e.getCause() != null && e.getCause().getMessage() != null &&
                        e.getCause().getMessage().contains("encapsulated")) {
                    //if there's a parse exception
                    //try to get the rest of the content...treat it as text for now
                    //There will be some content lost because of buffering.
                    //TODO -- figure out how to improve this
                    xhtmlContentHandler.endElement(TABLE);
                    xhtmlContentHandler.startElement("div", "name", "after exception");
                    handleText(reader, xhtmlContentHandler);
                    xhtmlContentHandler.endElement("div");
                    xhtmlContentHandler.endDocument();
                    //TODO -- consider dumping what's left in the reader as text
                    throw new TikaException("exception parsing the csv", e);
                } else {
                    // Not an encapsulation problem: rethrow, preserving the
                    // most specific cause available.
                    if (e.getCause() != null) {
                        throw new TikaException("exception parsing the csv", e.getCause());
                    } else {
                        throw new TikaException("exception parsing the csv", e);
                    }
                }
            }
            xhtmlContentHandler.endElement(TABLE);
            xhtmlContentHandler.endDocument();
        }
    }

    /**
     * Writes the remaining content as plain text, after recording the
     * resolved content type (with charset parameter) and the content
     * encoding in the metadata.
     */
    private void handleText(Reader reader, Charset charset, ContentHandler handler,
            Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException {
        // Automatically detect the character encoding
        //try to get detected content type; could be a subclass of text/plain
        //such as vcal, etc.
        String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
        MediaType mediaType = MediaType.TEXT_PLAIN;
        if (incomingMime != null) {
            MediaType tmpMediaType = MediaType.parse(incomingMime);
            if (tmpMediaType != null) {
                mediaType = tmpMediaType;
            }
        }
        MediaType type = new MediaType(mediaType, charset);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();
        handleText(reader, xhtml);
        xhtml.endDocument();
    }

    /**
     * Fills in the missing pieces of {@code params} (charset, media type,
     * delimiter) and returns the reader to be used for the actual parse.
     */
    private Reader detect(CSVParams params, TextAndCSVConfig textAndCSVConfig, TikaInputStream tis, Metadata metadata,
            ParseContext context) throws IOException, TikaException {
        //if the file was already identified as something other than
        //.txt, .csv or .tsv, don't even try to treat it as csv
        String mediaString = metadata.get(Metadata.CONTENT_TYPE);
        if (mediaString != null) {
            MediaType mediaType = MediaType.parse(mediaString);
            if (!SUPPORTED_TYPES.contains(mediaType.getBaseType())) {
                params.setMediaType(mediaType);
                return new AutoDetectReader(tis, metadata,
                        getEncodingDetector(context));
            }
        }
        Reader reader;
        if (params.getCharset() == null) {
            // No charset yet: let the AutoDetectReader detect one.
            reader = new AutoDetectReader(tis, metadata,
                    getEncodingDetector(context));
            params.setCharset(((AutoDetectReader) reader).getCharset());
            if (params.isComplete()) {
                return reader;
            }
        } else {
            reader = new BufferedReader(
                    new InputStreamReader(tis, params.getCharset()));
        }
        // Still no delimiter: sniff the stream for the best candidate among
        // the configured delimiters; the sniffer also decides csv vs tsv.
        if (params.getDelimiter() == null &&
                (params.getMediaType() == null || isCSVOrTSV(params.getMediaType()))) {
            CSVSniffer sniffer = new CSVSniffer(textAndCSVConfig.getMarkLimit(),
                    textAndCSVConfig.getDelimiterToNameMap().keySet(),
                    textAndCSVConfig.getMinConfidence());
            CSVResult result = sniffer.getBest(reader, metadata);
            params.setMediaType(result.getMediaType());
            params.setDelimiter(result.getDelimiter());
        }
        return reader;
    }

    /**
     * Builds CSVParams from the user's content-type override, if one is set.
     * Returns empty params when there is no override or it cannot be parsed.
     */
    private CSVParams getOverride(Metadata metadata, TextAndCSVConfig textAndCSVConfig) {
        String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE);
        if (override == null) {
            return new CSVParams();
        }
        MediaType mediaType = MediaType.parse(override);
        if (mediaType == null) {
            return new CSVParams();
        }
        String charsetString = mediaType.getParameters().get(CHARSET);
        Charset charset = null;
        if (charsetString != null) {
            try {
                charset = Charset.forName(charsetString);
            } catch (UnsupportedCharsetException e) {
                //swallow -- an unrecognized charset in the override leaves
                //charset null, and detection runs downstream in detect()
            }
        }
        if (!isCSVOrTSV(mediaType)) {
            return new CSVParams(mediaType, charset);
        }
        String delimiterName = mediaType.getParameters().get(DELIMITER);
        if (delimiterName == null) {
            return new CSVParams(mediaType, charset);
        }
        // The delimiter parameter may be given by configured name
        // (e.g. "comma") or as a literal single character.
        if (textAndCSVConfig.getNameToDelimiterMap().containsKey(delimiterName)) {
            return new CSVParams(mediaType, charset,
                    (char) textAndCSVConfig.getNameToDelimiterMap().get(delimiterName));
        }
        if (delimiterName.length() == 1) {
            return new CSVParams(mediaType, charset, delimiterName.charAt(0));
        }
        //TODO: log bad/unrecognized delimiter string
        return new CSVParams(mediaType, charset);
    }

    /**
     * Records the final Content-Type -- with charset and delimiter
     * parameters -- and the (deprecated) content encoding in the metadata.
     */
    private void updateMetadata(CSVParams params, Metadata metadata, TextAndCSVConfig textAndCSVConfig) {
        MediaType mediaType = null;
        if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) {
            mediaType = MediaType.TEXT_PLAIN;
        } else if (params.getDelimiter() != null) {
            // A tab delimiter means tsv; any other delimiter means csv.
            if (params.getDelimiter() == '\t') {
                mediaType = TSV;
            } else {
                mediaType = CSV;
            }
        } else {
            if (metadata.get(Metadata.CONTENT_TYPE) != null) {
                mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
            }
        }
        // NOTE(review): mediaType can still be null here (no delimiter and no
        // CONTENT_TYPE in the metadata) when new MediaType(...) is reached
        // below -- confirm callers guarantee one of the branches above fires.
        Map<String, String> attrs = new HashMap<>();
        if (params.getCharset() != null) {
            attrs.put(CHARSET, params.getCharset().name());
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
        }
        if (!MediaType.TEXT_PLAIN.equals(mediaType) && params.getDelimiter() != null) {
            // Record the delimiter by its configured name when known,
            // otherwise by its numeric code point.
            if (textAndCSVConfig.getDelimiterToNameMap().containsKey(params.getDelimiter())) {
                attrs.put(DELIMITER, textAndCSVConfig.getDelimiterToNameMap().get(params.getDelimiter()));
            } else {
                attrs.put(DELIMITER, Integer.toString((int) params.getDelimiter()));
            }
        }
        MediaType type = new MediaType(mediaType, attrs);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
    }
}