TextAndCSVConfig.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.csv;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import org.apache.tika.exception.TikaConfigException;
/**
 * Settings used when classifying a stream as csv, tsv or txt: how much of the
 * stream to examine, how confident the classification must be, and which
 * delimiter characters are known together with their names.
 * <p>
 * A new instance starts with the built-in delimiters (comma, tab, pipe and
 * semicolon), a mark limit of 20000 characters and a minimum confidence of 0.50.
 * Every instance owns its delimiter maps, so changing them on one instance
 * never affects another instance or the built-in defaults.
 */
public class TextAndCSVConfig implements Serializable {

    private static final int DEFAULT_MARK_LIMIT = 20000;
    private static final double DEFAULT_MIN_CONFIDENCE = 0.50;

    // Built-in delimiter tables. They are shared by the whole JVM, so they are
    // never handed out directly; each instance works on its own copy.
    private static final Map<Character, String> DELIMITER_TO_NAME_MAP = new HashMap<>();
    private static final Map<String, Character> NAME_TO_DELIMITER_MAP = new HashMap<>();

    static {
        DELIMITER_TO_NAME_MAP.put(',', "comma");
        DELIMITER_TO_NAME_MAP.put('\t', "tab");
        DELIMITER_TO_NAME_MAP.put('|', "pipe");
        DELIMITER_TO_NAME_MAP.put(';', "semicolon");

        // The name -> delimiter table is always the exact inverse of the table above.
        for (Map.Entry<Character, String> e : DELIMITER_TO_NAME_MAP.entrySet()) {
            NAME_TO_DELIMITER_MAP.put(e.getValue(), e.getKey());
        }
    }

    // Copies, not aliases: the getters below return these maps as-is, and a caller
    // that modifies the returned map must not be able to alter the shared defaults.
    private Map<String, Character> nameToDelimiterMap = new HashMap<>(NAME_TO_DELIMITER_MAP);
    private Map<Character, String> delimiterToNameMap = new HashMap<>(DELIMITER_TO_NAME_MAP);

    /**
     * This is the mark limit in characters (not bytes) to
     * read from the stream when classifying the stream as
     * csv, tsv or txt.
     */
    private int markLimit = DEFAULT_MARK_LIMIT;

    /**
     * Minimum confidence score that there's enough
     * evidence to determine csv/tsv vs. txt.
     */
    private double minConfidence = DEFAULT_MIN_CONFIDENCE;

    /**
     * Returns the map from delimiter name to delimiter character.
     *
     * @return the map held by this instance (not a copy)
     */
    public Map<String, Character> getNameToDelimiterMap() {
        return nameToDelimiterMap;
    }

    /**
     * Returns the map from delimiter character to delimiter name.
     *
     * @return the map held by this instance (not a copy)
     */
    public Map<Character, String> getDelimiterToNameMap() {
        return delimiterToNameMap;
    }

    /**
     * Set the name-to-delimiter map with Character values. The given map is
     * copied, and the delimiter-to-name map is rebuilt as its inverse. If several
     * names share one delimiter, only one of those names is kept in the inverse map.
     *
     * @param nameToDelimiterMap delimiter names mapped to their delimiter characters
     */
    public void setNameToDelimiterCharacterMap(Map<String, Character> nameToDelimiterMap) {
        // Build both maps completely before publishing either one, so a failure
        // part-way through cannot leave the two fields out of step.
        Map<String, Character> byName = new HashMap<>(nameToDelimiterMap);
        Map<Character, String> byDelimiter = new HashMap<>();
        for (Map.Entry<String, Character> e : nameToDelimiterMap.entrySet()) {
            byDelimiter.put(e.getValue(), e.getKey());
        }
        this.nameToDelimiterMap = byName;
        this.delimiterToNameMap = byDelimiter;
    }

    /**
     * Set the name-to-delimiter map from String values (for JSON deserialization).
     * Each String value must be exactly one character.
     *
     * @param map delimiter names mapped to single-character delimiter strings
     * @throws TikaConfigException if the map is null, or if any delimiter is null
     *                             or is not exactly one character long
     */
    public void setNameToDelimiterMap(Map<String, String> map) throws TikaConfigException {
        if (map == null) {
            throw new TikaConfigException("nameToDelimiterMap must not be null");
        }
        Map<String, Character> converted = new HashMap<>();
        for (Map.Entry<String, String> e : map.entrySet()) {
            String delimiter = e.getValue();
            // A null delimiter is a configuration mistake like any other bad
            // value; report it as one instead of failing with a NullPointerException.
            if (delimiter == null || delimiter.length() != 1) {
                throw new TikaConfigException("delimiter must be a single character: " + delimiter);
            }
            converted.put(e.getKey(), delimiter.charAt(0));
        }
        setNameToDelimiterCharacterMap(converted);
    }

    /**
     * @return the mark limit, in characters
     */
    public int getMarkLimit() {
        return markLimit;
    }

    /**
     * @param markLimit the mark limit, in characters (not bytes)
     */
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }

    /**
     * @return the minimum confidence score needed to determine csv/tsv vs. txt
     */
    public double getMinConfidence() {
        return minConfidence;
    }

    /**
     * @param minConfidence the minimum confidence score needed to determine csv/tsv vs. txt
     */
    public void setMinConfidence(double minConfidence) {
        this.minConfidence = minConfidence;
    }
}