TesseractOCRConfig.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import java.io.File;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FilenameUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.utils.StringUtils;
/**
 * Configuration for TesseractOCRParser.
 * <p>
 * Most setters validate their argument and throw {@link IllegalArgumentException}
 * on invalid input; the path setters throw {@link TikaConfigException} instead.
 * <p>
 * This class is not thread safe and must be synchronized externally.
 */
public class TesseractOCRConfig implements Serializable {

    private static final long serialVersionUID = -4861942486845757891L;
    private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRConfig.class);

    // Characters accepted by setPageSeparator (case-insensitive).
    private static final Pattern ALLOWABLE_PAGE_SEPARATORS_PATTERN =
            Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
    // Characters accepted in the keys and values given to addOtherTesseractConfig.
    private static final Pattern ALLOWABLE_OTHER_PARAMS_PATTERN =
            Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
    // A language string that starts or ends with '+'.
    private static final Pattern LEADING_OR_TRAILING_PLUS_PATTERN =
            Pattern.compile("\\+.*|.*\\+");
    // Syntax of a single language: a (possibly compound) language code or a script path.
    private static final Pattern LANGUAGE_SYNTAX_PATTERN = Pattern.compile(
            "([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+");
    // Page segmentation modes 0 through 13.
    private static final Pattern PAGE_SEG_MODE_PATTERN = Pattern.compile("[0-9]|10|11|12|13");
    // Characters accepted by setColorspace (case-insensitive).
    private static final Pattern COLORSPACE_PATTERN = Pattern.compile("(?i)^[-_A-Z0-9]+$");

    // whether or not to apply rotation calculated by the rotation.py script
    private boolean applyRotation = false;
    // colorspace of processed image.
    private String colorspace = "gray";
    // resolution of processed image (in dpi).
    private int density = 300;
    // number of bits in a color sample within a pixel.
    private int depth = 4;
    // enable image preprocessing with ImageMagick (optional)
    private boolean enableImagePreprocessing = false;
    // filter to be applied to the processed image.
    private String filter = "triangle";
    // Language dictionary to be used.
    private String language = "eng";
    // Maximum file size to submit file to ocr.
    private long maxFileSizeToOcr = Integer.MAX_VALUE;
    // Minimum file size to submit file to ocr.
    private long minFileSizeToOcr = 0;
    // The format of the ocr'ed output to be returned, txt or hocr.
    private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
    // Tesseract page segmentation mode.
    private String pageSegMode = "1";
    // See setPageSeparator.
    private String pageSeparator = "";
    // whether or not to preserve interword spacing
    private boolean preserveInterwordSpacing = false;
    // factor by which image is to be scaled.
    // TODO: we should make this dynamic depending on the size of the image
    // The current testRotation.png takes minutes to expand 900%
    private int resize = 200;
    // runtime switch to turn off OCR
    private boolean skipOcr = false;
    // Maximum time (seconds) to wait for the ocring process termination
    private int timeoutSeconds = 120;
    // See addOtherTesseractConfig.
    private HashMap<String, String> otherTesseractConfig = new HashMap<>();
    private boolean inlineContent = false;
    private boolean preloadLangs = false;
    private String tesseractPath = "";
    private String tessdataPath = "";
    private String imageMagickPath = "";

    /**
     * This takes a language string, parses it and then bins individual langs into
     * valid or invalid based on regexes against the language codes.
     * <p>
     * Whitespace is stripped from the string before it is split on {@code +}.
     * A blank string (as reported by {@code StringUtils.isBlank}) leaves both sets untouched.
     *
     * @param language     language string, e.g. {@code "chi_tra+chi_sim+script/Arabic"}
     * @param validLangs   receives every language that matches the expected syntax
     * @param invalidLangs receives every other language, suffixed with
     *                     {@code " (invalid syntax)"}
     * @throws IllegalArgumentException if the string starts or ends with {@code +}
     */
    public static void getLangs(String language, Set<String> validLangs,
                                Set<String> invalidLangs) {
        if (StringUtils.isBlank(language)) {
            return;
        }
        // Get rid of embedded spaces
        language = language.replaceAll("\\s", "");
        // Test for leading or trailing +
        if (LEADING_OR_TRAILING_PLUS_PATTERN.matcher(language).matches()) {
            throw new IllegalArgumentException(
                    "Invalid syntax - Can't start or end with +: " + language);
        }
        // Split on the + sign
        for (String lang : language.split("\\+")) {
            // Make sure each entry conforms to the correct syntax
            if (LANGUAGE_SYNTAX_PATTERN.matcher(lang).matches()) {
                validLangs.add(lang);
            } else {
                invalidLangs.add(lang + " (invalid syntax)");
            }
        }
    }

    /**
     * Normalizes a directory path and makes sure a non-empty result ends with the
     * platform file separator.
     *
     * @param path         the path supplied by the caller
     * @param propertyName name of the property being set; only used in the error message
     * @return the normalized path, empty or ending with {@link File#separator}
     * @throws TikaConfigException if {@code FilenameUtils.normalize} returns {@code null}
     *                             for the path (it does so for null or invalid paths)
     */
    private static String normalizeDirectoryPath(String path, String propertyName)
            throws TikaConfigException {
        String normalized = FilenameUtils.normalize(path);
        if (normalized == null) {
            // Fail with the declared exception instead of a NullPointerException below.
            throw new TikaConfigException("Invalid " + propertyName + ": " + path);
        }
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized += File.separator;
        }
        return normalized;
    }

    /**
     * @see #setLanguage(String language)
     */
    public String getLanguage() {
        return language;
    }

    /**
     * Set tesseract language dictionary to be used. Default is "eng".
     * languages are either:
     * <ol>
     *   <li>Nominally an ISO-639-2 code but compound codes are allowed separated by underscore:
     *   e.g., chi_tra_vert, aze_cyrl</li>
     *   <li>A file path in the script directory. The name starts with upper-case letter.
     *   Some of them have underscores and other upper-case letters: e.g., script/Arabic,
     *   script/HanS_vert, script/Japanese_vert, script/Canadian_Aboriginal</li>
     * </ol>
     * Multiple languages may be specified, separated by plus characters.
     * e.g. "chi_tra+chi_sim+script/Arabic"
     * <p>
     * The string is stored exactly as given; validation is delegated to
     * {@link #getLangs(String, Set, Set)}.
     *
     * @param languageString the language string to validate and store
     * @throws IllegalArgumentException if any language has invalid syntax or the string
     *                                  starts or ends with {@code +}
     */
    public void setLanguage(String languageString) {
        Set<String> invalidCodes = new HashSet<>();
        Set<String> validCodes = new HashSet<>();
        getLangs(languageString, validCodes, invalidCodes);
        if (!invalidCodes.isEmpty()) {
            throw new IllegalArgumentException("Invalid language code(s): " + invalidCodes);
        }
        this.language = languageString;
    }

    /**
     * @see #setPageSegMode(String pageSegMode)
     */
    public String getPageSegMode() {
        return pageSegMode;
    }

    /**
     * Set tesseract page segmentation mode.
     * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
     *
     * @param pageSegMode a mode from "0" to "13"
     * @throws IllegalArgumentException if the value is null or not one of those modes
     */
    public void setPageSegMode(String pageSegMode) {
        if (pageSegMode == null || !PAGE_SEG_MODE_PATTERN.matcher(pageSegMode).matches()) {
            throw new IllegalArgumentException("Invalid page segmentation mode");
        }
        this.pageSegMode = pageSegMode;
    }

    /**
     * @see #setPageSeparator(String pageSeparator)
     */
    public String getPageSeparator() {
        return pageSeparator;
    }

    /**
     * The page separator to use in plain text output. This corresponds to Tesseract's
     * page_separator config option.
     * The default here is the empty string (i.e. no page separators). Note that this is also
     * the default in
     * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character.
     * We are overriding
     * Tesseract 4.0's default here.
     * <p>
     * A null or blank value is ignored and leaves the current separator unchanged.
     *
     * @param pageSeparator the separator; letters, digits, '-', '_', '/' and '.' are accepted
     * @throws IllegalArgumentException if the value contains other characters
     */
    public void setPageSeparator(String pageSeparator) {
        if (pageSeparator == null || pageSeparator.isBlank()) {
            return;
        }
        Matcher m = ALLOWABLE_PAGE_SEPARATORS_PATTERN.matcher(pageSeparator);
        if (!m.find()) {
            throw new IllegalArgumentException(pageSeparator + " contains illegal characters.\n" +
                    "If you trust this value, set it with setTrustedPageSeparator");
        }
        // Assign the field directly instead of calling setTrustedPageSeparator:
        // RuntimeConfig overrides that method to always throw, which would make
        // this validated setter unusable on a RuntimeConfig.
        this.pageSeparator = pageSeparator;
    }

    /**
     * Same as {@link #setPageSeparator(String)} but does not perform
     * any checks on the string.
     *
     * @param pageSeparator the separator to store as-is
     */
    public void setTrustedPageSeparator(String pageSeparator) {
        this.pageSeparator = pageSeparator;
    }

    /**
     * @return whether or not to maintain interword spacing.
     */
    public boolean isPreserveInterwordSpacing() {
        return preserveInterwordSpacing;
    }

    /**
     * Whether or not to maintain interword spacing. Default is <code>false</code>.
     *
     * @param preserveInterwordSpacing true to maintain interword spacing
     */
    public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
        this.preserveInterwordSpacing = preserveInterwordSpacing;
    }

    /**
     * @see #setMinFileSizeToOcr(long minFileSizeToOcr)
     */
    public long getMinFileSizeToOcr() {
        return minFileSizeToOcr;
    }

    /**
     * Set minimum file size to submit file to ocr.
     * Default is 0.
     */
    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
        this.minFileSizeToOcr = minFileSizeToOcr;
    }

    /**
     * @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
     */
    public long getMaxFileSizeToOcr() {
        return maxFileSizeToOcr;
    }

    /**
     * Set maximum file size to submit file to ocr.
     * Default is Integer.MAX_VALUE.
     */
    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
        this.maxFileSizeToOcr = maxFileSizeToOcr;
    }

    /**
     * @return timeout value for Tesseract
     * @see #setTimeoutSeconds(int timeout)
     */
    public int getTimeoutSeconds() {
        return timeoutSeconds;
    }

    /**
     * Set maximum time (seconds) to wait for the ocring process to terminate.
     * Default value is 120s.
     */
    public void setTimeoutSeconds(int timeoutSeconds) {
        this.timeoutSeconds = timeoutSeconds;
    }

    /**
     * @see #setOutputType(OUTPUT_TYPE outputType)
     */
    public OUTPUT_TYPE getOutputType() {
        return outputType;
    }

    /**
     * Set output type from ocr process. Default is "txt", but can be "hocr".
     * Default value is {@link OUTPUT_TYPE#TXT}.
     */
    public void setOutputType(OUTPUT_TYPE outputType) {
        this.outputType = outputType;
    }

    /**
     * String variant of {@link #setOutputType(OUTPUT_TYPE)}; the comparison is
     * case-insensitive.
     *
     * @param outputType "txt" or "hocr"
     * @throws IllegalArgumentException if the value is null or neither of the two
     */
    public void setOutputType(String outputType) {
        if (outputType == null) {
            throw new IllegalArgumentException("outputType must not be null");
        }
        String lc = outputType.toLowerCase(Locale.US);
        if ("txt".equals(lc)) {
            setOutputType(OUTPUT_TYPE.TXT);
        } else if ("hocr".equals(lc)) {
            setOutputType(OUTPUT_TYPE.HOCR);
        } else {
            throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'");
        }
    }

    /**
     * @return image processing is enabled or not
     * @see #setEnableImagePreprocessing(boolean)
     */
    public boolean isEnableImagePreprocessing() {
        return enableImagePreprocessing;
    }

    /**
     * Set the value to true if processing is to be enabled.
     * Default value is false.
     */
    public void setEnableImagePreprocessing(boolean enableImagePreprocessing) {
        this.enableImagePreprocessing = enableImagePreprocessing;
    }

    /**
     * @return the density
     */
    public int getDensity() {
        return density;
    }

    /**
     * @param density the density to set. Valid range of values is 150-1200.
     *                Default value is 300.
     * @throws IllegalArgumentException if the value is outside that range
     */
    public void setDensity(int density) {
        if (density < 150 || density > 1200) {
            throw new IllegalArgumentException(
                    "Invalid density value. Valid range of values is 150-1200.");
        }
        this.density = density;
    }

    /**
     * @return the depth
     */
    public int getDepth() {
        return depth;
    }

    /**
     * @param depth the depth to set. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096.
     *              Default value is 4.
     * @throws IllegalArgumentException if the value is not one of those listed
     */
    public void setDepth(int depth) {
        int[] allowedValues = {2, 4, 8, 16, 32, 64, 256, 4096};
        for (int allowedValue : allowedValues) {
            if (depth == allowedValue) {
                this.depth = depth;
                return;
            }
        }
        throw new IllegalArgumentException(
                "Invalid depth value. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096.");
    }

    /**
     * @return the colorspace
     */
    public String getColorspace() {
        return colorspace;
    }

    /**
     * @param colorspace the colorspace to set; only letters, digits, '-' and '_' are accepted.
     *                   Default value is gray.
     * @throws IllegalArgumentException if the value is null or contains other characters
     */
    public void setColorspace(String colorspace) {
        if (colorspace == null) {
            throw new IllegalArgumentException("Colorspace value cannot be null.");
        }
        if (!COLORSPACE_PATTERN.matcher(colorspace).matches()) {
            throw new IllegalArgumentException(
                    "colorspace must match this pattern: (?i)^[-_A-Z0-9]+$");
        }
        this.colorspace = colorspace;
    }

    /**
     * @return the filter
     */
    public String getFilter() {
        return filter;
    }

    /**
     * @param filter the filter to set. Valid values are point, hermite, cubic, box, gaussian,
     *               catrom, triangle, quadratic and mitchell (compared case-insensitively).
     *               Default value is triangle.
     * @throws IllegalArgumentException if the value is null or not one of those listed
     */
    public void setFilter(String filter) {
        // Must be a reference comparison: filter.equals(null) is never true and
        // would itself throw a NullPointerException for a null argument.
        if (filter == null) {
            throw new IllegalArgumentException(
                    "Filter value cannot be null. Valid values are point, hermite, " +
                            "cubic, box, gaussian, catrom, triangle, quadratic and mitchell.");
        }
        String[] allowedFilters =
                {"Point", "Hermite", "Cubic", "Box", "Gaussian", "Catrom", "Triangle",
                        "Quadratic", "Mitchell"};
        for (String allowedFilter : allowedFilters) {
            if (filter.equalsIgnoreCase(allowedFilter)) {
                // The value is stored with the caller's casing.
                this.filter = filter;
                return;
            }
        }
        throw new IllegalArgumentException(
                "Invalid filter value. Valid values are point, hermite, " +
                        "cubic, box, gaussian, catrom, triangle, quadratic and mitchell.");
    }

    /**
     * @return whether OCR is switched off
     * @see #setSkipOcr(boolean)
     */
    public boolean isSkipOcr() {
        return skipOcr;
    }

    /**
     * If you want to turn off OCR at run time for a specific file,
     * set this to <code>true</code>
     *
     * @param skipOcr true to skip OCR. Default is false.
     */
    public void setSkipOcr(boolean skipOcr) {
        this.skipOcr = skipOcr;
    }

    /**
     * @return the resize
     */
    public int getResize() {
        return resize;
    }

    /**
     * @param resize the resize to set. Valid values are the multiples of 100 from
     *               100 to 900. Default value is 200.
     * @throws IllegalArgumentException if the value is not one of those multiples
     */
    public void setResize(int resize) {
        boolean multipleOfHundredInRange = resize >= 100 && resize <= 900 && resize % 100 == 0;
        if (!multipleOfHundredInRange) {
            throw new IllegalArgumentException(
                    "Invalid resize value. Valid range of values is 100-900.");
        }
        this.resize = resize;
    }

    /**
     * @return Whether or not a rotation value should be calculated and passed to ImageMagick
     * before performing OCR.
     */
    public boolean isApplyRotation() {
        return this.applyRotation;
    }

    /**
     * Sets the inlineContent flag. Default is false.
     * NOTE(review): how the parser interprets this flag is not visible in this class.
     *
     * @param inlineContent the new flag value
     */
    public void setInlineContent(boolean inlineContent) {
        this.inlineContent = inlineContent;
    }

    /**
     * @return the inlineContent flag
     * @see #setInlineContent(boolean)
     */
    public boolean isInlineContent() {
        return inlineContent;
    }

    /**
     * @return the preloadLangs flag
     * @see #setPreloadLangs(boolean)
     */
    public boolean isPreloadLangs() {
        return preloadLangs;
    }

    /**
     * Sets the preloadLangs flag. Default is false.
     * NOTE(review): presumably controls whether available languages are loaded up front;
     * confirm against the parser.
     *
     * @param preloadLangs the new flag value
     */
    public void setPreloadLangs(boolean preloadLangs) {
        this.preloadLangs = preloadLangs;
    }

    /**
     * Sets whether or not a rotation value should be calculated and passed to ImageMagick.
     *
     * @param applyRotation to calculate and apply rotation, false to skip. Default is false
     */
    public void setApplyRotation(boolean applyRotation) {
        this.applyRotation = applyRotation;
    }

    /**
     * Returns the internal map of other Tesseract config parameters (not a copy).
     *
     * @see #addOtherTesseractConfig(String, String)
     */
    public Map<String, String> getOtherTesseractConfig() {
        return otherTesseractConfig;
    }

    /**
     * Add a map of other Tesseract config parameters to those already configured.
     * Entries are validated and merged one by one via
     * {@link #addOtherTesseractConfig(String, String)}; existing entries are not cleared,
     * and a null map is ignored.
     * Each key-value pair is passed to Tesseract using its -c command line option.
     * To see the possible options, run tesseract --print-parameters.
     *
     * @param otherTesseractConfig map of key-value pairs
     * @throws IllegalArgumentException if any key or value is null or contains
     *                                  illegal characters
     */
    public void setOtherTesseractConfig(Map<String, String> otherTesseractConfig) {
        if (otherTesseractConfig != null) {
            for (Map.Entry<String, String> entry : otherTesseractConfig.entrySet()) {
                addOtherTesseractConfig(entry.getKey(), entry.getValue());
            }
        }
    }

    /**
     * Add a key-value pair to pass to Tesseract using its -c command line option.
     * To see the possible options, run tesseract --print-parameters.
     * <p>
     * You may also add these parameters in TesseractOCRConfig.properties; any
     * key-value pair in the properties file where the key contains an underscore
     * is passed directly to Tesseract.
     *
     * @param key   parameter name; letters, digits, '-', '_', '/' and '.' are accepted
     * @param value parameter value; same character restrictions as the key
     * @throws IllegalArgumentException if the key or value is null or contains
     *                                  illegal characters
     */
    public void addOtherTesseractConfig(String key, String value) {
        if (key == null) {
            throw new IllegalArgumentException("key must not be null");
        }
        if (value == null) {
            throw new IllegalArgumentException("value must not be null");
        }
        Matcher m = ALLOWABLE_OTHER_PARAMS_PATTERN.matcher(key);
        if (!m.find()) {
            throw new IllegalArgumentException("Key contains illegal characters: " + key);
        }
        m.reset(value);
        if (!m.find()) {
            throw new IllegalArgumentException("Value contains illegal characters: " + value);
        }
        otherTesseractConfig.put(key.trim(), value.trim());
    }

    /**
     * @return the tesseract path; empty by default
     * @see #setTesseractPath(String)
     */
    public String getTesseractPath() {
        return tesseractPath;
    }

    /**
     * Sets the tesseract path. The path is normalized and, unless empty, stored with a
     * trailing file separator.
     *
     * @param tesseractPath the path to store
     * @throws TikaConfigException if the path is null or cannot be normalized
     */
    public void setTesseractPath(String tesseractPath) throws TikaConfigException {
        this.tesseractPath = normalizeDirectoryPath(tesseractPath, "tesseractPath");
    }

    /**
     * @return the tessdata path; empty by default
     * @see #setTessdataPath(String)
     */
    public String getTessdataPath() {
        return tessdataPath;
    }

    /**
     * Sets the tessdata path. The path is normalized and, unless empty, stored with a
     * trailing file separator.
     *
     * @param tessdataPath the path to store
     * @throws TikaConfigException if the path is null or cannot be normalized
     */
    public void setTessdataPath(String tessdataPath) throws TikaConfigException {
        this.tessdataPath = normalizeDirectoryPath(tessdataPath, "tessdataPath");
    }

    /**
     * @return the ImageMagick path; empty by default
     * @see #setImageMagickPath(String)
     */
    public String getImageMagickPath() {
        return imageMagickPath;
    }

    /**
     * Sets the ImageMagick path. The path is normalized and, unless empty, stored with a
     * trailing file separator.
     *
     * @param imageMagickPath the path to store
     * @throws TikaConfigException if the path is null or cannot be normalized
     */
    public void setImageMagickPath(String imageMagickPath) throws TikaConfigException {
        this.imageMagickPath = normalizeDirectoryPath(imageMagickPath, "imageMagickPath");
    }

    /**
     * Runtime-only TesseractOCRConfig that prevents modification of paths.
     * Used to enforce immutability of parser-level paths during parse-time configuration.
     * <p>
     * This class is deserialized by ConfigDeserializer (in tika-serialization) which uses
     * Jackson to populate fields via setters. If the JSON contains any path fields, the
     * overridden setters will throw TikaConfigException.
     * <p>
     * Blank path values are silently ignored by the overridden path setters.
     * The unchecked page separator setter is rejected as well.
     */
    public static class RuntimeConfig extends TesseractOCRConfig {

        public RuntimeConfig() {
            super();
        }

        /**
         * @throws TikaConfigException if a non-blank path is supplied
         */
        @Override
        public void setTesseractPath(String tesseractPath) throws TikaConfigException {
            if (!StringUtils.isBlank(tesseractPath)) {
                throw new TikaConfigException("Cannot modify tesseractPath at runtime. " +
                        "Paths must be configured at parser initialization time.");
            }
        }

        /**
         * @throws TikaConfigException if a non-blank path is supplied
         */
        @Override
        public void setTessdataPath(String tessdataPath) throws TikaConfigException {
            if (!StringUtils.isBlank(tessdataPath)) {
                throw new TikaConfigException("Cannot modify tessdataPath at runtime. " +
                        "Paths must be configured at parser initialization time.");
            }
        }

        /**
         * @throws TikaConfigException if a non-blank path is supplied
         */
        @Override
        public void setImageMagickPath(String imageMagickPath) throws TikaConfigException {
            if (!StringUtils.isBlank(imageMagickPath)) {
                throw new TikaConfigException("Cannot modify imageMagickPath at runtime. " +
                        "Paths must be configured at parser initialization time.");
            }
        }

        /**
         * Always rejects the call: unchecked separators are not allowed at runtime.
         *
         * @throws IllegalArgumentException always
         */
        @Override
        public void setTrustedPageSeparator(String pageSeparator) {
            throw new IllegalArgumentException("Cannot use setTrustedPageSeparator at runtime. " +
                    "Use setPageSeparator instead.");
        }
    }

    /**
     * Formats of the OCR output: plain text or hOCR.
     */
    public enum OUTPUT_TYPE {
        TXT, HOCR
    }
}