CharSoupDetectorConfig.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup;
import java.util.Locale;
import java.util.Map;
/**
* Immutable configuration for {@link CharSoupLanguageDetector}.
* <p>
* Instances are constructed via {@link #DEFAULT} (for typical use) or
* {@link #fromMap(Map)} (for JSON-driven configuration via ParseContext).
* There are no setters ��� create a new instance if different parameters
* are needed.
* <p>
* JSON keys (all optional; unrecognised keys are ignored):
* <pre>
* {
* "strategy" : "AUTOMATIC", // AUTOMATIC | SHORT_TEXT | STANDARD
* "lengthThreshold" : 200, // chars below which short-text model is preferred
* "featureThreshold" : 200 // n-gram emissions below which short-text model is preferred
* }
* </pre>
*
* @see CharSoupLanguageDetector.Strategy
*/
public final class CharSoupDetectorConfig {
/**
* Default configuration: automatic model selection, default thresholds.
* TODO: tune lengthThreshold and featureThreshold from ablation crossover data.
*/
public static final CharSoupDetectorConfig DEFAULT = new CharSoupDetectorConfig(
CharSoupLanguageDetector.Strategy.AUTOMATIC,
CharSoupLanguageDetector.SHORT_TEXT_LENGTH_THRESHOLD,
CharSoupLanguageDetector.SHORT_TEXT_FEATURE_THRESHOLD);
private final CharSoupLanguageDetector.Strategy strategy;
private final int lengthThreshold;
private final int featureThreshold;
private CharSoupDetectorConfig(CharSoupLanguageDetector.Strategy strategy,
int lengthThreshold,
int featureThreshold) {
if (strategy == null) {
throw new IllegalArgumentException("strategy must not be null");
}
if (lengthThreshold < 0 || featureThreshold < 0) {
throw new IllegalArgumentException("thresholds must be non-negative");
}
this.strategy = strategy;
this.lengthThreshold = lengthThreshold;
this.featureThreshold = featureThreshold;
}
/**
* Deserialize from a plain string-to-object map (as produced by a JSON parser).
* Unrecognised keys are silently ignored; missing keys use DEFAULT values.
*
* @param map JSON-decoded config map; may be null or empty
* @return configured instance
* @throws IllegalArgumentException if a value is present but invalid
*/
public static CharSoupDetectorConfig fromMap(Map<String, Object> map) {
if (map == null || map.isEmpty()) {
return DEFAULT;
}
CharSoupLanguageDetector.Strategy strategy = DEFAULT.strategy;
int lengthThreshold = DEFAULT.lengthThreshold;
int featureThreshold = DEFAULT.featureThreshold;
Object s = map.get("strategy");
if (s != null) {
strategy = CharSoupLanguageDetector.Strategy.valueOf(
s.toString().toUpperCase(Locale.ROOT));
}
Object lt = map.get("lengthThreshold");
if (lt != null) {
lengthThreshold = ((Number) lt).intValue();
}
Object ft = map.get("featureThreshold");
if (ft != null) {
featureThreshold = ((Number) ft).intValue();
}
return new CharSoupDetectorConfig(strategy, lengthThreshold, featureThreshold);
}
public CharSoupLanguageDetector.Strategy getStrategy() {
return strategy;
}
public int getLengthThreshold() {
return lengthThreshold;
}
public int getFeatureThreshold() {
return featureThreshold;
}
@Override
public String toString() {
return "CharSoupDetectorConfig{strategy=" + strategy
+ ", lengthThreshold=" + lengthThreshold
+ ", featureThreshold=" + featureThreshold + "}";
}
}