CompareDetectors.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup.tools;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.tika.langdetect.charsoup.CharSoupDetectorConfig;
import org.apache.tika.langdetect.charsoup.CharSoupLanguageDetector;
import org.apache.tika.langdetect.charsoup.CharSoupModel;
import org.apache.tika.langdetect.charsoup.ConfusableGroups;
import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;
/**
* Compares CharSoup against OpenNLP, Lingua, and Optimaize on a test split.
* <p>
* Results are broken out by text length:
* <ul>
* <li><b>short</b> — sentences with ≤ {@value #SHORT_CHAR_LIMIT} characters</li>
* <li><b>full</b> — sentences with &gt; {@value #SHORT_CHAR_LIMIT} characters</li>
* <li><b>all</b> — combined</li>
* </ul>
* Each bucket reports accuracy, total time, and throughput (sentences/sec).
* Approximate heap usage for each model is measured around model loading.
* <p>
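* Usage: {@code CompareDetectors <testSplitFile> [--strategy STANDARD|SHORT_TEXT|AUTOMATIC]
* [outputReport] [threads]}, or in legacy form
* {@code CompareDetectors <testSplitFile> <charSoupModelFile> [outputReport] [threads]}.
* <p>
* Evaluation always runs through {@code CharSoupLanguageDetector}. In the legacy form the
* CharSoup model file is loaded only to determine the supported-language set, so that
* a model trained on a subset of languages can be evaluated against the matching classes.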
* All comparison detectors are loaded via reflection so their dependencies
* are optional — any that are absent from the classpath are silently skipped.
*/
public class CompareDetectors {
/** Text is truncated to each of these lengths before evaluation. */
static final int[] EVAL_LENGTHS = {20, 50, 100, 150, 200, 500, Integer.MAX_VALUE};
/**
* Flores-200 language+script codes that represent a secondary or alternative script
* for a language that also appears under a different script in Flores.
* <p>
* These codes are <em>not</em> normalized to bare ISO 639-3 — they are kept as
* {@code xxx_Yyyy} so they appear as distinct evaluation classes in the report.
* A model trained only on the primary script will score 0 on these, which is the
* honest result (the model doesn't cover that script variant).
* <p>
* Rationale per entry:
* <ul>
* <li>{@code ace_Arab} — Acehnese in Jawi; MADLAD {@code ace} is Latin-script</li>
* <li>{@code arb_Latn} — Romanized Arabic; distinct from Arabic-script {@code arb}</li>
* <li>{@code bjn_Arab} — Banjar in Jawi; digital Banjar is primarily Latin-script</li>
* <li>{@code kas_Deva} — Kashmiri in Devanagari; primary written form is Nastaliq</li>
* <li>{@code knc_Latn} — Central Kanuri in Latin; traditional script is Arabic</li>
* <li>{@code min_Arab} — Minangkabau in Jawi; MADLAD {@code min} is Latin-script</li>
* <li>{@code taq_Tfng} — Tamasheq in Tifinagh; digital text predominantly Latin</li>
* </ul>
* {@code zho_Hans} and {@code zho_Hant} are both native Chinese character sets and
* both normalize to {@code zho}.
*/
static final Set<String> FLORES_KEEP_SCRIPT_SUFFIX = Set.of(
"ace_Arab",
"arb_Latn",
"bjn_Arab",
"kas_Deva",
"knc_Latn",
"min_Arab",
"taq_Tfng"
);
/** Warm-up iterations before timing to stabilise JIT. */
private static final int WARMUP_ITERS = 200;
/**
* Confusable language groups — languages within the same group are nearly
* indistinguishable by character bigrams alone. "Group accuracy" counts a
* prediction as correct if it falls within the same group as the truth.
*/
static final String[][] CONFUSABLE_GROUPS = ConfusableGroups.load();
/**
* Maps each language code to the set of codes in its confusable group
* (including itself). Languages not in any group have no entry in this map;
* {@link #isGroupMatch} then accepts only an exact match for them.
*/
private static final Map<String, Set<String>> CONFUSABLE_MAP = buildConfusableMap();
/**
 * Builds the language-to-group lookup from {@link #CONFUSABLE_GROUPS}.
 * Every member of a group is mapped to one shared set holding all members of
 * that group. Languages that appear in no group receive no entry.
 *
 * @return a mutable map from language code to its confusable-group set
 */
private static Map<String, Set<String>> buildConfusableMap() {
    Map<String, Set<String>> byLanguage = new HashMap<>();
    for (int g = 0; g < CONFUSABLE_GROUPS.length; g++) {
        String[] members = CONFUSABLE_GROUPS[g];
        // One set instance is shared by all members of the group.
        Set<String> shared = new HashSet<>(Arrays.asList(members));
        for (int m = 0; m < members.length; m++) {
            byLanguage.put(members[m], shared);
        }
    }
    return byLanguage;
}
/**
* Returns true if the predicted language is in the same confusable group
* as the true language, or if they are an exact match.
*/
static boolean isGroupMatch(String truth, String predicted) {
    // An exact hit counts whether or not the language belongs to a group.
    boolean exact = truth.equals(predicted);
    if (exact) {
        return true;
    }
    // Languages without a group have no entry, so only exact hits count for them.
    Set<String> peers = CONFUSABLE_MAP.get(truth);
    if (peers == null) {
        return false;
    }
    return peers.contains(predicted);
}
/**
 * Command-line entry point. Loads the test split, resolves the CharSoup
 * supported-language set, loads the comparison detectors, evaluates every
 * detector at each length in {@link #EVAL_LENGTHS}, prints the report and,
 * if a report path was given, writes it to that file.
 *
 * @param args {@code <testSplitFile> [--strategy NAME] [modelFile] [outputReport] [threads]}
 * @throws Exception if reading the inputs, evaluating, or writing the report fails
 */
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println(
"Usage: CompareDetectors <testSplitFile>"
+ " [--strategy STANDARD|SHORT_TEXT|AUTOMATIC]"
+ " [outputReport] [threads]");
System.err.println(" Legacy: CompareDetectors <testSplitFile> <charSoupModelFile>"
+ " [outputReport] [threads]");
System.exit(1);
}
Path testFile = Paths.get(args[0]);
// Parse --strategy flag (new mode) vs positional model file (legacy mode)
CharSoupLanguageDetector.Strategy strategy = CharSoupLanguageDetector.Strategy.STANDARD;
Path modelFile = null;
Path reportFile = null;
int numThreads = Runtime.getRuntime().availableProcessors();
List<String> remaining = new ArrayList<>(Arrays.asList(args).subList(1, args.length));
// Strip "--strategy <value>" pairs wherever they occur; a trailing "--strategy"
// with no value is not consumed here and stays in the positional list.
for (int i = 0; i < remaining.size(); i++) {
if ("--strategy".equals(remaining.get(i)) && i + 1 < remaining.size()) {
strategy = CharSoupLanguageDetector.Strategy.valueOf(
remaining.get(i + 1).toUpperCase(Locale.ROOT));
remaining.remove(i + 1);
remaining.remove(i);
i--;
}
}
// Remaining positional args: [modelFile] [reportFile] [threads]
// The first positional is taken as the model file only when it is not all digits,
// does not end in .log or .txt, and names an existing file.
if (!remaining.isEmpty() && !remaining.get(0).matches("\\d+")
&& !remaining.get(0).endsWith(".log") && !remaining.get(0).endsWith(".txt")
&& Paths.get(remaining.get(0)).toFile().exists()) {
modelFile = Paths.get(remaining.remove(0));
}
// The next non-numeric positional is the report path.
if (!remaining.isEmpty() && !remaining.get(0).matches("\\d+")) {
reportFile = Paths.get(remaining.remove(0));
}
// Whatever is left first is parsed as the thread count.
if (!remaining.isEmpty()) {
numThreads = Integer.parseInt(remaining.get(0));
}
System.out.println("CharSoup strategy: " + strategy);
System.out.println("Evaluation threads: " + numThreads);
// ---- Load test data ----
System.out.println("Loading test data: " + testFile);
List<LabeledSentence> allData = TrainLanguageModel.readPreprocessedFile(testFile);
// Normalize Flores-200 xxx_Yyyy codes (e.g. zho_Hans -> zho, ace_Latn -> ace).
// Codes listed in FLORES_KEEP_SCRIPT_SUFFIX are left untouched, so those
// secondary-script variants are evaluated as their own classes instead of being
// merged into the primary-script language.
// Flores mode is triggered by any label containing an underscore.
boolean floresMode = allData.stream()
.anyMatch(s -> s.getLanguage().contains("_"));
if (floresMode) {
System.out.println(" Flores-200 mode: normalizing xxx_Yyyy ��� xxx codes");
System.out.println(" (multi-script variants kept as xxx_Yyyy separate classes)");
List<LabeledSentence> normalized = new ArrayList<>(allData.size());
for (LabeledSentence s : allData) {
String raw = s.getLanguage();
// Keep secondary-script codes as-is (e.g. arb_Latn stays arb_Latn)
// so they appear as distinct classes in the per-language report.
String lang = FLORES_KEEP_SCRIPT_SUFFIX.contains(raw)
? raw : normalizeLang(raw);
normalized.add(new LabeledSentence(lang, s.getText()));
}
allData = normalized;
}
System.out.printf(Locale.US, "Test sentences: %,d%n", allData.size());
// ---- Resolve CharSoup supported-language set ----
// Evaluation always routes through CharSoupLanguageDetector (production pipeline).
// The model file, if supplied, is used only to override the supported-language set
// (legacy behaviour). When --strategy is used, we query the detector directly.
final Set<String> bigramLangs;
long bigramHeapBytes;
if (modelFile != null) {
System.out.println("\nLoading CharSoup model (for language set): " + modelFile);
// Heap delta around the load gives the approximate model footprint.
long heapBefore = usedHeap();
CharSoupModel bigramModel;
try (InputStream is = new BufferedInputStream(Files.newInputStream(modelFile))) {
bigramModel = CharSoupModel.load(is);
}
bigramHeapBytes = usedHeap() - heapBefore;
System.out.printf(Locale.US,
" CharSoup model: %d classes, %d buckets, ~%.1f MB heap%n",
bigramModel.getNumClasses(), bigramModel.getNumBuckets(),
bigramHeapBytes / (1024.0 * 1024.0));
bigramLangs = new HashSet<>(Arrays.asList(bigramModel.getLabels()));
} else {
bigramLangs = CharSoupLanguageDetector.getSupportedLanguages(strategy);
System.out.printf(Locale.US,
"%nCharSoup supported languages (%s strategy): %d%n",
strategy, bigramLangs.size());
bigramHeapBytes = 0L; // model already loaded in static initializer
}
System.out.println(" Evaluation routes through CharSoupLanguageDetector"
+ " (script gate + confusable group collapse).");
// ---- Load OpenNLP detectors (one per thread; stateful) ----
System.out.println("Loading OpenNLP detector(s)...");
long heapBefore = usedHeap();
List<LanguageDetector> opennlpPool = new ArrayList<>();
for (int i = 0; i < numThreads; i++) {
LanguageDetector d = loadDetector(
"org.apache.tika.langdetect.opennlp.OpenNLPDetector");
// A null detector means it could not be loaded; stop filling the pool.
if (d == null) {
break;
}
opennlpPool.add(d);
}
// The first pooled instance doubles as the single-threaded handle for warm-up
// and hasModel() queries; null means OpenNLP is unavailable.
LanguageDetector opennlpDetector = opennlpPool.isEmpty() ? null : opennlpPool.get(0);
long opennlpHeapBytes = opennlpDetector != null ? usedHeap() - heapBefore : 0;
if (opennlpDetector != null) {
System.out.printf(Locale.US, " OpenNLP: %d instance(s), ~%.1f MB heap%n",
opennlpPool.size(), opennlpHeapBytes / (1024.0 * 1024.0));
}
// ---- Load Lingua detector (thread-safe; single instance) ----
System.out.println("Loading Lingua detector (low accuracy mode)...");
heapBefore = usedHeap();
LinguaWrapper lingua = LinguaWrapper.load();
long linguaHeapBytes = lingua != null ? usedHeap() - heapBefore : 0;
Set<String> linguaLangs = lingua != null
? lingua.supportedLangs : Collections.emptySet();
// ---- Load Optimaize detectors (one per thread; stateful) ----
System.out.println("Loading Optimaize detector(s)...");
heapBefore = usedHeap();
List<LanguageDetector> optimaizePool = new ArrayList<>();
for (int i = 0; i < numThreads; i++) {
LanguageDetector d = loadDetector(
"org.apache.tika.langdetect.optimaize.OptimaizeLangDetector");
// A null detector means it could not be loaded; stop filling the pool.
if (d == null) {
break;
}
optimaizePool.add(d);
}
LanguageDetector optimaizeDetector = optimaizePool.isEmpty() ? null : optimaizePool.get(0);
long optimaizeHeapBytes = optimaizeDetector != null ? usedHeap() - heapBefore : 0;
if (optimaizeDetector != null) {
System.out.printf(Locale.US, " Optimaize: %d instance(s), ~%.1f MB heap%n",
optimaizePool.size(), optimaizeHeapBytes / (1024.0 * 1024.0));
}
// ---- Warm up JIT ----
// Run the first WARMUP_ITERS sentences (or fewer if the data set is smaller)
// through every available detector before any timed evaluation.
System.out.println("\nWarming up (" + WARMUP_ITERS + " iterations)...");
CharSoupDetectorConfig warmupCfg = CharSoupDetectorConfig.fromMap(
java.util.Map.of("strategy", strategy.name()));
CharSoupLanguageDetector warmupDetector = new CharSoupLanguageDetector(warmupCfg);
for (int i = 0; i < WARMUP_ITERS && i < allData.size(); i++) {
String text = allData.get(i).getText();
warmupDetector.reset();
warmupDetector.addText(text.toCharArray(), 0, text.length());
warmupDetector.detectAll();
if (opennlpDetector != null) {
opennlpDetector.reset();
opennlpDetector.addText(text);
opennlpDetector.detectAll();
}
if (lingua != null) {
lingua.detect(text);
}
if (optimaizeDetector != null) {
optimaizeDetector.reset();
optimaizeDetector.addText(text);
optimaizeDetector.detectAll();
}
}
// ---- Build per-detector supported-language sets ----
// Used to gate evaluation: sentences from unsupported languages are skipped entirely
// so detectors are not penalised for languages they do not claim to cover.
Set<String> allTestLangs = new HashSet<>();
for (LabeledSentence s : allData) {
allTestLangs.add(s.getLanguage());
}
// OpenNLP coverage is probed per test language via hasModel().
Set<String> opennlpAllLangs = new HashSet<>();
if (opennlpDetector != null) {
for (String lang : allTestLangs) {
if (opennlpDetector.hasModel(lang)) {
opennlpAllLangs.add(lang);
}
}
}
// Optimaize uses BCP-47 ISO 639-1 codes internally; translate to ISO 639-3
// so we can intersect with our ISO 639-3 test set.
Set<String> optimaizeAllLangs = optimaizeSupportedIso3(optimaizeDetector);
// Pairwise intersections with the CharSoup language set drive the head-to-head
// subsets evaluated below.
Set<String> opennlpSharedLangs = new HashSet<>(bigramLangs);
opennlpSharedLangs.retainAll(opennlpAllLangs);
Set<String> linguaSharedLangs = new HashSet<>(bigramLangs);
linguaSharedLangs.retainAll(linguaLangs);
Set<String> optimaizeSharedLangs = new HashSet<>(bigramLangs);
optimaizeSharedLangs.retainAll(optimaizeAllLangs);
List<LabeledSentence> opennlpSub = filterByLangs(allData, opennlpSharedLangs);
List<LabeledSentence> linguaSub = filterByLangs(allData, linguaSharedLangs);
List<LabeledSentence> optimaizeSub = filterByLangs(allData, optimaizeSharedLangs);
System.out.printf(Locale.US,
"\nCharSoup \u2229 OpenNLP: %d languages, %,d sentences%n",
opennlpSharedLangs.size(), opennlpSub.size());
System.out.printf(Locale.US,
"CharSoup \u2229 Lingua: %d languages, %,d sentences (Lingua covers %d)%n",
linguaSharedLangs.size(), linguaSub.size(), linguaLangs.size());
System.out.printf(Locale.US,
"CharSoup \u2229 Optimaize: %d languages, %,d sentences%n",
optimaizeSharedLangs.size(), optimaizeSub.size());
// ---- Evaluate at each length ----
List<LengthEval> allEvals = new ArrayList<>();
List<LengthEval> opennlpEvals = new ArrayList<>();
List<LengthEval> linguaEvals = new ArrayList<>();
List<LengthEval> optimaizeEvals = new ArrayList<>();
System.out.println();
for (int maxLen : EVAL_LENGTHS) {
// Integer.MAX_VALUE is the "no truncation" entry and is labelled "full".
String tag = maxLen == Integer.MAX_VALUE ? "full" : "@" + maxLen;
System.out.printf(Locale.US, "Evaluating %-6s ...", tag);
System.out.flush();
List<LabeledSentence> tAll = truncate(allData, maxLen);
List<LabeledSentence> tOnn = truncate(opennlpSub, maxLen);
List<LabeledSentence> tLin = truncate(linguaSub, maxLen);
List<LabeledSentence> tOpt = truncate(optimaizeSub, maxLen);
// Full test set: each detector is gated by its own supported-language set.
LengthEval ae = new LengthEval(maxLen);
ae.bigram = evaluateBigramParallel(
tAll, "bigram-" + tag, numThreads, bigramLangs, strategy);
ae.opennlp = evaluateOpenNLPParallel(opennlpPool, tAll, "opennlp-" + tag,
opennlpAllLangs);
ae.lingua = evaluateLingua(lingua, tAll, "lingua-" + tag, linguaLangs);
ae.optimaize = evaluateOptimaizeParallel(optimaizePool, tAll, "optimaize-" + tag,
optimaizeAllLangs);
allEvals.add(ae);
// Head-to-head on the CharSoup/OpenNLP shared-language subset (no gate needed).
LengthEval oe = new LengthEval(maxLen);
oe.bigram = evaluateBigramParallel(tOnn, "bigram-onn-" + tag, numThreads, null, strategy);
oe.opennlp = evaluateOpenNLPParallel(opennlpPool, tOnn, "opennlp-onn-" + tag);
opennlpEvals.add(oe);
// Head-to-head on the CharSoup/Lingua shared-language subset.
LengthEval le = new LengthEval(maxLen);
le.bigram = evaluateBigramParallel(tLin, "bigram-lin-" + tag, numThreads, null, strategy);
le.lingua = evaluateLingua(lingua, tLin, "lingua-lin-" + tag);
linguaEvals.add(le);
// Head-to-head on the CharSoup/Optimaize shared-language subset.
LengthEval pe = new LengthEval(maxLen);
pe.bigram = evaluateBigramParallel(tOpt, "bigram-opt-" + tag, numThreads, null, strategy);
pe.optimaize = evaluateOptimaizeParallel(optimaizePool, tOpt, "optimaize-opt-" + tag);
optimaizeEvals.add(pe);
System.out.printf(Locale.US,
" charsoup=%s opennlp=%s lingua=%s optimaize=%s%n",
fmtPct(ae.bigram), fmtPct(ae.opennlp), fmtPct(ae.lingua),
fmtPct(ae.optimaize));
}
// ---- Build report ----
String report = buildReport(
allEvals, opennlpEvals, linguaEvals, optimaizeEvals,
bigramHeapBytes, opennlpHeapBytes, linguaHeapBytes, optimaizeHeapBytes,
allData.size(),
opennlpSharedLangs.size(), opennlpSub.size(),
linguaSharedLangs.size(), linguaSub.size(),
optimaizeSharedLangs.size(), optimaizeSub.size());
System.out.println("\n" + report);
if (reportFile != null) {
// A bare file name has no parent; only create directories when there is one.
if (reportFile.getParent() != null) {
Files.createDirectories(reportFile.getParent());
}
try (BufferedWriter w = Files.newBufferedWriter(reportFile, StandardCharsets.UTF_8)) {
w.write(report);
}
System.out.println("Report written to: " + reportFile);
}
}
/**
* Macro-averaged F1: for each language compute F1 from its precision and recall,
* then average equally across all languages that appear in the true set.
* Languages with zero true samples are excluded from the average.
* Languages with zero predictions get precision=0, recall=0, F1=0.
* Languages predicted but absent from the truth set are not included —
* their penalty is already captured as a recall loss on the true language.
*/
static double computeMacroF1(EvalResult r) {
    boolean nothingScored = r == null || r.perLang == null || r.perLang.isEmpty();
    if (nothingScored) {
        return 0.0;
    }
    double f1Total = 0.0;
    int languages = 0;
    for (var langEntry : r.perLang.entrySet()) {
        // Slot 0 holds the exact-match count, slot 1 the number of true samples.
        int[] tally = langEntry.getValue();
        int truePositives = tally[0];
        int support = tally[1];
        if (support != 0) {
            int timesPredicted = 0;
            if (r.predictedCounts != null) {
                timesPredicted = r.predictedCounts.getOrDefault(langEntry.getKey(), 0);
            }
            double recall = (double) truePositives / support;
            double precision = 0.0;
            if (timesPredicted > 0) {
                precision = (double) truePositives / timesPredicted;
            }
            // Harmonic mean, defined as zero when both components are zero.
            double denominator = precision + recall;
            double f1 = 0.0;
            if (denominator > 0) {
                f1 = 2.0 * precision * recall / denominator;
            }
            f1Total += f1;
            languages++;
        }
    }
    if (languages > 0) {
        return f1Total / languages;
    }
    return 0.0;
}
/**
* Strict macro-F1 over all FLORES languages: covered languages use their real F1,
* unsupported languages contribute 0 to the sum and increment the denominator.
*/
static double computeStrictMacroF1(EvalResult r) {
    if (r == null || r.perLang == null) {
        return 0.0;
    }
    // Each unsupported language seen enters the denominator with an F1 of zero.
    int languages = r.unsupportedLangsSeen.size();
    double f1Total = 0.0;
    for (var langEntry : r.perLang.entrySet()) {
        // Slot 0 holds the exact-match count, slot 1 the number of true samples.
        int[] tally = langEntry.getValue();
        int truePositives = tally[0];
        int support = tally[1];
        if (support != 0) {
            int timesPredicted = 0;
            if (r.predictedCounts != null) {
                timesPredicted = r.predictedCounts.getOrDefault(langEntry.getKey(), 0);
            }
            double recall = (double) truePositives / support;
            double precision = 0.0;
            if (timesPredicted > 0) {
                precision = (double) truePositives / timesPredicted;
            }
            double denominator = precision + recall;
            double f1 = 0.0;
            if (denominator > 0) {
                f1 = 2.0 * precision * recall / denominator;
            }
            f1Total += f1;
            languages++;
        }
    }
    if (languages > 0) {
        return f1Total / languages;
    }
    return 0.0;
}
/** Strict accuracy over all FLORES languages: unsupported sentences count as misses. */
static double computeStrictAccuracy(EvalResult r) {
    if (r == null) {
        return 0.0;
    }
    // Denominator covers scored sentences plus those skipped as unsupported.
    int denominator = r.total + r.unsupportedTotal;
    if (denominator > 0) {
        return (double) r.correct / denominator;
    }
    return 0.0;
}
/**
 * Renders the macro-F1 of a result as a percentage with two decimals.
 *
 * @param r the result to format; may be {@code null}
 * @return the formatted percentage, or the N/A placeholder when {@code r} is
 *         {@code null} or has no scored sentences
 */
private static String fmtF1(EvalResult r) {
    boolean hasScores = r != null && r.total != 0;
    if (!hasScores) {
        return " N/A ";
    }
    double macroF1Percent = 100.0 * computeMacroF1(r);
    return String.format(Locale.US, "%6.2f%%", macroF1Percent);
}
/**
 * Renders the exact-match accuracy of a result as a percentage with two decimals.
 *
 * @param r the result to format; may be {@code null}
 * @return the formatted percentage, or the N/A placeholder when {@code r} is
 *         {@code null} or has no scored sentences
 */
private static String fmtPct(EvalResult r) {
    boolean hasScores = r != null && r.total != 0;
    if (!hasScores) {
        return " N/A ";
    }
    double accuracyPercent = 100.0 * r.correct / r.total;
    return String.format(Locale.US, "%6.2f%%", accuracyPercent);
}
// ---- Parallel bigram evaluation ----
// Evaluation routes through CharSoupLanguageDetector (the production pipeline),
// which applies script gating and confusable group collapse. Each thread gets
// its own detector instance since CharSoupLanguageDetector is stateful.
/**
* Evaluate CharSoup via the full production pipeline in parallel.
* Each thread gets its own {@link CharSoupLanguageDetector} instance configured
* with the given strategy.
*
* @param supportedLangs if non-null, sentences whose true label is NOT in this set are
* skipped entirely — not counted in accuracy, F1, or confusion.
*/
static EvalResult evaluateBigramParallel(List<LabeledSentence> data, String name,
        int numThreads, Set<String> supportedLangs,
        CharSoupLanguageDetector.Strategy strategy) throws Exception {
    // The config is built before any early exit, exactly once per call.
    CharSoupDetectorConfig cfg =
            CharSoupDetectorConfig.fromMap(Map.of("strategy", strategy.name()));
    if (data.isEmpty()) {
        return new EvalResult(name);
    }
    if (numThreads <= 1) {
        CharSoupLanguageDetector only = new CharSoupLanguageDetector(cfg);
        return evaluateBigramChunk(only, data, name, supportedLangs);
    }

    List<List<LabeledSentence>> chunks = partition(data, numThreads);
    ExecutorService pool = Executors.newFixedThreadPool(chunks.size());
    try {
        List<Future<EvalResult>> pending = new ArrayList<>();
        long wallStart = System.nanoTime();
        int index = 0;
        for (List<LabeledSentence> chunk : chunks) {
            String chunkName = name + "-t" + index;
            index++;
            // Each task owns a private detector instance.
            CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
            pending.add(pool.submit(
                    () -> evaluateBigramChunk(detector, chunk, chunkName, supportedLangs)));
        }

        EvalResult merged = new EvalResult(name);
        merged.perLang = new TreeMap<>();
        merged.predictedCounts = new TreeMap<>();
        merged.confusions = new TreeMap<>();
        // Fold the partial results together in submission order.
        for (Future<EvalResult> task : pending) {
            EvalResult partial = task.get();
            merged.correct += partial.correct;
            merged.correctGroup += partial.correctGroup;
            merged.total += partial.total;
            merged.unsupportedTotal += partial.unsupportedTotal;
            merged.unsupportedLangsSeen.addAll(partial.unsupportedLangsSeen);
            mergePerLang(merged.perLang, partial.perLang);
            mergePredicted(merged.predictedCounts, partial.predictedCounts);
            mergeConfusions(merged.confusions, partial.confusions);
        }
        // Wall-clock time across all workers, not the sum of per-chunk times.
        merged.elapsedMs = (System.nanoTime() - wallStart) / 1_000_000;
        return merged;
    } finally {
        pool.shutdown();
    }
}
/**
* Single-threaded evaluation via the full CharSoupLanguageDetector production pipeline.
* The detector applies script gating and confusable group collapse — identical to
* what end users receive.
*
* @param supportedLangs if non-null, sentences whose true label is NOT in this set are
* skipped entirely — they do not contribute to accuracy, macro-F1,
* per-language counts, or confusion tracking.
*/
static EvalResult evaluateBigramChunk(CharSoupLanguageDetector detector,
        List<LabeledSentence> data, String name,
        Set<String> supportedLangs) {
    EvalResult result = new EvalResult(name);
    if (data.isEmpty()) {
        return result;
    }

    Map<String, int[]> perLang = new TreeMap<>();
    Map<String, Integer> predictedCounts = new TreeMap<>();
    Map<String, Map<String, Integer>> confusions = new TreeMap<>();
    int exactHits = 0;
    int groupHits = 0;
    int scored = 0;
    long startNs = System.nanoTime();

    for (LabeledSentence sentence : data) {
        detector.reset();
        String text = sentence.getText();
        detector.addText(text.toCharArray(), 0, text.length());
        List<LanguageResult> ranked = detector.detectAll();

        // No result, or a top result with NONE confidence, is recorded as "unk".
        String predLabel;
        if (ranked.isEmpty() || ranked.get(0).getConfidence() == LanguageConfidence.NONE) {
            predLabel = "unk";
        } else {
            predLabel = ranked.get(0).getLanguage();
        }

        String truth = sentence.getLanguage();
        boolean gatedOut = supportedLangs != null && !supportedLangs.contains(truth);
        if (gatedOut) {
            // Tracked separately so strict metrics can count these as misses.
            result.unsupportedTotal++;
            result.unsupportedLangsSeen.add(truth);
            continue;
        }

        scored++;
        predictedCounts.merge(predLabel, 1, Integer::sum);
        // Per-language slots: [0] exact hits, [1] true samples, [2] group hits.
        int[] counts = perLang.computeIfAbsent(truth, k -> new int[3]);
        counts[1]++;

        boolean exact = predLabel.equals(truth);
        if (exact) {
            counts[0]++;
            exactHits++;
        } else {
            confusions.computeIfAbsent(truth, k -> new HashMap<>())
                    .merge(predLabel, 1, Integer::sum);
        }
        // An exact hit is always a group hit; otherwise consult the confusable groups.
        if (exact || isGroupMatch(truth, predLabel)) {
            counts[2]++;
            groupHits++;
        }
    }

    result.correct = exactHits;
    result.correctGroup = groupHits;
    result.total = supportedLangs != null ? scored : data.size();
    result.elapsedMs = (System.nanoTime() - startNs) / 1_000_000;
    result.perLang = perLang;
    result.predictedCounts = predictedCounts;
    result.confusions = confusions;
    return result;
}
// ---- Parallel OpenNLP evaluation ----
/**
* Evaluate OpenNLP detector in parallel using a pre-created pool of
* detector instances (one per thread, since LanguageDetector is stateful).
*/
static EvalResult evaluateOpenNLPParallel(List<LanguageDetector> detectors,
        List<LabeledSentence> data,
        String name) throws Exception {
    // A null language set means no sentence is skipped as unsupported.
    final Set<String> noLanguageGate = null;
    return evaluateOpenNLPParallel(detectors, data, name, noLanguageGate);
}
/**
 * Evaluates OpenNLP over {@code data}, splitting the work into one chunk per
 * pooled detector and merging the partial results.
 *
 * @param detectors pre-created detector instances; {@code null} or empty yields an empty result
 * @param data labelled sentences to score
 * @param name label stored on the returned result
 * @param supportedLangs if non-null, sentences whose true label is not in this set are
 *        counted as unsupported instead of being scored
 * @return the merged result, with elapsed time measured as wall-clock time
 * @throws Exception if a worker task fails
 */
static EvalResult evaluateOpenNLPParallel(List<LanguageDetector> detectors,
        List<LabeledSentence> data,
        String name,
        Set<String> supportedLangs) throws Exception {
    boolean nothingToDo = detectors == null || detectors.isEmpty() || data.isEmpty();
    if (nothingToDo) {
        return new EvalResult(name);
    }
    int numThreads = detectors.size();
    if (numThreads <= 1) {
        return evaluateOpenNLPChunk(detectors.get(0), data, name, supportedLangs);
    }

    List<List<LabeledSentence>> chunks = partition(data, numThreads);
    ExecutorService pool = Executors.newFixedThreadPool(chunks.size());
    try {
        List<Future<EvalResult>> pending = new ArrayList<>();
        long wallStart = System.nanoTime();
        for (int t = 0; t < chunks.size(); t++) {
            List<LabeledSentence> slice = chunks.get(t);
            String sliceName = name + "-t" + t;
            // Clamp the index so a chunk never looks past the end of the pool.
            int detectorIndex = Math.min(t, detectors.size() - 1);
            LanguageDetector own = detectors.get(detectorIndex);
            pending.add(pool.submit(
                    () -> evaluateOpenNLPChunk(own, slice, sliceName, supportedLangs)));
        }

        EvalResult merged = new EvalResult(name);
        merged.perLang = new TreeMap<>();
        merged.predictedCounts = new TreeMap<>();
        for (Future<EvalResult> task : pending) {
            EvalResult partial = task.get();
            merged.correct += partial.correct;
            merged.correctGroup += partial.correctGroup;
            merged.total += partial.total;
            merged.unsupportedTotal += partial.unsupportedTotal;
            merged.unsupportedLangsSeen.addAll(partial.unsupportedLangsSeen);
            mergePerLang(merged.perLang, partial.perLang);
            mergePredicted(merged.predictedCounts, partial.predictedCounts);
        }
        merged.elapsedMs = (System.nanoTime() - wallStart) / 1_000_000;
        return merged;
    } finally {
        pool.shutdown();
    }
}
/** Single-threaded OpenNLP evaluation on a chunk of data. */
static EvalResult evaluateOpenNLPChunk(LanguageDetector detector,
        List<LabeledSentence> data, String name) {
    // A null language set means no sentence is skipped as unsupported.
    final Set<String> noLanguageGate = null;
    return evaluateOpenNLPChunk(detector, data, name, noLanguageGate);
}
/**
 * Scores one chunk of sentences with a single OpenNLP detector.
 *
 * @param detector the detector to use; {@code null} yields an empty result
 * @param data labelled sentences to score
 * @param name label stored on the returned result
 * @param supportedLangs if non-null, sentences whose true label is not in this set are
 *        counted as unsupported instead of being scored
 * @return exact and group accuracy counts, per-language tallies, prediction counts
 *         and elapsed time for this chunk
 */
static EvalResult evaluateOpenNLPChunk(LanguageDetector detector,
        List<LabeledSentence> data, String name,
        Set<String> supportedLangs) {
    EvalResult result = new EvalResult(name);
    if (detector == null || data.isEmpty()) {
        return result;
    }

    Map<String, int[]> perLang = new TreeMap<>();
    Map<String, Integer> predictedCounts = new TreeMap<>();
    int exactHits = 0;
    int groupHits = 0;
    int scored = 0;
    long startNs = System.nanoTime();

    for (LabeledSentence sentence : data) {
        detector.reset();
        detector.addText(sentence.getText());
        List<LanguageResult> ranked = detector.detectAll();
        String predicted;
        if (ranked.isEmpty()) {
            predicted = "unk";
        } else {
            predicted = ranked.get(0).getLanguage();
        }

        String truth = sentence.getLanguage();
        boolean gatedOut = supportedLangs != null && !supportedLangs.contains(truth);
        if (gatedOut) {
            result.unsupportedTotal++;
            result.unsupportedLangsSeen.add(truth);
            continue;
        }

        scored++;
        predictedCounts.merge(predicted, 1, Integer::sum);
        // Per-language slots: [0] exact hits, [1] true samples, [2] group hits.
        int[] counts = perLang.computeIfAbsent(truth, k -> new int[3]);
        counts[1]++;

        boolean exact = predicted.equals(truth);
        if (exact) {
            counts[0]++;
            exactHits++;
        }
        // An exact hit is always a group hit; otherwise consult the confusable groups.
        if (exact || isGroupMatch(truth, predicted)) {
            counts[2]++;
            groupHits++;
        }
    }
    long elapsedNs = System.nanoTime() - startNs;

    result.correct = exactHits;
    result.correctGroup = groupHits;
    result.total = supportedLangs != null ? scored : data.size();
    result.elapsedMs = elapsedNs / 1_000_000;
    result.perLang = perLang;
    result.predictedCounts = predictedCounts;
    return result;
}
// ---- Optimaize evaluation ----
// Optimaize returns BCP-47 ISO 639-1 codes ("en", "fr", "zh-Hans").
// We translate to ISO 639-3 ("eng", "fra", "zho") via Locale before comparing
// against our ISO 639-3 test data.
/**
* Translate a BCP-47 language tag returned by Optimaize to an ISO 639-3 code.
* Uses Java's Locale to do the mapping; falls back to the input if the
* ISO 639-3 code is unavailable or empty.
*/
static String optimaizePredToIso3(String bcp47) {
    // Null, empty and the "unk" sentinel all mean "no prediction".
    if (bcp47 == null || bcp47.isEmpty() || "unk".equals(bcp47)) {
        return "unk";
    }
    // Locale.forLanguageTag only understands hyphen-separated subtags. A legacy
    // underscore form such as "zh_CN" would otherwise parse to the empty locale and
    // be returned untranslated, so the separator is normalised before parsing.
    java.util.Locale locale = java.util.Locale.forLanguageTag(bcp47.replace('_', '-'));
    try {
        String iso3 = locale.getISO3Language();
        // An empty code means the tag carried no recognisable language subtag.
        return iso3.isEmpty() ? bcp47 : iso3;
    } catch (java.util.MissingResourceException e) {
        // The language has no three-letter code; fall back to the caller's input.
        return bcp47;
    }
}
/**
* Build the set of ISO 639-3 codes that Optimaize supports,
* by translating each BCP-47 label returned by getSupportedLanguages().
*/
@SuppressWarnings("unchecked")
static Set<String> optimaizeSupportedIso3(LanguageDetector optimaize) {
    Set<String> translated = new HashSet<>();
    if (optimaize != null) {
        try {
            // The supported BCP-47 tags are read reflectively from DEFAULT_LANGUAGES.
            java.lang.reflect.Field tagsField =
                    optimaize.getClass().getDeclaredField("DEFAULT_LANGUAGES");
            tagsField.setAccessible(true);
            // Passing null as the receiver reads the field as a static.
            Object rawTags = tagsField.get(null);
            for (String tag : (Set<String>) rawTags) {
                translated.add(optimaizePredToIso3(tag));
            }
        } catch (Exception e) {
            // Best effort: on failure, warn and return whatever was collected.
            System.err.println("WARN: could not get Optimaize supported languages: " + e);
        }
    }
    return translated;
}
/**
 * Evaluates Optimaize over {@code data} without a supported-language gate.
 *
 * @param detectors pre-created detector instances
 * @param data labelled sentences to score
 * @param name label stored on the returned result
 * @return the evaluation result
 * @throws Exception if a worker task fails
 */
static EvalResult evaluateOptimaizeParallel(List<LanguageDetector> detectors,
        List<LabeledSentence> data,
        String name) throws Exception {
    // A null language set means no sentence is skipped as unsupported.
    final Set<String> noLanguageGate = null;
    return evaluateOptimaizeParallel(detectors, data, name, noLanguageGate);
}
/**
 * Evaluates Optimaize over {@code data}, splitting the work into one chunk per
 * pooled detector and merging the partial results.
 *
 * @param detectors pre-created detector instances; {@code null} or empty yields an empty result
 * @param data labelled sentences to score
 * @param name label stored on the returned result and on every partial result
 * @param supportedLangs if non-null, sentences whose true label is not in this set are
 *        counted as unsupported instead of being scored
 * @return the merged result, with elapsed time measured as wall-clock time
 * @throws Exception if a worker task fails
 */
static EvalResult evaluateOptimaizeParallel(List<LanguageDetector> detectors,
        List<LabeledSentence> data,
        String name,
        Set<String> supportedLangs) throws Exception {
    boolean nothingToDo = detectors == null || detectors.isEmpty() || data.isEmpty();
    if (nothingToDo) {
        return new EvalResult(name);
    }
    int numThreads = detectors.size();
    if (numThreads <= 1) {
        return evaluateOptimaizeChunk(detectors.get(0), data, name, supportedLangs);
    }

    List<List<LabeledSentence>> chunks = partition(data, numThreads);
    ExecutorService pool = Executors.newFixedThreadPool(chunks.size());
    try {
        List<Future<EvalResult>> pending = new ArrayList<>();
        long wallStart = System.nanoTime();
        for (int t = 0; t < chunks.size(); t++) {
            List<LabeledSentence> slice = chunks.get(t);
            // Detectors are assigned round-robin by chunk index.
            LanguageDetector own = detectors.get(t % detectors.size());
            pending.add(pool.submit(
                    () -> evaluateOptimaizeChunk(own, slice, name, supportedLangs)));
        }

        EvalResult merged = new EvalResult(name);
        merged.perLang = new TreeMap<>();
        merged.predictedCounts = new TreeMap<>();
        for (Future<EvalResult> task : pending) {
            EvalResult partial = task.get();
            merged.correct += partial.correct;
            merged.correctGroup += partial.correctGroup;
            merged.total += partial.total;
            merged.unsupportedTotal += partial.unsupportedTotal;
            merged.unsupportedLangsSeen.addAll(partial.unsupportedLangsSeen);
            mergePerLang(merged.perLang, partial.perLang);
            mergePredicted(merged.predictedCounts, partial.predictedCounts);
        }
        merged.elapsedMs = (System.nanoTime() - wallStart) / 1_000_000;
        return merged;
    } finally {
        pool.shutdown();
    }
}
/**
 * Scores one chunk of sentences with a single Optimaize detector. Each raw
 * prediction is passed through {@link #optimaizePredToIso3(String)} before it
 * is compared with the true label.
 *
 * @param detector the detector to use; {@code null} yields an empty result
 * @param data labelled sentences to score
 * @param name label stored on the returned result
 * @param supportedLangs if non-null, sentences whose true label is not in this set are
 *        counted as unsupported instead of being scored
 * @return exact and group accuracy counts, per-language tallies, prediction counts
 *         and elapsed time for this chunk
 */
static EvalResult evaluateOptimaizeChunk(LanguageDetector detector,
        List<LabeledSentence> data,
        String name,
        Set<String> supportedLangs) {
    EvalResult result = new EvalResult(name);
    if (detector == null || data.isEmpty()) {
        return result;
    }

    Map<String, int[]> perLang = new TreeMap<>();
    Map<String, Integer> predictedCounts = new TreeMap<>();
    int exactHits = 0;
    int groupHits = 0;
    int scored = 0;
    long startNs = System.nanoTime();

    for (LabeledSentence sentence : data) {
        detector.reset();
        detector.addText(sentence.getText());
        List<LanguageResult> ranked = detector.detectAll();
        String rawPred;
        if (ranked.isEmpty()) {
            rawPred = "unk";
        } else {
            rawPred = ranked.get(0).getLanguage();
        }
        String predicted = optimaizePredToIso3(rawPred);

        String truth = sentence.getLanguage();
        boolean gatedOut = supportedLangs != null && !supportedLangs.contains(truth);
        if (gatedOut) {
            result.unsupportedTotal++;
            result.unsupportedLangsSeen.add(truth);
            continue;
        }

        scored++;
        predictedCounts.merge(predicted, 1, Integer::sum);
        // Per-language slots: [0] exact hits, [1] true samples, [2] group hits.
        int[] counts = perLang.computeIfAbsent(truth, k -> new int[3]);
        counts[1]++;

        boolean exact = predicted.equals(truth);
        if (exact) {
            counts[0]++;
            exactHits++;
        }
        // An exact hit is always a group hit; otherwise consult the confusable groups.
        if (exact || isGroupMatch(truth, predicted)) {
            counts[2]++;
            groupHits++;
        }
    }
    long elapsedNs = System.nanoTime() - startNs;

    result.correct = exactHits;
    result.correctGroup = groupHits;
    result.total = supportedLangs != null ? scored : data.size();
    result.elapsedMs = elapsedNs / 1_000_000;
    result.perLang = perLang;
    result.predictedCounts = predictedCounts;
    return result;
}
// ---- Lingua evaluation ----
/**
 * Evaluates Lingua on every sentence in {@code data}, with no coverage filter.
 * <p>
 * Runs single-threaded. Per the original author's note, Lingua's model is read-only
 * and thread-safe, but reflection call overhead means parallelism gains are modest
 * for shorter runs.
 *
 * @param lingua wrapper to evaluate
 * @param data labeled sentences to score
 * @param name label stored in the returned result
 * @return the result of the four-argument overload called with a {@code null} language set
 */
static EvalResult evaluateLingua(LinguaWrapper lingua,
        List<LabeledSentence> data, String name) {
    final Set<String> noCoverageFilter = null;
    return evaluateLingua(lingua, data, name, noCoverageFilter);
}
/**
 * Evaluates Lingua on {@code data}, optionally restricted to a set of covered languages.
 * <p>
 * Detection runs on every sentence. Sentences whose true language is not in
 * {@code supportedLangs} are counted in {@code unsupportedTotal} /
 * {@code unsupportedLangsSeen} and excluded from the accuracy counts.
 * Per-language rows are laid out as {@code [strict correct, total true, group correct]}.
 *
 * @param lingua wrapper to evaluate; {@code null} yields an empty result
 * @param data labeled sentences to score; an empty list yields an empty result
 * @param name label stored in the returned result
 * @param supportedLangs languages to score, or {@code null} to score every sentence
 * @return counts plus elapsed time in milliseconds
 */
static EvalResult evaluateLingua(LinguaWrapper lingua,
        List<LabeledSentence> data, String name,
        Set<String> supportedLangs) {
    EvalResult outcome = new EvalResult(name);
    if (lingua == null || data.isEmpty()) {
        return outcome;
    }
    final boolean scoreEverything = supportedLangs == null;
    Map<String, int[]> langRows = new TreeMap<>();
    Map<String, Integer> predictionTally = new TreeMap<>();
    int exactHits = 0;
    int groupHits = 0;
    int scored = 0;
    final long t0 = System.nanoTime();
    for (LabeledSentence sentence : data) {
        String predicted = lingua.detect(sentence.getText());
        String truth = sentence.getLanguage();
        if (!scoreEverything && !supportedLangs.contains(truth)) {
            // Outside Lingua's coverage: remember it, but do not score it.
            outcome.unsupportedTotal++;
            outcome.unsupportedLangsSeen.add(truth);
            continue;
        }
        scored++;
        predictionTally.merge(predicted, 1, Integer::sum);
        int[] row = langRows.computeIfAbsent(truth, unused -> new int[3]);
        row[1]++;
        boolean exact = predicted.equals(truth);
        if (exact) {
            row[0]++;
            exactHits++;
        }
        // An exact hit is also a group hit; otherwise fall back to the group check.
        if (exact || isGroupMatch(truth, predicted)) {
            row[2]++;
            groupHits++;
        }
    }
    outcome.correct = exactHits;
    outcome.correctGroup = groupHits;
    outcome.total = scoreEverything ? data.size() : scored;
    outcome.elapsedMs = (System.nanoTime() - t0) / 1_000_000;
    outcome.perLang = langRows;
    outcome.predictedCounts = predictionTally;
    return outcome;
}
// ---- Partitioning and merging helpers ----
/**
 * Splits a list into at most {@code n} contiguous sublists of approximately equal size.
 * <p>
 * Each chunk holds {@code ceil(size / n)} elements except possibly the last, which may be
 * shorter. The returned chunks are {@link List#subList} views of {@code list}, not copies.
 * An empty input produces an empty result.
 *
 * @param list the list to split
 * @param n the maximum number of chunks; must be positive
 * @param <T> element type
 * @return the chunks in order
 * @throws IllegalArgumentException if {@code n} is zero or negative
 */
static <T> List<List<T>> partition(List<T> list, int n) {
    if (n <= 0) {
        // A non-positive n previously caused a division by zero or a zero/negative
        // chunk size (endless loop); reject it up front instead.
        throw new IllegalArgumentException("n must be positive, got " + n);
    }
    List<List<T>> parts = new ArrayList<>();
    int size = list.size();
    if (size == 0) {
        return parts;
    }
    // Ceiling division written so that it cannot overflow for large size or n.
    int chunkSize = size / n + (size % n == 0 ? 0 : 1);
    for (int start = 0; start < size; ) {
        int end = (size - start > chunkSize) ? start + chunkSize : size;
        parts.add(list.subList(start, end));
        start = end;
    }
    return parts;
}
/**
 * Adds the per-language counts of a partial result into an accumulator map.
 * The first three slots of each source row are summed element-wise into the matching
 * target row, which is created as a zeroed three-element array when missing.
 * A {@code null} source is ignored.
 */
private static void mergePerLang(Map<String, int[]> target,
        Map<String, int[]> source) {
    if (source != null) {
        source.forEach((lang, partial) -> {
            int[] totals = target.computeIfAbsent(lang, unused -> new int[3]);
            for (int slot = 0; slot < 3; slot++) {
                totals[slot] += partial[slot];
            }
        });
    }
}
/**
 * Adds the predicted-label counts of a partial result into an accumulator map,
 * summing counts for labels present in both. A {@code null} source is ignored.
 */
private static void mergePredicted(Map<String, Integer> target,
        Map<String, Integer> source) {
    if (source == null) {
        return;
    }
    for (Map.Entry<String, Integer> entry : source.entrySet()) {
        target.merge(entry.getKey(), entry.getValue(), Integer::sum);
    }
}
/**
 * Adds a partial confusion matrix (true label to predicted label to count) into an
 * accumulator. Missing inner maps are created as {@code HashMap}s and counts for the
 * same (true, predicted) pair are summed. A {@code null} source is ignored.
 */
private static void mergeConfusions(Map<String, Map<String, Integer>> target,
        Map<String, Map<String, Integer>> source) {
    if (source == null) {
        return;
    }
    source.forEach((truth, missCounts) -> {
        Map<String, Integer> bucket = target.computeIfAbsent(truth, unused -> new HashMap<>());
        for (Map.Entry<String, Integer> miss : missCounts.entrySet()) {
            bucket.merge(miss.getKey(), miss.getValue(), Integer::sum);
        }
    });
}
// ---- Report ----
/**
 * Builds the complete plain-text comparison report.
 * <p>
 * Sections are emitted in this order: summary counts, approximate model heap sizes,
 * the coverage-adjusted table, the breadth-weighted (strict) table, one two-detector
 * table each for OpenNLP, Lingua and Optimaize, CharSoup timing per length,
 * per-language CharSoup F1 per length, a per-language F1 table for all four detectors
 * taken from the last snapshot, and CharSoup confusion tables for the first and last
 * snapshots.
 * <p>
 * NOTE(review): the last and first elements of {@code allEvals} are read unconditionally,
 * so {@code allEvals} is assumed to be non-empty; the timing section also dereferences
 * {@code bigram} of every entry without a null check - confirm callers guarantee both.
 *
 * @param allEvals one snapshot per length threshold, covering all detectors
 * @param opennlpEvals snapshots used for the CharSoup/OpenNLP shared-language table
 * @param linguaEvals snapshots used for the CharSoup/Lingua shared-language table
 * @param optimaizeEvals snapshots used for the CharSoup/Optimaize shared-language table
 * @param bigramHeap heap figure in bytes, reported in MB under the "CharSoup" label
 * @param opennlpHeap heap figure in bytes, reported in MB under the "OpenNLP" label
 * @param linguaHeap heap figure in bytes, reported in MB under the "Lingua" label
 * @param optimaizeHeap heap figure in bytes, reported in MB under the "Optimaize" label
 * @param totalSentences number printed as "Test sentences"
 * @param opennlpSharedLangCount language count printed for the OpenNLP intersection
 * @param opennlpSharedCount sentence count printed for the OpenNLP intersection
 * @param linguaSharedLangCount language count printed for the Lingua intersection
 * @param linguaSharedCount sentence count printed for the Lingua intersection
 * @param optimaizeSharedLangCount language count printed for the Optimaize intersection
 * @param optimaizeSharedCount sentence count printed for the Optimaize intersection
 * @return the formatted report
 */
static String buildReport(
List<LengthEval> allEvals,
List<LengthEval> opennlpEvals,
List<LengthEval> linguaEvals,
List<LengthEval> optimaizeEvals,
long bigramHeap, long opennlpHeap, long linguaHeap, long optimaizeHeap,
int totalSentences,
int opennlpSharedLangCount, int opennlpSharedCount,
int linguaSharedLangCount, int linguaSharedCount,
int optimaizeSharedLangCount, int optimaizeSharedCount) {
StringBuilder sb = new StringBuilder();
sb.append("=== Language Detection Comparison Report ===\n\n");
sb.append(String.format(Locale.US, "Test sentences: %,d%n", totalSentences));
sb.append(String.format(Locale.US,
"CharSoup \u2229 OpenNLP: %d languages, %,d sentences%n",
opennlpSharedLangCount, opennlpSharedCount));
sb.append(String.format(Locale.US,
"CharSoup \u2229 Lingua: %d languages, %,d sentences%n",
linguaSharedLangCount, linguaSharedCount));
sb.append(String.format(Locale.US,
"CharSoup \u2229 Optimaize: %d languages, %,d sentences%n%n",
optimaizeSharedLangCount, optimaizeSharedCount));
// Model sizes (byte counts converted to MB)
sb.append("Model heap (approx):\n");
sb.append(String.format(Locale.US, " CharSoup: ~%.1f MB%n",
bigramHeap / (1024.0 * 1024.0)));
sb.append(String.format(Locale.US, " OpenNLP: ~%.1f MB%n",
opennlpHeap / (1024.0 * 1024.0)));
sb.append(String.format(Locale.US, " Lingua: ~%.1f MB (low accuracy mode)%n",
linguaHeap / (1024.0 * 1024.0)));
sb.append(String.format(Locale.US, " Optimaize: ~%.1f MB%n%n",
optimaizeHeap / (1024.0 * 1024.0)));
// ---- Coverage-adjusted table ----
sb.append("Coverage-adjusted accuracy \u2014 each detector scored on its own supported languages only\n");
sb.append(" (test sentences whose true language is not in a detector's covered set are skipped)\n");
sb.append(lengthTable(allEvals, true));
sb.append("\n");
// ---- Breadth-weighted (strict) table ----
// NOTE(review): the language count "203" is hard-coded in the heading below.
sb.append("Breadth-weighted accuracy \u2014 all 203 FLORES languages, unsupported languages score 0\n");
sb.append(" (penalises limited coverage; use this to compare total useful output across all inputs)\n");
sb.append(strictLengthTable(allEvals));
sb.append("\n");
// ---- CharSoup ∩ OpenNLP ----
sb.append(String.format(Locale.US,
"Strict accuracy \u2014 CharSoup \u2229 OpenNLP (%d languages, %,d sentences)%n",
opennlpSharedLangCount, opennlpSharedCount));
sb.append(lengthTableTwoDetectors(opennlpEvals, "CharSoup", "OpenNLP", "opennlp"));
sb.append("\n");
// ---- CharSoup ∩ Lingua ----
sb.append(String.format(Locale.US,
"Strict accuracy \u2014 CharSoup \u2229 Lingua (%d languages, %,d sentences)%n",
linguaSharedLangCount, linguaSharedCount));
sb.append(lengthTableTwoDetectors(linguaEvals, "CharSoup", "Lingua", "lingua"));
sb.append("\n");
// ---- CharSoup ∩ Optimaize ----
sb.append(String.format(Locale.US,
"Strict accuracy \u2014 CharSoup \u2229 Optimaize (%d languages, %,d sentences)%n",
optimaizeSharedLangCount, optimaizeSharedCount));
sb.append(lengthTableTwoDetectors(optimaizeEvals, "CharSoup", "Optimaize", "optimaize"));
sb.append("\n");
// ---- CharSoup timing at each length ----
sb.append("CharSoup timing (wall-clock, full pipeline including script gate + group collapse):\n");
sb.append(String.format(Locale.US, "%-6s %10s %10s%n",
"Length", "Wall(ms)", "Sent/sec"));
sb.append("-".repeat(32)).append("\n");
for (LengthEval le : allEvals) {
EvalResult b = le.bigram;
// Sentences per second; reported as 0 when there is no data or no measurable time
double sps = b.total > 0 && b.elapsedMs > 0
? b.total / (b.elapsedMs / 1000.0) : 0;
sb.append(String.format(Locale.US, "%-6s %,10d %,10.0f%n",
le.tag(), b.elapsedMs, sps));
}
sb.append("\n");
// ---- Per-language CharSoup F1 across all lengths ----
sb.append("Per-language CharSoup F1 by length:\n");
// Build header from eval tags
sb.append(String.format(Locale.US, "%-14s", "Language"));
for (LengthEval le : allEvals) {
sb.append(String.format(Locale.US, " %6s", le.tag()));
}
sb.append("\n");
sb.append("-".repeat(14 + allEvals.size() * 8)).append("\n");
// Collect all language codes across all eval snapshots
Set<String> allLangs = new TreeSet<>();
for (LengthEval le : allEvals) {
if (le.bigram != null && le.bigram.perLang != null) {
allLangs.addAll(le.bigram.perLang.keySet());
}
}
for (String lang : allLangs) {
// Only include languages that appear in the last snapshot (expected to be the
// full-length test set)
LengthEval fullSnap = allEvals.get(allEvals.size() - 1);
int[] fullCounts = fullSnap.bigram != null && fullSnap.bigram.perLang != null
? fullSnap.bigram.perLang.get(lang) : null;
if (fullCounts == null || fullCounts[1] == 0) continue;
sb.append(String.format(Locale.US, "%-14s", lang));
for (LengthEval le : allEvals) {
int[] c = le.bigram != null && le.bigram.perLang != null
? le.bigram.perLang.get(lang) : null;
if (c == null || c[1] == 0) {
sb.append(String.format(Locale.US, " %6s", "N/A"));
} else {
// Precision needs how often this language was predicted at this length
int predCount = le.bigram.predictedCounts != null
? le.bigram.predictedCounts.getOrDefault(lang, 0) : 0;
sb.append(String.format(Locale.US, " %s",
fmtPerLangF1(c[0], c[1], predCount)));
}
}
sb.append("\n");
}
sb.append("\n");
// ---- Per-language table for the last snapshot (expected to be "full") ----
LengthEval snap = allEvals.get(allEvals.size() - 1); // "full"
sb.append(String.format(Locale.US,
"Per-language macro F1 (%s):%n", snap.tag()));
sb.append(String.format(Locale.US,
"%-12s %7s %7s %7s %7s%n",
"Language", "CharSoup", "OpenNLP", "Lingua", "Optimaize"));
sb.append("-".repeat(58)).append("\n");
// Collect per-language TP, true-total, predicted-total for each detector
// [0]=bi-tp, [1]=bi-true, [2]=bi-pred,
// [3]=on-tp, [4]=on-true, [5]=on-pred,
// [6]=li-tp, [7]=li-true, [8]=li-pred,
// [9]=opt-tp, [10]=opt-true, [11]=opt-pred
Map<String, int[]> merged = new TreeMap<>();
mergeIntoFull(merged, snap.bigram, 0);
mergeIntoFull(merged, snap.opennlp, 3);
mergeIntoFull(merged, snap.lingua, 6);
mergeIntoFull(merged, snap.optimaize, 9);
for (var e : merged.entrySet()) {
int[] c = e.getValue();
// c[1] = bigram true-total; skip languages absent from the test set
if (c[1] == 0) {
continue;
}
sb.append(String.format(Locale.US,
"%-12s %s %s %s %s%n",
e.getKey(),
fmtPerLangF1(c[0], c[1], c[2]),
fmtPerLangF1(c[3], c[4], c[5]),
fmtPerLangF1(c[6], c[7], c[8]),
fmtPerLangF1(c[9], c[10], c[11])));
}
// ---- Top confusions for CharSoup: first snapshot (shortest, e.g. @20) and last ----
LengthEval short20 = allEvals.get(0);
appendConfusionTable(sb, short20.bigram, short20.tag());
appendConfusionTable(sb, snap.bigram, snap.tag());
return sb.toString();
}
/**
 * Appends the CharSoup "top confusions" table for one evaluation snapshot.
 * <p>
 * Languages are listed in ascending order of strict F1 and only those with F1 below
 * 95% are shown. For each listed language, up to seven predicted labels from its
 * recorded misses are printed, most frequent first. Nothing is appended when the
 * result, its confusion map, or its per-language counts are missing.
 *
 * @param sb destination buffer
 * @param result evaluation snapshot to summarise; may be {@code null}
 * @param tag length tag shown in the table heading
 */
private static void appendConfusionTable(StringBuilder sb, EvalResult result, String tag) {
    if (result == null || result.confusions == null || result.perLang == null) {
        return;
    }
    final int maxShown = 7;
    sb.append(String.format(Locale.US,
            "%nCharSoup top confusions (languages with F1 < 95%%, %s):%n", tag));
    sb.append(String.format(Locale.US, "%-12s %6s %s%n",
            "TrueLabel", "F1", "Top misclassifications (predicted \u2192 count)"));
    sb.append("-".repeat(72)).append("\n");

    // Compute every language's F1 exactly once; the same value drives both the sort
    // order and the 95% cut-off. Languages with no true examples get 1.0 so they sort
    // last (they are skipped when printing).
    List<Map.Entry<String, int[]>> perLangEntries =
            new ArrayList<>(result.perLang.entrySet());
    Map<String, Double> f1ByLang = new HashMap<>();
    for (Map.Entry<String, int[]> e : perLangEntries) {
        int tp = e.getValue()[0];
        int total = e.getValue()[1];
        double f1;
        if (total == 0) {
            f1 = 1.0;
        } else {
            int pred = result.predictedCounts != null
                    ? result.predictedCounts.getOrDefault(e.getKey(), 0) : 0;
            double rec = (double) tp / total;
            double prec = pred > 0 ? (double) tp / pred : 0.0;
            f1 = (prec + rec) > 0 ? 2.0 * prec * rec / (prec + rec) : 0.0;
        }
        f1ByLang.put(e.getKey(), f1);
    }
    perLangEntries.sort(Comparator.comparingDouble(e -> f1ByLang.get(e.getKey())));

    for (Map.Entry<String, int[]> e : perLangEntries) {
        if (e.getValue()[1] == 0) {
            continue;
        }
        double f1 = f1ByLang.get(e.getKey());
        if (f1 >= 0.95) {
            continue;
        }
        Map<String, Integer> cm = result.confusions.get(e.getKey());
        String confStr = " (no misses recorded)";
        if (cm != null && !cm.isEmpty()) {
            List<Map.Entry<String, Integer>> sorted = new ArrayList<>(cm.entrySet());
            // Descending by count; Integer.compare avoids the overflow-prone subtraction idiom.
            sorted.sort((a, b) -> Integer.compare(b.getValue(), a.getValue()));
            StringBuilder confSb = new StringBuilder();
            int limit = Math.min(maxShown, sorted.size());
            for (int i = 0; i < limit; i++) {
                if (i > 0) {
                    confSb.append(", ");
                }
                Map.Entry<String, Integer> ce = sorted.get(i);
                confSb.append(ce.getKey()).append("\u2192").append(ce.getValue());
            }
            confStr = confSb.toString();
        }
        sb.append(String.format(Locale.US, "%-12s %5.1f%% %s%n",
                e.getKey(), 100.0 * f1, confStr));
    }
}
/**
 * Copies the strict-correct and true-total counts of every language in {@code r.perLang}
 * into six-element rows of {@code merged}, at {@code offset} and {@code offset + 1}.
 * Rows are created on demand; a {@code null} result or {@code null} per-language map is ignored.
 */
private static void mergeInto(Map<String, int[]> merged, EvalResult r, int offset) {
    Map<String, int[]> perLang = (r == null) ? null : r.perLang;
    if (perLang == null) {
        return;
    }
    perLang.forEach((lang, counts) -> {
        int[] row = merged.computeIfAbsent(lang, unused -> new int[6]);
        row[offset] = counts[0]; // correct
        row[offset + 1] = counts[1]; // total
    });
}
/**
 * Folds one detector's per-language counts and predicted-label counts into
 * twelve-element rows of {@code merged}.
 * <p>
 * Layout at {@code offset}: {@code [tp, true-total, predicted-total]}. The first two
 * slots are overwritten from {@code r.perLang}; the third is incremented from
 * {@code r.predictedCounts}, which may introduce rows for labels that were predicted but
 * never appear as a true label. Nothing happens when {@code r} or {@code r.perLang} is
 * {@code null}.
 */
private static void mergeIntoFull(Map<String, int[]> merged, EvalResult r, int offset) {
    Map<String, int[]> perLang = (r == null) ? null : r.perLang;
    if (perLang == null) {
        return;
    }
    perLang.forEach((lang, counts) -> {
        int[] row = merged.computeIfAbsent(lang, unused -> new int[12]);
        row[offset] = counts[0]; // tp
        row[offset + 1] = counts[1]; // true total
    });
    Map<String, Integer> predicted = r.predictedCounts;
    if (predicted == null) {
        return;
    }
    predicted.forEach((lang, timesPredicted) -> {
        int[] row = merged.computeIfAbsent(lang, unused -> new int[12]);
        row[offset + 2] += timesPredicted; // predicted total
    });
}
/**
 * Formats a single language's F1 score as a percentage with two decimals.
 *
 * @param tp number of correct predictions for the language
 * @param trueTotal number of sentences whose true label is the language
 * @param predTotal number of sentences predicted as the language
 * @return the formatted percentage, or an "N/A" marker when {@code trueTotal} is zero
 */
private static String fmtPerLangF1(int tp, int trueTotal, int predTotal) {
    if (trueTotal == 0) {
        return " N/A";
    }
    double recall = (double) tp / trueTotal;
    double precision = 0.0;
    if (predTotal > 0) {
        precision = (double) tp / predTotal;
    }
    double denominator = precision + recall;
    double f1 = 0.0;
    if (denominator > 0) {
        f1 = 2.0 * precision * recall / denominator;
    }
    return String.format(Locale.US, "%6.2f%%", 100.0 * f1);
}
// NOTE: the four-detector table (macro F1 + accuracy + wall-clock times) is built by lengthTable(), defined further below.
/**
 * Breadth-weighted table: one row per length snapshot with the strict macro-F1 and
 * strict accuracy of each of the four detectors (as formatted by {@code fmtStrict} and
 * {@code fmtStrictAcc}).
 * <p>
 * NOTE(review): the decoration characters around the detector names in the first header
 * row appear mis-encoded in this source; they are reproduced unchanged - confirm the
 * intended glyphs.
 */
private static String strictLengthTable(List<LengthEval> evals) {
    final String groupHeaderFmt = "%-6s %-14s %-14s %-14s %-14s%n";
    final String columnHeaderFmt = "%-6s %6s %6s %6s %6s %6s %6s %6s %6s%n";
    final String rowFmt = "%-6s %s %s %s %s %s %s %s %s%n";
    StringBuilder table = new StringBuilder();
    table.append(String.format(Locale.US, groupHeaderFmt,
            "", "��� CharSoup ���", "��� OpenNLP ���", "������ Lingua ������", "��� Optimaize ���"));
    table.append(String.format(Locale.US, columnHeaderFmt,
            "Length", "mF1", "acc", "mF1", "acc", "mF1", "acc", "mF1", "acc"));
    table.append("-".repeat(70)).append("\n");
    for (LengthEval snapshot : evals) {
        String line = String.format(Locale.US, rowFmt,
                snapshot.tag(),
                fmtStrict(snapshot.bigram), fmtStrictAcc(snapshot.bigram),
                fmtStrict(snapshot.opennlp), fmtStrictAcc(snapshot.opennlp),
                fmtStrict(snapshot.lingua), fmtStrictAcc(snapshot.lingua),
                fmtStrict(snapshot.optimaize), fmtStrictAcc(snapshot.optimaize));
        table.append(line);
    }
    return table.toString();
}
/**
 * Formats the strict macro-F1 of a result as a percentage with two decimals, or an
 * "N/A" marker when the result is {@code null} or saw no sentences at all
 * (covered plus unsupported).
 */
private static String fmtStrict(EvalResult r) {
    boolean hasData = r != null && (r.total + r.unsupportedTotal) != 0;
    if (!hasData) {
        return " N/A ";
    }
    double percent = 100.0 * computeStrictMacroF1(r);
    return String.format(Locale.US, "%6.2f%%", percent);
}
/**
 * Formats the strict accuracy of a result as a percentage with two decimals, or an
 * "N/A" marker when the result is {@code null} or saw no sentences at all
 * (covered plus unsupported).
 */
private static String fmtStrictAcc(EvalResult r) {
    boolean hasData = r != null && (r.total + r.unsupportedTotal) != 0;
    if (!hasData) {
        return " N/A ";
    }
    double percent = 100.0 * computeStrictAccuracy(r);
    return String.format(Locale.US, "%6.2f%%", percent);
}
/**
 * Four-detector table: one row per length snapshot with macro F1 and accuracy for each
 * detector, each detector's elapsed milliseconds, and CharSoup throughput in sentences
 * per second.
 * <p>
 * A missing OpenNLP, Lingua or Optimaize result is reported with 0 ms; the CharSoup
 * result ({@code bigram}) is dereferenced without a null check. The {@code showLingua}
 * flag is not consulted by this method.
 * <p>
 * NOTE(review): the decoration characters around the detector names in the first header
 * row appear mis-encoded in this source; they are reproduced unchanged - confirm the
 * intended glyphs.
 */
private static String lengthTable(List<LengthEval> evals, boolean showLingua) {
    final String groupHeaderFmt = "%-6s %-14s %-14s %-14s %-14s %8s %8s %8s %8s %10s%n";
    final String columnHeaderFmt =
            "%-6s %6s %6s %6s %6s %6s %6s %6s %6s %8s %8s %8s %8s %10s%n";
    final String rowFmt = "%-6s %s %s %s %s %s %s %s %s %,8d %,8d %,8d %,8d %,10.0f%n";
    StringBuilder table = new StringBuilder();
    // Two header rows: detector names span the F1 + acc column pairs
    table.append(String.format(Locale.US, groupHeaderFmt,
            "", "��� CharSoup ���", "��� OpenNLP ���", "������ Lingua ������", "��� Optimaize ���",
            "CS(ms)", "ON(ms)", "Li(ms)", "Opt(ms)", "CS sent/s"));
    table.append(String.format(Locale.US, columnHeaderFmt,
            "Length", "mF1", "acc", "mF1", "acc", "mF1", "acc", "mF1", "acc",
            "", "", "", "", ""));
    table.append("-".repeat(118)).append("\n");
    for (LengthEval snapshot : evals) {
        EvalResult charSoup = snapshot.bigram;
        double sentPerSec = 0;
        if (charSoup.total > 0 && charSoup.elapsedMs > 0) {
            sentPerSec = charSoup.total / (charSoup.elapsedMs / 1000.0);
        }
        long openNlpMs = 0L;
        if (snapshot.opennlp != null) {
            openNlpMs = snapshot.opennlp.elapsedMs;
        }
        long linguaMs = 0L;
        if (snapshot.lingua != null) {
            linguaMs = snapshot.lingua.elapsedMs;
        }
        long optimaizeMs = 0L;
        if (snapshot.optimaize != null) {
            optimaizeMs = snapshot.optimaize.elapsedMs;
        }
        table.append(String.format(Locale.US, rowFmt,
                snapshot.tag(),
                fmtF1(snapshot.bigram), fmtPct(snapshot.bigram),
                fmtF1(snapshot.opennlp), fmtPct(snapshot.opennlp),
                fmtF1(snapshot.lingua), fmtPct(snapshot.lingua),
                fmtF1(snapshot.optimaize), fmtPct(snapshot.optimaize),
                charSoup.elapsedMs,
                openNlpMs,
                linguaMs,
                optimaizeMs,
                sentPerSec));
    }
    return table.toString();
}
/**
 * Two-detector shared table: one row per length snapshot with macro F1 and accuracy for
 * CharSoup and one other detector, both elapsed times, and CharSoup throughput.
 * <p>
 * NOTE(review): the decoration characters around the detector names in the first header
 * row appear mis-encoded in this source; they are reproduced unchanged - confirm the
 * intended glyphs.
 *
 * @param evals snapshots to tabulate
 * @param nameB heading for the CharSoup column group
 * @param nameC heading for the other detector's column group and its "(ms)" column
 * @param cField which result to read for the other detector: {@code "lingua"},
 *               {@code "optimaize"}, or anything else for OpenNLP
 * @return the formatted table
 */
private static String lengthTableTwoDetectors(List<LengthEval> evals,
        String nameB, String nameC,
        String cField) {
    final String groupHeaderFmt = "%-6s %-14s %-14s %8s %8s %10s%n";
    final String columnHeaderFmt = "%-6s %6s %6s %6s %6s %8s %8s %10s%n";
    final String rowFmt = "%-6s %s %s %s %s %,8d %,8d %,10.0f%n";
    StringBuilder table = new StringBuilder();
    table.append(String.format(Locale.US, groupHeaderFmt,
            "", "������ " + nameB + " ������", "������ " + nameC + " ������",
            "CS(ms)", nameC + "(ms)", "CS sent/s"));
    table.append(String.format(Locale.US, columnHeaderFmt,
            "Length", "mF1", "acc", "mF1", "acc", "", "", ""));
    table.append("-".repeat(72)).append("\n");
    for (LengthEval snapshot : evals) {
        // Pick the comparison detector's result for this snapshot.
        EvalResult other;
        switch (cField) {
            case "lingua":
                other = snapshot.lingua;
                break;
            case "optimaize":
                other = snapshot.optimaize;
                break;
            default:
                other = snapshot.opennlp;
                break;
        }
        EvalResult charSoup = snapshot.bigram;
        double sentPerSec = 0;
        if (charSoup.total > 0 && charSoup.elapsedMs > 0) {
            sentPerSec = charSoup.total / (charSoup.elapsedMs / 1000.0);
        }
        long otherMs = 0L;
        if (other != null) {
            otherMs = other.elapsedMs;
        }
        table.append(String.format(Locale.US, rowFmt,
                snapshot.tag(),
                fmtF1(snapshot.bigram), fmtPct(snapshot.bigram),
                fmtF1(other), fmtPct(other),
                charSoup.elapsedMs, otherMs, sentPerSec));
    }
    return table.toString();
}
// ---- Helpers ----
/**
 * Returns the sentences of {@code data} with each text limited to at most {@code maxLen}
 * UTF-16 code units.
 * <p>
 * When {@code maxLen} is {@link Integer#MAX_VALUE} the input list itself is returned
 * (no copy is made). Otherwise a new list of new {@code LabeledSentence} objects is
 * built. If the cut would fall between the two halves of a surrogate pair, the text is
 * shortened by one more unit so the result never ends in a lone high surrogate.
 *
 * @param data sentences to truncate
 * @param maxLen maximum text length in UTF-16 code units
 * @return the truncated sentences, or {@code data} itself for {@code Integer.MAX_VALUE}
 */
static List<LabeledSentence> truncate(List<LabeledSentence> data, int maxLen) {
    if (maxLen == Integer.MAX_VALUE) {
        return data;
    }
    List<LabeledSentence> result = new ArrayList<>(data.size());
    for (LabeledSentence s : data) {
        String text = s.getText();
        if (text.length() > maxLen) {
            int cut = maxLen;
            // Do not split a supplementary character: back off if the last kept unit is
            // a high surrogate whose low surrogate would be cut away.
            if (cut > 0
                    && Character.isHighSurrogate(text.charAt(cut - 1))
                    && Character.isLowSurrogate(text.charAt(cut))) {
                cut--;
            }
            text = text.substring(0, cut);
        }
        result.add(new LabeledSentence(s.getLanguage(), text));
    }
    return result;
}
/**
* FLORES-200 uses different ISO 639-3 codes for some languages than our training
* pipeline does (which follows Wikipedia dump naming). After stripping the script
* suffix we remap to the canonical code used in our model.
* <p>
* Keys are FLORES base codes; values are the codes used by the model.
* <p>
* Must stay in sync with {@code PrepareCorpus.LANG_MERGE_MAP} and
* {@code CommonTokenGenerator.LANG_MERGE_MAP}.
*/
private static final Map<String, String> FLORES_CODE_REMAP = Map.ofEntries(
Map.entry("arb", "ara"), // Modern Standard Arabic → Arabic
Map.entry("pes", "fas"), // Western Persian → Farsi
Map.entry("zsm", "msa"), // Standard Malay → Malay
Map.entry("lvs", "lav"), // Standard Latvian → Latvian
Map.entry("azj", "aze"), // North Azerbaijani → Azerbaijani
Map.entry("ekk", "est"), // Standard Estonian → Estonian
Map.entry("npi", "nep"), // Nepali (individual) → Nepali
Map.entry("als", "sqi"), // Tosk Albanian → Albanian
Map.entry("ory", "ori"), // Odia (macrolanguage) → Oriya
Map.entry("nor", "nob"), // Norwegian → Bokmål
Map.entry("cmn", "zho"), // Mandarin → Chinese
Map.entry("swa", "swh"), // Swahili (macrolanguage) → Swahili
Map.entry("yid", "ydd"), // Yiddish → Eastern Yiddish
Map.entry("gug", "grn"), // Paraguayan Guaraní → Guaraní
Map.entry("quz", "que"), // Cusco Quechua → Quechua
Map.entry("plt", "mlg"), // Plateau Malagasy → Malagasy (dropped in v5; kept for safety)
Map.entry("pbt", "pus"), // Southern Pashto → Pashto
Map.entry("uzn", "uzb"), // Northern Uzbek → Uzbek
Map.entry("kmr", "kur"), // Kurmanji Kurdish → Kurdish
Map.entry("khk", "mon") // Khalkha Mongolian → Mongolian
);
/**
 * Normalises a FLORES-200 language tag to the code used by the model.
 * <p>
 * Everything from the first underscore on is dropped ({@code zho_Hans} → {@code zho},
 * {@code ace_Arab} → {@code ace}); the remaining base code is then looked up in
 * {@code FLORES_CODE_REMAP}. Codes without an underscore and without a remap entry are
 * returned unchanged.
 *
 * @param lang FLORES tag or plain language code
 * @return the remapped base code, or the base code itself when no remap exists
 */
static String normalizeLang(String lang) {
    String base = lang;
    int cut = lang.indexOf('_');
    if (cut >= 0) {
        base = lang.substring(0, cut);
    }
    String canonical = FLORES_CODE_REMAP.get(base);
    return canonical != null ? canonical : base;
}
/**
 * Returns a new list holding, in their original order, only the sentences whose
 * language is contained in {@code langs}. The input list is not modified.
 *
 * @param data sentences to filter
 * @param langs language codes to keep
 * @return the retained sentences
 */
static List<LabeledSentence> filterByLangs(List<LabeledSentence> data,
        Set<String> langs) {
    List<LabeledSentence> kept = new ArrayList<>(data);
    kept.removeIf(sentence -> !langs.contains(sentence.getLanguage()));
    return kept;
}
/**
 * Best-effort reflective loader for an optional {@code LanguageDetector} implementation.
 * <p>
 * The class is instantiated through its no-argument constructor and
 * {@code loadModels()} is called on it. Any failure, including linkage errors raised
 * when the class or one of its dependencies is missing from the classpath, is reported
 * on standard error and turned into a {@code null} return so the caller can continue
 * without this detector.
 *
 * @param className fully qualified name of the detector class
 * @return the loaded detector, or {@code null} if it could not be loaded
 */
static LanguageDetector loadDetector(String className) {
    try {
        Class<?> clz = Class.forName(className);
        LanguageDetector detector = (LanguageDetector) clz
                .getDeclaredConstructor().newInstance();
        detector.loadModels();
        System.out.println(" Loaded: " + className);
        return detector;
    } catch (Exception | LinkageError e) {
        // A constructor failure arrives wrapped in InvocationTargetException, whose own
        // message is null; report the underlying cause instead.
        Throwable shown = e;
        if (e instanceof java.lang.reflect.InvocationTargetException && e.getCause() != null) {
            shown = e.getCause();
        }
        // Print the throwable itself so the exception type is part of the warning.
        System.err.println(" WARN: Could not load " + className + ": " + shown);
        return null;
    }
}
/**
 * Returns the index of the largest element, preferring the lowest index on ties.
 * An empty array yields 0.
 */
private static int argmax(float[] arr) {
    int winner = 0;
    int idx = 1;
    while (idx < arr.length) {
        if (arr[idx] > arr[winner]) {
            winner = idx;
        }
        idx++;
    }
    return winner;
}
/**
 * Requests garbage collection three times, pauses for 100 ms, and returns the
 * approximate number of heap bytes in use (total minus free memory). If the pause is
 * interrupted, the thread's interrupt flag is restored and the measurement proceeds.
 */
private static long usedHeap() {
    final Runtime runtime = Runtime.getRuntime();
    int passesLeft = 3;
    while (passesLeft-- > 0) {
        runtime.gc();
    }
    try {
        Thread.sleep(100);
    } catch (InterruptedException ignored) {
        Thread.currentThread().interrupt();
    }
    long total = runtime.totalMemory();
    long free = runtime.freeMemory();
    return total - free;
}
/**
 * Results for one evaluation length threshold across all detectors.
 * A threshold of {@link Integer#MAX_VALUE} denotes untruncated text.
 */
static class LengthEval {
    /** Length threshold this snapshot was evaluated at. */
    final int maxLen;
    /** CharSoup result (reported under the "CharSoup" heading). */
    EvalResult bigram;
    /** OpenNLP result; may be absent. */
    EvalResult opennlp;
    /** Lingua result; may be absent. */
    EvalResult lingua;
    /** Optimaize result; may be absent. */
    EvalResult optimaize;

    LengthEval(int maxLen) {
        this.maxLen = maxLen;
    }

    /** Short label for report rows: {@code "full"} or {@code "@" + maxLen}. */
    String tag() {
        if (maxLen != Integer.MAX_VALUE) {
            return "@" + maxLen;
        }
        return "full";
    }
}
/**
 * Reflective wrapper around {@code com.github.pemistahl.lingua.api.LanguageDetectorBuilder}.
 * Loaded optionally so that CompareDetectors runs even when Lingua is not on the classpath.
 * Uses low-accuracy mode (~300 MB) instead of high-accuracy (~3.5 GB).
 */
static class LinguaWrapper {
    /**
     * Lower-cased ISO 639-3 codes of every constant of Lingua's {@code Language} enum
     * except {@code UNKNOWN}; unmodifiable.
     */
    final Set<String> supportedLangs;
    private final Object detector;
    private final Method detectMethod;
    private final Method getIso3Method;

    private LinguaWrapper(Object detector, Method detectMethod,
            Method getIso3Method, Set<String> supportedLangs) {
        this.detector = detector;
        this.detectMethod = detectMethod;
        this.getIso3Method = getIso3Method;
        this.supportedLangs = Collections.unmodifiableSet(supportedLangs);
    }

    /**
     * Builds a low-accuracy-mode Lingua detector through reflection.
     * <p>
     * Any failure, including linkage errors raised when Lingua or one of its
     * dependencies is missing, is reported on standard error and turned into a
     * {@code null} return.
     *
     * @return the wrapper, or {@code null} if Lingua could not be loaded
     */
    static LinguaWrapper load() {
        try {
            Class<?> builderClass = Class.forName(
                    "com.github.pemistahl.lingua.api.LanguageDetectorBuilder");
            Object builder = builderClass.getMethod("fromAllLanguages").invoke(null);
            builder = builder.getClass().getMethod("withLowAccuracyMode").invoke(builder);
            Object det = builder.getClass().getMethod("build").invoke(builder);
            // Locate detectLanguageOf by name and arity rather than by exact parameter
            // type (Kotlin compiles the parameter as CharSequence).
            Method detectM = null;
            for (Method m : det.getClass().getMethods()) {
                if ("detectLanguageOf".equals(m.getName())
                        && m.getParameterCount() == 1) {
                    detectM = m;
                    break;
                }
            }
            if (detectM == null) {
                throw new NoSuchMethodException("detectLanguageOf not found on "
                        + det.getClass());
            }
            // Enumerate supported ISO 639-3 codes from the Language enum
            Class<?> langEnumClass = Class.forName(
                    "com.github.pemistahl.lingua.api.Language");
            Method getIso3M = langEnumClass.getMethod("getIsoCode639_3");
            Set<String> langs = new HashSet<>();
            for (Object lang : langEnumClass.getEnumConstants()) {
                if ("UNKNOWN".equals(lang.toString())) {
                    continue;
                }
                String code = getIso3M.invoke(lang).toString().toLowerCase(Locale.ROOT);
                if (!"none".equals(code)) {
                    langs.add(code);
                }
            }
            // Heap usage is not measured here, so no heap figure is printed.
            System.out.printf(Locale.US,
                    " Loaded Lingua (low accuracy mode, %d languages)%n",
                    langs.size());
            return new LinguaWrapper(det, detectM, getIso3M, langs);
        } catch (Exception | LinkageError e) {
            // Reflective calls wrap the real failure in InvocationTargetException, whose
            // own message is null; report the underlying cause instead.
            Throwable shown = e;
            if (e instanceof java.lang.reflect.InvocationTargetException
                    && e.getCause() != null) {
                shown = e.getCause();
            }
            // Print the throwable itself so the exception type is part of the warning.
            System.err.println(" WARN: Could not load Lingua: " + shown);
            return null;
        }
    }

    /**
     * Detects the language of {@code text}.
     *
     * @param text text to classify
     * @return the lower-cased ISO 639-3 code, or {@code "unk"} when Lingua reports
     *         {@code UNKNOWN}, returns nothing, or the reflective call fails
     */
    String detect(String text) {
        try {
            Object lang = detectMethod.invoke(detector, text);
            if (lang == null || "UNKNOWN".equals(lang.toString())) {
                return "unk";
            }
            return getIso3Method.invoke(lang).toString().toLowerCase(Locale.ROOT);
        } catch (Exception e) {
            // Deliberately best-effort: a failed detection counts as "unknown" so one
            // bad sentence cannot abort the whole evaluation run.
            return "unk";
        }
    }
}
/**
 * Mutable accumulator for one detector's evaluation counts.
 * The map fields start out {@code null} and are assigned by the evaluation routines;
 * only the set of unsupported languages is initialised eagerly.
 */
static class EvalResult {
    /** Label of the detector / run this result belongs to. */
    String name;
    /** Sentences whose prediction equals the true label. */
    int correct;
    /** Sentences counted as correct when a confusable-group match is also accepted. */
    int correctGroup;
    /** Sentences in covered languages only. */
    int total;
    /** Sentences whose true language is not in the detector's supported set. */
    int unsupportedTotal;
    /** Distinct language codes seen from unsupported-language sentences. */
    Set<String> unsupportedLangsSeen;
    /** Elapsed evaluation time in milliseconds. */
    long elapsedMs;
    /** Per-language counts: [strict correct, total true, group correct]. */
    Map<String, int[]> perLang;
    /** How many times each label was predicted (for precision / macro F1). */
    Map<String, Integer> predictedCounts;
    /** For each true label: how many times each predicted label appeared on misses. */
    Map<String, Map<String, Integer>> confusions;

    EvalResult(String name) {
        this.name = name;
        this.unsupportedLangsSeen = new HashSet<>();
    }
}
}