CharSoupModelRoutingTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.Test;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.parser.ParseContext;
/**
* Correctness and routing tests for {@link CharSoupLanguageDetector}.
* <p>
* Covers two concerns:
* <ol>
* <li>Basic language detection &mdash; short snippets that must be classified
* correctly, including script-gate regressions (TIKA-4662).</li>
* <li>Model routing logic &mdash; verifying that {@link CharSoupLanguageDetector.Strategy},
* {@link CharSoupDetectorConfig}, and the two automatic gates (length and
* feature-density) route to the correct model.</li>
* </ol>
* If any detection assertion fails after a model retrain, investigate the
* training data rather than weakening the assertion.
*/
public class CharSoupModelRoutingTest {

    /**
     * Skip message shared by the tests that need the optional short-text model.
     * The dash is written as a Unicode escape so the message does not depend on
     * the encoding the source file is compiled with.
     */
    private static final String SHORT_TEXT_MODEL_MISSING =
            "Short-text model not loaded \u2014 skipping routing test";

    // -----------------------------------------------------------------------
    // Basic detection tests
    // -----------------------------------------------------------------------

    @Test
    public void testEnglishShortText() {
        assertDetects("eng", "The children are playing in the park");
    }

    @Test
    public void testFrenchShortText() {
        assertDetects("fra", "Le chat est sur le tapis et");
    }

    @Test
    public void testGermanShortText() {
        // Sharp s is a Unicode escape so the input is independent of the source encoding.
        assertDetects("deu", "Der Hund sa\u00df auf der Matte");
    }

    @Test
    public void testSpanishShortText() {
        // n-tilde and i-acute are Unicode escapes for the same reason as the German test.
        assertDetects("spa", "El ni\u00f1o juega en el jard\u00edn con sus amigos");
    }

    @Test
    public void testJapaneseShortText() {
        // NOTE(review): this literal looks mis-encoded (replacement characters only).
        // The intended Japanese sentence cannot be recovered from this file; restore it
        // from version control, ideally as Unicode escapes.
        assertDetects("jpn", "������������������������������");
    }

    @Test
    public void testKoreanShortText() {
        // NOTE(review): this literal looks mis-encoded (replacement characters only).
        // The intended Korean sentence cannot be recovered from this file; restore it
        // from version control, ideally as Unicode escapes.
        assertDetects("kor", "��������� ������������������");
    }

    /**
     * TIKA-4662 regression: UCAS (Inuktitut syllabics) text must NOT be
     * classified as English. The inference-time script gate should zero out
     * all Latin-script classes when the input is predominantly UCAS.
     */
    @Test
    public void testUcasTextNotEnglish() {
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
        // NOTE(review): this literal looks mis-encoded (replacement characters only).
        // With that input the test no longer exercises the UCAS script gate; restore the
        // original syllabics text from version control.
        detector.addText("������������ ���������������������������");
        List<LanguageResult> results = detector.detectAll();
        assertTrue(results.size() > 0, "detectAll returned no results");
        assertNotEquals("eng", results.get(0).getLanguage(),
                "UCAS text must not be classified as English");
    }

    // -----------------------------------------------------------------------
    // Model-routing / switching-logic tests
    // -----------------------------------------------------------------------

    /**
     * Strategy.STANDARD must always use the general model, even for short text.
     * The check is that the top result's language belongs to the general
     * model's label set.
     */
    @Test
    public void testStrategyStandardAlwaysUsesGeneralModel() {
        CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(
                Map.of("strategy", "STANDARD"));
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
        // A short phrase: STANDARD is expected to force the general model no matter
        // what the automatic length gate would have decided for this input.
        detector.addText("The children are playing");
        assertTopLabelFrom(CharSoupLanguageDetector.MODEL.getLabels(), detector.detectAll(),
                "STANDARD strategy must return a label from the general model");
    }

    /**
     * Strategy.SHORT_TEXT must use the short-text model when it is loaded.
     * Skipped if the short-text model binary is absent from the classpath.
     */
    @Test
    public void testStrategyShortTextUsesShortModel() {
        assumeShortTextModelLoaded();
        CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(
                Map.of("strategy", "SHORT_TEXT"));
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
        // Long input, but the SHORT_TEXT strategy forces the short-text model regardless.
        detector.addText("The quick brown fox jumps over the lazy dog and then some more words");
        assertTopLabelFrom(CharSoupLanguageDetector.SHORT_TEXT_MODEL.getLabels(),
                detector.detectAll(),
                "SHORT_TEXT strategy must return a label from the short-text model");
    }

    /**
     * Strategy.AUTOMATIC with text shorter than the length threshold should
     * route to the short-text model when it is loaded.
     */
    @Test
    public void testAutomaticRoutesShortInputToShortModel() {
        assumeShortTextModelLoaded();
        // Use a threshold larger than our input so the length gate fires.
        int threshold = 500;
        CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of(
                "strategy", "AUTOMATIC",
                "lengthThreshold", threshold,
                "featureThreshold", 0)); // disable feature gate
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
        // Text is well below the threshold.
        detector.addText("The children are playing");
        assertTopLabelFrom(CharSoupLanguageDetector.SHORT_TEXT_MODEL.getLabels(),
                detector.detectAll(),
                "AUTOMATIC with short input must route to short-text model");
    }

    /**
     * Strategy.AUTOMATIC with text longer than the length threshold and above the
     * feature threshold should route to the general model.
     */
    @Test
    public void testAutomaticRoutesLongInputToGeneralModel() {
        // Use thresholds of 0 so neither gate fires.
        CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of(
                "strategy", "AUTOMATIC",
                "lengthThreshold", 0,
                "featureThreshold", 0));
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
        detector.addText("The children are playing in the park on a sunny afternoon");
        assertTopLabelFrom(CharSoupLanguageDetector.MODEL.getLabels(), detector.detectAll(),
                "AUTOMATIC with both gates disabled must route to general model");
    }

    /**
     * Degenerate input: 1000 spaces followed by one short sentence.
     * The feature-density gate must fire and route to the short-text model,
     * even though the raw character length far exceeds the length threshold.
     */
    @Test
    public void testFeatureDensityGateCatchesDegenerateInput() {
        assumeShortTextModelLoaded();
        CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of(
                "strategy", "AUTOMATIC",
                "lengthThreshold", 10, // tiny: the length gate won't fire on ~1 KB of input
                "featureThreshold", 1000)); // large: ensures the feature gate fires
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
        // Spaces + a real sentence: length >> lengthThreshold, emissions << featureThreshold.
        String degenerate = " ".repeat(1000) + "The children are playing in the park";
        detector.addText(degenerate);
        assertTopLabelFrom(CharSoupLanguageDetector.SHORT_TEXT_MODEL.getLabels(),
                detector.detectAll(),
                "Degenerate input (whitespace + short sentence) must route to short-text model "
                        + "via feature-density gate");
    }

    /**
     * ParseContext injection: a per-document config supplied via ParseContext
     * must override the detector's constructed config for that document only.
     */
    @Test
    public void testParseContextConfigInjection() {
        // Detector constructed with its default configuration.
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
        // Inject a STANDARD override via ParseContext.
        ParseContext ctx = new ParseContext();
        ctx.set(CharSoupDetectorConfig.class,
                CharSoupDetectorConfig.fromMap(Map.of("strategy", "STANDARD")));
        detector.reset(ctx);
        detector.addText("The children are playing in the park");
        // Must have come from the general model.
        assertTopLabelFrom(CharSoupLanguageDetector.MODEL.getLabels(), detector.detectAll(),
                "ParseContext-injected STANDARD config must use general model");

        // A plain reset() is expected to drop the injected override. The active config is
        // not observable from here, so this only verifies that detection still yields a
        // usable result afterwards.
        detector.reset();
        detector.addText("The children are playing in the park");
        List<LanguageResult> afterReset = detector.detectAll();
        assertNotNull(afterReset, "detectAll must not return null after reset()");
        assertTrue(afterReset.size() > 0, "detectAll returned no results after reset()");
    }

    // -----------------------------------------------------------------------
    // Helpers
    // -----------------------------------------------------------------------

    /**
     * Runs a default-configured detector over {@code text} and asserts that the
     * top-ranked result is {@code expectedLang}.
     *
     * @param expectedLang language code the top result must carry
     * @param text input handed to the detector
     */
    private static void assertDetects(String expectedLang, String text) {
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
        detector.addText(text);
        List<LanguageResult> results = detector.detectAll();
        assertTrue(results.size() > 0,
                "detectAll returned no results for: " + text);
        LanguageResult top = results.get(0);
        assertEquals(expectedLang, top.getLanguage(),
                String.format(Locale.US,
                        "Expected '%s' but got '%s' (score=%.3f) for: %s",
                        expectedLang, top.getLanguage(), top.getRawScore(), text));
    }

    /**
     * Asserts that {@code results} is non-empty and that the language of its
     * first entry is one of {@code modelLabels}.
     *
     * @param modelLabels label set of the model the input should have been routed to
     * @param results output of {@code detectAll()}
     * @param message failure message for the label-membership check
     */
    private static void assertTopLabelFrom(String[] modelLabels, List<LanguageResult> results,
                                           String message) {
        assertTrue(results.size() > 0, "detectAll returned no results");
        Set<String> labels = Arrays.stream(modelLabels).collect(Collectors.toSet());
        assertTrue(labels.contains(results.get(0).getLanguage()), message);
    }

    /** Skips the calling test when the short-text model was not loaded. */
    private static void assumeShortTextModelLoaded() {
        Assumptions.assumeTrue(CharSoupLanguageDetector.SHORT_TEXT_MODEL != null,
                SHORT_TEXT_MODEL_MISSING);
    }
}