// CharSoupModelRoutingTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.junit.jupiter.api.Test;

import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.parser.ParseContext;

/**
 * Correctness and strategy-routing tests for {@link CharSoupLanguageDetector}.
 *
 * <p>The first group feeds short single-language snippets to a default-configured
 * detector and checks the top-ranked language code. The second group builds
 * detectors from a {@link CharSoupDetectorConfig} (or injects one through a
 * {@link ParseContext}) and checks that a result is produced and, where asserted,
 * that the top label is one of {@code CharSoupLanguageDetector.MODEL}'s labels.
 *
 * <p>Non-ASCII Latin fixtures are written with Unicode escapes so they survive a
 * wrong source-file encoding.
 */
public class CharSoupModelRoutingTest {

    /** Sample sentence shared by the strategy tests that need a longer input. */
    private static final String LONG_ENGLISH =
            "The children are playing in the park on a sunny afternoon";

    /** Sample sentence shared by the ParseContext injection test. */
    private static final String SHORT_ENGLISH = "The children are playing in the park";

    // -----------------------------------------------------------------------
    // Short-text correctness tests
    // -----------------------------------------------------------------------

    @Test
    public void testEnglishShortText() {
        assertDetects("eng", "The children are playing in the park");
    }

    @Test
    public void testFrenchShortText() {
        assertDetects("fra", "Le chat est sur le tapis et");
    }

    @Test
    public void testGermanShortText() {
        // \u00df is LATIN SMALL LETTER SHARP S; the literal previously held
        // U+FFFD replacement characters in its place.
        assertDetects("deu", "Der Hund sa\u00df auf der Matte");
    }

    @Test
    public void testSpanishShortText() {
        // \u00f1 is n-with-tilde and \u00ed is i-with-acute; both were
        // previously corrupted to U+FFFD replacement characters.
        assertDetects("spa", "El ni\u00f1o juega en el jard\u00edn con sus amigos");
    }

    @Test
    public void testJapaneseShortText() {
        // NOTE(review): this literal consists of U+FFFD replacement characters;
        // the original Japanese text appears to have been lost to an encoding
        // error and cannot be reconstructed here. Restore it from a clean copy.
        assertDetects("jpn", "������������������������������");
    }

    /** Mixed kanji + hiragana + katakana - typical real Japanese. */
    @Test
    public void testJapaneseMixedScript() {
        // NOTE(review): literal is all U+FFFD replacement characters; restore
        // the original mixed-script text from a clean copy of this file.
        assertDetects("jpn", "������������������������������������������������������������������������������������������������");
    }

    @Test
    public void testKoreanShortText() {
        // NOTE(review): literal is U+FFFD replacement characters plus one space;
        // restore the original Korean text from a clean copy of this file.
        assertDetects("kor", "��������� ������������������");
    }

    /**
     * Checks only that the top result is not {@code "eng"}; no specific target
     * language is asserted.
     */
    @Test
    public void testUcasTextNotEnglish() {
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
        // NOTE(review): literal is U+FFFD replacement characters plus one space;
        // the intended UCAS sample must be restored from a clean copy.
        detector.addText("������������ ���������������������������");
        List<LanguageResult> results = detector.detectAll();
        assertTrue(results.size() > 0, "detectAll returned no results");
        assertNotEquals("eng", results.get(0).getLanguage(),
                "UCAS text must not be classified as English");
    }

    // -----------------------------------------------------------------------
    // Strategy tests
    // -----------------------------------------------------------------------

    @Test
    public void testStrategyStandardNoGlm() {
        List<LanguageResult> results =
                detectWithStrategy("STANDARD", "The children are playing");
        assertHasResults(results, "STANDARD strategy");
        assertTrue(modelLabels().contains(results.get(0).getLanguage()),
                "STANDARD strategy must return a label from the discriminative model");
    }

    @Test
    public void testStrategyGlm() {
        List<LanguageResult> results = detectWithStrategy("GLM", LONG_ENGLISH);
        assertHasResults(results, "GLM strategy");
        assertTrue(modelLabels().contains(results.get(0).getLanguage()),
                "GLM strategy must return a label from the discriminative model");
    }

    /** Only verifies that the AUTOMATIC strategy yields at least one result. */
    @Test
    public void testStrategyAutomatic() {
        List<LanguageResult> results = detectWithStrategy("AUTOMATIC", LONG_ENGLISH);
        assertHasResults(results, "AUTOMATIC strategy");
    }

    /**
     * Injects a STANDARD config through {@link ParseContext} via
     * {@code reset(ParseContext)}, checks the top label against the model's
     * labels, then calls the no-argument {@code reset()} and checks that
     * detection still returns a non-null list.
     */
    @Test
    public void testParseContextConfigInjection() {
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();

        ParseContext ctx = new ParseContext();
        ctx.set(CharSoupDetectorConfig.class,
                CharSoupDetectorConfig.fromMap(Map.of("strategy", "STANDARD")));
        detector.reset(ctx);

        detector.addText(SHORT_ENGLISH);
        List<LanguageResult> results = detector.detectAll();
        assertHasResults(results, "ParseContext-injected STANDARD config");
        assertTrue(modelLabels().contains(results.get(0).getLanguage()),
                "ParseContext-injected STANDARD config must use general model");

        detector.reset();
        detector.addText(SHORT_ENGLISH);
        assertNotNull(detector.detectAll(),
                "detectAll must not return null after a plain reset()");
    }

    // -----------------------------------------------------------------------
    // Helpers
    // -----------------------------------------------------------------------

    /**
     * Builds a detector configured with the given strategy name, feeds it
     * {@code text}, and returns everything {@code detectAll()} produces.
     *
     * @param strategy value stored under the {@code "strategy"} config key
     * @param text     input passed to {@code addText}
     * @return the detector's result list, unmodified
     */
    private static List<LanguageResult> detectWithStrategy(String strategy, String text) {
        CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(
                Map.of("strategy", strategy));
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
        detector.addText(text);
        return detector.detectAll();
    }

    /** Returns the labels of {@code CharSoupLanguageDetector.MODEL} as a set. */
    private static Set<String> modelLabels() {
        return Arrays.stream(CharSoupLanguageDetector.MODEL.getLabels())
                .collect(Collectors.toSet());
    }

    /**
     * Fails with a descriptive message when {@code results} is null or empty.
     *
     * @param results list returned by {@code detectAll()}
     * @param context short description of the scenario, used in the message
     */
    private static void assertHasResults(List<LanguageResult> results, String context) {
        assertNotNull(results, context + ": detectAll returned null");
        assertTrue(results.size() > 0, context + ": detectAll returned no results");
    }

    /**
     * Runs a default-configured detector over {@code text} and asserts that the
     * top-ranked result carries {@code expectedLang}.
     *
     * @param expectedLang language code expected for the first result
     * @param text         input passed to {@code addText}
     */
    private static void assertDetects(String expectedLang, String text) {
        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
        detector.addText(text);
        List<LanguageResult> results = detector.detectAll();

        assertTrue(results.size() > 0,
                "detectAll returned no results for: " + text);

        LanguageResult top = results.get(0);
        // Locale.US keeps the decimal separator stable in the failure message.
        assertEquals(expectedLang, top.getLanguage(),
                String.format(Locale.US,
                        "Expected '%s' but got '%s' (score=%.3f) for: %s",
                        expectedLang, top.getLanguage(), top.getRawScore(), text));
    }
}