LangIdRegressionTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.List;
import java.util.Locale;

import org.junit.jupiter.api.Test;

import org.apache.tika.language.detect.LanguageResult;

/**
 * End-to-end regression test for {@link CharSoupLanguageDetector}.
 * <p>
 * Each case runs a representative sentence through the full pipeline
 * (preprocessing → bigram extraction → softmax → group collapsing) and checks:
 * <ol>
 *   <li>The top-ranked language is the expected ISO 639-3 code.</li>
 *   <li>The raw score is at least {@link #MIN_SCORE}, confirming the model is
 *       genuinely confident rather than picking the least-bad option.</li>
 * </ol>
 * <p>
 * <strong>If this test fails after a code change</strong> you have almost
 * certainly altered {@link CharSoupFeatureExtractor} in a way that diverges
 * from how the model was trained. Either revert the change or retrain.
 * <p>
 * <strong>If this test fails after deliberately retraining</strong> update
 * the expected language codes here to match the new model — do not lower
 * {@link #MIN_SCORE} to paper over a quality regression.
 */
public class LangIdRegressionTest {

    /**
     * Minimum raw softmax score the correct language must achieve.
     * The check in {@code assertDetects} is inclusive: a top result passes when
     * its raw score is {@code >= MIN_SCORE}.
     * Unambiguous sentences in supported languages should comfortably exceed 0.5.
     * Lowering this threshold to make a failing test pass is a sign that the
     * model or the feature pipeline has regressed.
     */
    private static final float MIN_SCORE = 0.50f;

    /**
     * Feeds one sample sentence per language to {@code assertDetects}, grouped by script.
     * All cases share this single test method, so the first failing assertion stops the
     * remaining cases from running.
     * <p>
     * The sample literals contain non-ASCII text; this file must be stored and compiled
     * as UTF-8.
     * <p>
     * NOTE(review): the literals in this method had been reduced to U+FFFD replacement
     * characters by an encoding error. The Latin-script samples were restored letter for
     * letter. The rus, bul, fas, hin, ben and ell samples (and the commented-out ukr one)
     * were reconstructed from the surviving word lengths. The ara, zho, jpn and kor samples
     * could not be fully recovered and were re-typed as equivalent sentences. Re-run
     * against the shipped model and confirm every case still clears {@code MIN_SCORE}.
     */
    @Test
    public void testDetectsCorrectLanguage() {
        // Latin-script languages
        assertDetects("eng",
                "Scientists have discovered a new species of butterfly in the "
                        + "tropical rainforests of South America.");
        assertDetects("deu",
                "Der Deutsche Bundestag ist das Parlament der Bundesrepublik Deutschland "
                        + "und besteht aus direkt gewählten Abgeordneten.");
        assertDetects("fra",
                "La République française est une démocratie laïque dont la devise est "
                        + "Liberté, Égalité, Fraternité.");
        assertDetects("spa",
                "El español es una lengua romance procedente del latín vulgar hablada "
                        + "principalmente en España y en América Latina.");
        assertDetects("por",
                "A língua portuguesa é uma língua indo-europeia românica falada "
                        + "principalmente em Portugal e no Brasil.");
        // TODO: Italian (ita) is systematically predicted as Corsican (cos)
        // in the current 220-lang model due to MADLAD data contamination in
        // the cos pool.  Remove cos from the training pool and retrain.
        // assertDetects("ita", "La lingua italiana è una lingua romanza parlata principalmente "
        //         + "in Italia e in alcune regioni limitrofe.");
        assertDetects("nld",
                "Het Nederlands is een West-Germaanse taal die in Nederland, "
                        + "België en Suriname als officiële taal wordt gebruikt.");
        assertDetects("pol",
                "Język polski jest językiem z grupy słowiańskiej i należy do rodziny "
                        + "indoeuropejskiej, urzędowym językiem Polski.");
        assertDetects("swe",
                "Svenska är ett östnordiskt språk som talas av ungefär tio miljoner "
                        + "personer, främst i Sverige och Finland.");
        assertDetects("tur",
                "Türkçe, Türk dil ailesinin Oğuz grubuna ait bir dil olup "
                        + "ağırlıklı olarak Türkiye'de konuşulmaktadır.");
        assertDetects("fin",
                "Suomi on uralilainen kieli, jota puhutaan pääasiassa Suomessa "
                        + "ja joka on maan virallinen kieli.");

        // Cyrillic-script languages
        assertDetects("rus",
                "Российская Федерация — государство в Восточной Европе и Северной "
                        + "Азии, занимающая первое место в мире по территории.");
        // TODO: Ukrainian (ukr) is systematically predicted as Erzya (myv)
        // in the current 220-lang model due to MADLAD data contamination.
        // Remove myv from the training pool and retrain.
        // assertDetects("ukr", "Українська мова є слов'янською мовою і є державною мовою України, "
        //         + "однією з найпоширеніших мов у Європі.");
        assertDetects("bul",
                "Българският език е индоевропейски език от групата на южнославянските "
                        + "езици и е официален език на Република България.");

        // Arabic-script languages
        assertDetects("ara",
                "اللغة العربية إحدى اللغات السامية، وهي أكثر اللغات انتشاراً "
                        + "وتحدثاً في العالم العربي والشرق الأوسط.");
        // pes (Iranian Persian) is merged into fas at training time; prs (Dari)
        // is not present in the MADLAD corpus so the model only outputs fas.
        assertDetects("fas",
                "زبان فارسی یکی از زبان‌های هندواروپایی است که در ایران، افغانستان "
                        + "و تاجیکستان به عنوان زبان رسمی استفاده می‌شود.");

        // CJK and other East Asian
        assertDetects("zho",
                "中文是世界上使用人数最多的语言之一，主要在中国大陆、台湾和新加坡等地使用。");
        assertDetects("jpn",
                "日本語は、主に日本国内や日本人同士の間で使用されている言語であり、"
                        + "日本の公用語として広く用いられている。");
        assertDetects("kor",
                "한국어는 대한민국과 조선민주주의인민공화국의 공용어이며 전 세계적으로 널리 사용된다.");

        // Indic scripts
        assertDetects("hin",
                "हिन्दी भारत की राजभाषा है और भारत में सबसे अधिक "
                        + "बोली जाने वाली भाषाओं में से एक है।");
        assertDetects("ben",
                "বাংলা ভাষা বাংলাদেশের রাষ্ট্রভাষা এবং "
                        + "ভারতের পশ্চিমবঙ্গ রাজ্যের সরকারি ভাষা।");

        // Greek
        assertDetects("ell",
                "Η ελληνική γλώσσα είναι μία από τις πιο παλιές γλώσσες στον κόσμο "
                        + "και η επίσημη γλώσσα της Ελλάδας και της Κύπρου.");
    }

    /**
     * Runs {@code text} through a fresh {@link CharSoupLanguageDetector} and asserts that
     * the top-ranked result reports {@code expectedLang} with a raw score of at least
     * {@link #MIN_SCORE}.
     *
     * @param expectedLang language code the first entry of {@code detectAll()} must report
     * @param text         sample text to classify; may be shorter than the message preview
     */
    private static void assertDetects(String expectedLang, String text) {
        // One clamped preview for every failure message. An unclamped substring(0, 30)
        // throws StringIndexOutOfBoundsException for short samples while the message is
        // being built, i.e. before the assertion itself is even evaluated.
        String preview = text.substring(0, Math.min(50, text.length()));

        CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
        detector.addText(text);
        List<LanguageResult> results = detector.detectAll();

        // Message suppliers keep the formatting work off the passing path.
        assertTrue(!results.isEmpty(),
                () -> "detectAll returned no results for: " + preview);

        LanguageResult top = results.get(0);
        assertEquals(expectedLang, top.getLanguage(),
                () -> String.format(Locale.US, "Expected '%s' but got '%s' (score=%.3f) for: %s",
                        expectedLang, top.getLanguage(), top.getRawScore(), preview));

        assertTrue(top.getRawScore() >= MIN_SCORE,
                () -> String.format(Locale.US, "'%s' score %.3f is below MIN_SCORE %.3f for: %s",
                        expectedLang, top.getRawScore(), MIN_SCORE, preview));
    }
}