// SjisLangSignalTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import java.nio.charset.Charset;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import org.apache.tika.langdetect.opennlp.OpenNLPDetector;
import org.apache.tika.language.detect.LanguageResult;

/**
 * Deep-dive: do language detectors give a stronger Japanese signal for the
 * correct Shift-JIS decoding of a zip entry name than for plausible single-byte
 * alternatives?
 *
 * Raw bytes (testZipEntryNameCharsetShiftSJIS.zip, entry "文章1.txt"):
 *   0x95 0xb6 0x8f 0xcd 0x31 0x2e 0x74 0x78 0x74
 *
 * Shift_JIS    →  文章1.txt   (correct)
 * windows-1257 →  •¶¸Ķ1.txt  (plausible single-byte alternative)
 *
 * <h2>Findings (Mar 2026)</h2>
 * <p>
 * At 9 raw bytes, both CharSoup and OpenNLP return pure noise — all scores are
 * NONE confidence (~0.007). The detected language at this length is entirely a
 * hash-collision artifact: only 4 or 5 letters survive the isLetter() filter
 * (Ķ, t, x, t from windows-1257; 文, 章, t, x, t from Shift-JIS), producing
 * 6-7 bigrams out of 65,536 buckets. Whichever language has the highest weights
 * in those specific buckets "wins" — no linguistic signal whatsoever.
 * </p>
 * <p>
 * Padding/tiling the bytes (to 100 or 300 bytes) makes things actively worse:
 * CharSoup becomes confidently wrong. Tiled Shift-JIS ("文章1.txt文章1.txt…") is
 * detected as Chinese (zho, maxLogit = -8 at 100 bytes, -29 at 300 bytes) because:
 * (a) 文章 IS valid Chinese text, and (b) the repetition creates extreme bigram
 * entropy that the model interprets as non-language content, collapsing the logit.
 * Meanwhile windows-1250 ("•¶ŹÍ1.txt•¶ŹÍ1.txt…") scores maxLogit = +9 to +26 as
 * Catalan from the repeated Latin diacritics. Language detection is fundamentally
 * unsuitable for short charset arbitration.
 * </p>
 *
 * <h2>Historical context: why ICU4J and Universal needed padding</h2>
 * <p>
 * ICU4J (CharsetRecog_sjis) uses double-byte sequence counting and a common-char
 * frequency table. At 9 bytes it sees only 2 double-byte chars (≤ 10 threshold)
 * and returns confidence=10 — too low to win. Even with tiling, 文章 (0x95B6,
 * 0x8FCD) are NOT in ICU4J's commonChars list (which contains hiragana/katakana),
 * so commonCharCount=0 and confidence stays at 10 regardless. ICU4J fails at
 * both raw and padded lengths for this specific pair of kanji.
 * </p>
 * <p>
 * Universal (juniversalchardet) uses byte n-gram frequency profiles. At 9 bytes
 * the sample is too small to build a reliable distribution. At 100 bytes (~11
 * repetitions of the same 9 bytes), the n-gram frequency profile becomes large
 * enough that Universal correctly identifies Shift-JIS. This is why the original
 * ZipParser tiled short byte arrays to 100 bytes before calling the encoding
 * detector — it was load-bearing for Universal, but a hack that confused
 * Mojibuster's byte n-gram model with artificial repetition patterns.
 * </p>
 * <p>
 * The correct solution (implemented) is structural CJK grammar validation in
 * MojibusterEncodingDetector.refineCjkResults: 0x95B6 and 0x8FCD parse as
 * exactly 2 valid Shift-JIS double-byte sequences with zero parse errors, which
 * is conclusive structural evidence independent of sample size or language models.
 * See also: ZipFilenameDetectionTest in tika-encoding-detector-mojibuster.
 * </p>
 */
public class SjisLangSignalTest {

    /** Raw bytes of the zip entry name "文章1.txt" encoded as Shift-JIS. */
    private static final byte[] RAW =
            {(byte) 0x95, (byte) 0xb6, (byte) 0x8f, (byte) 0xcd,
             0x31, 0x2e, 0x74, 0x78, 0x74};

    /** Names of the charsets under which the probe bytes are decoded and compared. */
    private static final String[] CHARSETS = {
            "Shift_JIS", "windows-1257", "GB18030", "EUC-JP",
            "windows-1252", "windows-1250", "ISO-8859-2"
    };

    /**
     * Prints a diagnostic report to standard output; makes no assertions.
     * <p>
     * For the raw entry-name bytes and for copies tiled to 100 and 300 bytes, the
     * report shows: each candidate decoding with its junk ratio, the CharSoup top
     * language per candidate (candidates above 10% junk are skipped), the OpenNLP
     * top-3 results per candidate, and the charset chosen by
     * {@code CharSoupLanguageDetector.compareLanguageSignal}.
     * </p>
     *
     * @throws Exception if any of the detectors fails
     */
    @Disabled("Diagnostic only \u2014 run manually to evaluate a new language detector "
            + "against SJIS zip filenames")
    @Test
    public void diagnoseLanguageSignals() throws Exception {
        System.out.println("=== Language detector signals for SJIS zip entry name ===");
        System.out.printf(Locale.ROOT, "Raw bytes (%d): %s%n%n", RAW.length, hexDump(RAW));

        for (int padTo : new int[]{RAW.length, 100, 300}) {
            byte[] probe = tile(RAW, padTo);
            System.out.printf(Locale.ROOT,
                    "======== probe length = %d bytes (tiled x%.1f) ========%n",
                    probe.length, (double) probe.length / RAW.length);

            Map<Charset, String> candidates = decodeCandidates(probe);

            // --- table of decodings -------------------------------------------------
            System.out.printf(Locale.ROOT, "%-20s  %-35s  %-9s%n",
                    "Charset", "Decoded (first 30 chars)", "JunkRatio");
            System.out.println("-".repeat(70));
            for (Map.Entry<Charset, String> e : candidates.entrySet()) {
                String decoded = e.getValue();
                String preview = decoded.length() > 30
                        ? decoded.substring(0, 30) + "\u2026" : decoded;
                System.out.printf(Locale.ROOT, "%-20s  %-35s  %.3f%n",
                        e.getKey().name(), preview, computeJunkRatio(decoded));
            }
            System.out.println();

            // --- CharSoup, one detector instance per candidate decoding ------------
            System.out.println("  CharSoup top-lang:");
            for (Map.Entry<Charset, String> e : candidates.entrySet()) {
                String charsetName = e.getKey().name();
                String decoded = e.getValue();
                if (computeJunkRatio(decoded) > 0.10f) {
                    System.out.printf(Locale.ROOT, "    %-20s: skipped (junk > 10%%)%n",
                            charsetName);
                    continue;
                }
                CharSoupLanguageDetector detector = new CharSoupLanguageDetector();
                detector.addText(decoded.toCharArray(), 0, decoded.length());
                List<LanguageResult> results = detector.detectAll();
                String topLang = results.isEmpty() ? "?" : results.get(0).getLanguage();
                // The value printed under the "conf=" label is the top result's raw score.
                float rawScore = results.isEmpty() ? 0f : results.get(0).getRawScore();
                System.out.printf(Locale.ROOT,
                        "    %-20s: conf=%7.4f  topLang=%s%n",
                        charsetName, rawScore, topLang);
            }
            System.out.println();

            // --- OpenNLP (no junk filter is applied here) --------------------------
            System.out.println("  OpenNLP top-3:");
            for (Map.Entry<Charset, String> e : candidates.entrySet()) {
                String decoded = e.getValue();
                OpenNLPDetector detector = new OpenNLPDetector();
                detector.addText(decoded.toCharArray(), 0, decoded.length());
                List<LanguageResult> results = detector.detectAll();
                System.out.printf(Locale.ROOT, "    %-20s: %s%n",
                        e.getKey().name(), results.subList(0, Math.min(3, results.size())));
            }
            System.out.println();

            // --- CharSoup's own arbitration across all candidates ------------------
            CharSoupLanguageDetector arbiter = new CharSoupLanguageDetector();
            Charset winner = arbiter.compareLanguageSignal(candidates);
            System.out.printf(Locale.ROOT,
                    "  compareLanguageSignal winner: %s%n%n",
                    winner != null ? winner + " -> \"" + candidates.get(winner) + "\"" :
                            "(inconclusive \u2014 all below threshold)");
        }
    }

    /**
     * Decodes {@code probe} under every charset in {@link #CHARSETS} that this JVM
     * supports.
     *
     * @param probe bytes to decode
     * @return decoded strings keyed by charset, in {@link #CHARSETS} order
     */
    private static Map<Charset, String> decodeCandidates(byte[] probe) {
        Map<Charset, String> candidates = new LinkedHashMap<>();
        for (String name : CHARSETS) {
            try {
                Charset cs = Charset.forName(name);
                candidates.put(cs, new String(probe, cs));
            } catch (IllegalArgumentException ignored) {
                // Charset.forName signals an illegal or unsupported charset name with
                // subclasses of IllegalArgumentException; such candidates are skipped.
            }
        }
        return candidates;
    }

    /**
     * Repeats {@code src} cyclically until the result is {@code targetLen} bytes long.
     *
     * @param src       bytes to repeat
     * @param targetLen desired length of the result
     * @return {@code src} itself (not a copy) when it is empty or already at least
     *         {@code targetLen} bytes long; otherwise a new array of exactly
     *         {@code targetLen} bytes
     */
    private static byte[] tile(byte[] src, int targetLen) {
        // An empty source cannot be tiled; returning it avoids a division by zero.
        if (src.length == 0 || src.length >= targetLen) {
            return src;
        }
        byte[] out = new byte[targetLen];
        for (int off = 0; off < targetLen; off += src.length) {
            System.arraycopy(src, 0, out, off, Math.min(src.length, targetLen - off));
        }
        return out;
    }

    /**
     * Computes the fraction of code points in {@code text} that are U+FFFD
     * (the replacement character) or ISO control characters.
     *
     * @param text decoded text, may be {@code null}
     * @return junk code points divided by total code points; {@code 0} for
     *         {@code null} or empty input
     */
    private static float computeJunkRatio(String text) {
        if (text == null || text.isEmpty()) {
            return 0f;
        }
        long junk = text.codePoints()
                .filter(cp -> cp == 0xFFFD || Character.isISOControl(cp))
                .count();
        // text is non-empty here, so the code point count is at least 1.
        int total = text.codePointCount(0, text.length());
        return (float) junk / total;
    }

    /**
     * Prints the top-5 CharSoup results for a fixed English sentence under the
     * default configuration, under the forced SHORT_TEXT strategy (only when
     * {@code CharSoupLanguageDetector.SHORT_TEXT_MODEL} is non-null), and under
     * the forced STANDARD strategy. Makes no assertions.
     * <p>
     * NOTE(review): unlike {@link #diagnoseLanguageSignals()}, this print-only
     * test is not {@code @Disabled} and so runs with the regular suite; confirm
     * that is intended.
     * </p>
     */
    @Test
    public void debugModelRoutingForShortEnglish() {
        String text = "the quick brown fox jumped the lazy dog elephant elephant elephant bear bear";
        System.out.printf(Locale.ROOT,
                "%ntext length=%d  SHORT_TEXT_LENGTH_THRESHOLD=%d  SHORT_TEXT_MODEL_loaded=%b%n",
                text.length(),
                CharSoupLanguageDetector.SHORT_TEXT_LENGTH_THRESHOLD,
                CharSoupLanguageDetector.SHORT_TEXT_MODEL != null);

        System.out.println("--- AUTOMATIC (default) ---");
        printTop5(text, new CharSoupLanguageDetector());

        if (CharSoupLanguageDetector.SHORT_TEXT_MODEL != null) {
            System.out.println("--- forced SHORT_TEXT ---");
            CharSoupDetectorConfig shortCfg = CharSoupDetectorConfig.fromMap(
                    Map.of("strategy", "SHORT_TEXT"));
            printTop5(text, new CharSoupLanguageDetector(shortCfg));
        }

        System.out.println("--- forced STANDARD (v7) ---");
        CharSoupDetectorConfig stdCfg = CharSoupDetectorConfig.fromMap(
                Map.of("strategy", "STANDARD"));
        printTop5(text, new CharSoupLanguageDetector(stdCfg));
    }

    /**
     * Feeds {@code text} to {@code d} and prints the first five results, followed
     * by the rank and raw score of a fixed set of languages of interest wherever
     * they appear in the full result list.
     *
     * @param text text to detect
     * @param d    detector to run; text is added to it by this call
     */
    private static void printTop5(String text, CharSoupLanguageDetector d) {
        d.addText(text);
        List<LanguageResult> results = d.detectAll();
        int shown = Math.min(5, results.size());
        for (int i = 0; i < shown; i++) {
            LanguageResult r = results.get(i);
            System.out.printf(Locale.ROOT, "  %d: %-8s raw=%.4f conf=%s%n",
                    i + 1, r.getLanguage(), r.getRawScore(), r.getConfidence());
        }
        // find where key languages rank (first occurrence only)
        for (String lang : new String[]{"eng", "nld", "afr", "deu", "xho", "sna"}) {
            for (int i = 0; i < results.size(); i++) {
                LanguageResult r = results.get(i);
                if (lang.equals(r.getLanguage())) {
                    System.out.printf(Locale.ROOT,
                            "  [%s rank=%d raw=%.4f]%n", lang, i + 1, r.getRawScore());
                    break;
                }
            }
        }
    }

    /**
     * Formats bytes as a bracketed, comma-separated list of two-digit lowercase
     * hex values, e.g. {@code [0x95, 0xb6]}.
     *
     * @param bytes bytes to format
     * @return the formatted list; {@code []} for an empty array
     */
    private static String hexDump(byte[] bytes) {
        StringBuilder sb = new StringBuilder("[");
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                sb.append(", ");
            }
            // Mask to 0..255 so bytes >= 0x80 are not sign-extended.
            sb.append(String.format(Locale.ROOT, "0x%02x", bytes[i] & 0xff));
        }
        return sb.append("]").toString();
    }
}