// CharSoupFeatureExtractorTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

public class CharSoupFeatureExtractorTest {

    private static final int NUM_BUCKETS = 1024;

    @Test
    public void testNullAndEmpty() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract(null);
        assertEquals(NUM_BUCKETS, counts.length);
        assertEquals(0, sum(counts));

        counts = ext.extract("");
        assertEquals(0, sum(counts));
    }

    @Test
    public void testHelloWorld() {
        // "Hello world" ��� lowercase ��� "hello world"
        // bigrams: _h, he, el, ll, lo, o_, _w, wo, or, rl, ld, d_
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract("Hello world");
        assertEquals(12, sum(counts));
    }

    @Test
    public void testCJK() {
        // "������������" ��� bigrams: _���, ������, ������, ������, ���_
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract("������������");
        assertEquals(5, sum(counts));
    }

    @Test
    public void testSentinels() {
        // Single letter "a" ��� _a, a_ = 2 bigrams
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract("a");
        assertEquals(2, sum(counts));
    }

    @Test
    public void testDigitsAndPunctuation() {
        // "ab 12 cd" ��� letters: a,b then c,d (separated by non-letters)
        // bigrams: _a, ab, b_, _c, cd, d_ = 6
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract("ab 12 cd");
        assertEquals(6, sum(counts));
    }

    @Test
    public void testCaseFolding() {
        // "ABC" and "abc" should produce identical features
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] upper = ext.extract("ABC");
        int[] lower = ext.extract("abc");
        assertArrayEquals(upper, lower);
    }

    @Test
    public void testNFCNormalization() {
        // �� as composed (U+00E9) vs decomposed (e + U+0301) should produce same features
        String composed = "\u00e9";        // �� (NFC)
        String decomposed = "e\u0301";     // e + combining acute (NFD)
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] countsC = ext.extract(composed);
        int[] countsD = ext.extract(decomposed);
        assertArrayEquals(countsC, countsD);
    }

    @Test
    public void testURLStripping() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] withUrl = ext.extract("hello https://example.com/path world");
        int[] withoutUrl = ext.extract("hello  world");
        assertArrayEquals(withUrl, withoutUrl);
    }

    @Test
    public void testEmailStripping() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] withEmail = ext.extract("hello user@example.com world");
        int[] withoutEmail = ext.extract("hello  world");
        assertArrayEquals(withEmail, withoutEmail);
    }

    @Test
    public void testSurrogatePairs() {
        // ���� (U+1D400, Mathematical Bold Capital A) ��� supplementary char
        String text = "\uD835\uDC00"; // U+1D400
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract(text);
        // Single letter ��� _X, X_ = 2 bigrams
        assertEquals(2, sum(counts));
    }

    @Test
    public void testFNVDeterminism() {
        // Same input ��� same hash, always
        int h1 = CharSoupFeatureExtractor.hashBigram('a', 'b');
        int h2 = CharSoupFeatureExtractor.hashBigram('a', 'b');
        assertEquals(h1, h2);

        // Different input ��� different hash (with overwhelming probability)
        int h3 = CharSoupFeatureExtractor.hashBigram('b', 'a');
        assertNotEquals(h1, h3);
    }

    @Test
    public void testFNVDistribution() {
        // Check that FNV-1a distributes across buckets reasonably
        int numBuckets = 256;
        int[] bucketHits = new int[numBuckets];
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(numBuckets);
        for (int cp1 = 'a'; cp1 <= 'z'; cp1++) {
            for (int cp2 = 'a'; cp2 <= 'z'; cp2++) {
                int bucket = ext.bucketIndex(cp1, cp2);
                assertTrue(bucket >= 0 && bucket < numBuckets);
                bucketHits[bucket]++;
            }
        }
        // 676 bigrams across 256 buckets ��� average ~2.6 per bucket
        // At least 200 buckets should be hit (no extreme clustering)
        int nonEmpty = 0;
        for (int h : bucketHits) {
            if (h > 0) nonEmpty++;
        }
        assertTrue(nonEmpty > 200, "FNV should distribute well: " + nonEmpty + " buckets hit");
    }

    @Test
    public void testTruncation() {
        // Text longer than MAX_TEXT_LENGTH should be truncated
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < CharSoupFeatureExtractor.MAX_TEXT_LENGTH + 1000; i++) {
            sb.append('a');
        }
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // Should not throw
        int[] counts = ext.extract(sb.toString());
        assertTrue(sum(counts) > 0);
    }

    @Test
    public void testOnlyNonLetters() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        int[] counts = ext.extract("12345 !@#$%");
        assertEquals(0, sum(counts));
    }

    // --- Transparent character tests ---

    @Test
    public void testArabicDiacriticsTransparent() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // ������������ (kaf + fatha + ta + fatha + ba + fatha)
        String withDiacritics = "\u0643\u064E\u062A\u064E\u0628\u064E";
        // ������ (kaf + ta + ba) ��� same word, no diacritics
        String withoutDiacritics = "\u0643\u062A\u0628";
        int[] countsWith = ext.extract(withDiacritics);
        int[] countsWithout = ext.extract(withoutDiacritics);
        assertArrayEquals(countsWithout, countsWith,
                "Arabic diacritics should be transparent ��� bigrams must be identical");
    }

    @Test
    public void testArabicShadda() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // ���������� (ain + lam + shadda + fatha + mim)
        String withShadda = "\u0639\u0644\u0651\u064E\u0645";
        // ������ (ain + lam + mim)
        String withoutShadda = "\u0639\u0644\u0645";
        int[] countsWith = ext.extract(withShadda);
        int[] countsWithout = ext.extract(withoutShadda);
        assertArrayEquals(countsWithout, countsWith,
                "Shadda (gemination mark) should be transparent");
    }

    @Test
    public void testArabicTatweel() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // ���������� (kaf + tatweel + ta + tatweel + ba)
        String withTatweel = "\u0643\u0640\u062A\u0640\u0628";
        // ������ (kaf + ta + ba)
        String withoutTatweel = "\u0643\u062A\u0628";
        int[] countsWith = ext.extract(withTatweel);
        int[] countsWithout = ext.extract(withoutTatweel);
        assertArrayEquals(countsWithout, countsWith,
                "Tatweel (kashida) should be transparent ��� bigrams must be identical");
    }

    @Test
    public void testHebrewNiqqud() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // �������������� (shin + qamats + lamed + vav + holam + mem)
        String withNiqqud = "\u05E9\u05B8\u05C1\u05DC\u05D5\u05B9\u05DD";
        // �������� (shin + lamed + vav + mem) ��� no niqqud
        String withoutNiqqud = "\u05E9\u05DC\u05D5\u05DD";
        int[] countsWith = ext.extract(withNiqqud);
        int[] countsWithout = ext.extract(withoutNiqqud);
        assertArrayEquals(countsWithout, countsWith,
                "Hebrew niqqud should be transparent ��� bigrams must be identical");
    }

    @Test
    public void testZWNJTransparent() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // Persian: ����������������� (mi + ZWNJ + khaaham) ��� ZWNJ is not a word boundary
        String withZWNJ = "\u0645\u06CC\u200C\u062E\u0648\u0627\u0647\u0645";
        // Same without ZWNJ
        String withoutZWNJ = "\u0645\u06CC\u062E\u0648\u0627\u0647\u0645";
        int[] countsWith = ext.extract(withZWNJ);
        int[] countsWithout = ext.extract(withoutZWNJ);
        assertArrayEquals(countsWithout, countsWith,
                "ZWNJ should be transparent ��� bigrams must span across it");
    }

    @Test
    public void testArabicVsArabicReversed() {
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        // ������ ��� bigrams: _��, ����, ����, ��_
        String normal = "\u0643\u062A\u0628";
        // ������ (reversed) ��� bigrams: _��, ����, ����, ��_
        String reversed = "\u0628\u062A\u0643";
        int[] countsNormal = ext.extract(normal);
        int[] countsReversed = ext.extract(reversed);
        // These must differ ��� reversed text produces different bigram distributions
        assertNotEquals(sum(countsNormal), 0);
        assertNotEquals(sum(countsReversed), 0);
        // Bigram counts should be equal (same number of bigrams) but in different buckets
        assertEquals(sum(countsNormal), sum(countsReversed));
        boolean differ = false;
        for (int j = 0; j < NUM_BUCKETS; j++) {
            if (countsNormal[j] != countsReversed[j]) {
                differ = true;
                break;
            }
        }
        assertTrue(differ, "Normal and reversed Arabic must produce different bigram distributions");
    }

    @Test
    public void testIsTransparent() {
        // Arabic harakat
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x064E)); // fatha
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x064F)); // damma
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x0650)); // kasra
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x0651)); // shadda
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x0652)); // sukun
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x0670)); // superscript alef
        // Hebrew niqqud
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x05B0)); // sheva
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x05B4)); // hiriq
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x05C1)); // shin dot
        // Special characters
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x0640)); // tatweel
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x200C)); // ZWNJ
        assertTrue(CharSoupFeatureExtractor.isTransparent(0x200D)); // ZWJ
        // Not transparent
        assertFalse(CharSoupFeatureExtractor.isTransparent('a'));
        assertFalse(CharSoupFeatureExtractor.isTransparent('5'));
        assertFalse(CharSoupFeatureExtractor.isTransparent(' '));
        assertFalse(CharSoupFeatureExtractor.isTransparent(0x0643)); // Arabic kaf ��� a letter
    }

    // ---- Trigram tests ----

    @Test
    public void testTrigramsProduceMoreFeatures() {
        // "hello" ��� bigrams: _h, he, el, ll, lo, o_ = 6
        //         + trigrams: _he, hel, ell, llo, lo_, = 5
        //         = 11 total
        CharSoupFeatureExtractor biOnly = new CharSoupFeatureExtractor(NUM_BUCKETS, false);
        CharSoupFeatureExtractor biTri = new CharSoupFeatureExtractor(NUM_BUCKETS, true);

        int[] countsBi = biOnly.extract("hello");
        int[] countsBiTri = biTri.extract("hello");

        assertEquals(6, sum(countsBi));
        assertEquals(11, sum(countsBiTri));
    }

    @Test
    public void testTrigramsSingleWord() {
        // "ab" ��� bigrams: _a, ab, b_ = 3
        //       + trigrams: _ab, ab_ = 2
        //       = 5 total
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS, true);
        int[] counts = ext.extract("ab");
        assertEquals(5, sum(counts));
    }

    @Test
    public void testTrigramsSingleChar() {
        // "a" ��� bigrams: _a, a_ = 2
        //       trigrams: _a_ would need prevPrevCp set... let's check
        //       After _a: prevPrevCp=SENTINEL, prevCp=a. End-of-text trigram: (SENTINEL,a,SENTINEL)
        //       = 1 trigram
        //       = 3 total
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS, true);
        int[] counts = ext.extract("a");
        assertEquals(3, sum(counts));
    }

    @Test
    public void testTrigramsTwoWords() {
        // "ab cd" ��� bigrams: _a, ab, b_, _c, cd, d_ = 6
        //         + trigrams: _ab, ab_, _cd, cd_ = 4
        //         = 10 total
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS, true);
        int[] counts = ext.extract("ab cd");
        assertEquals(10, sum(counts));
    }

    @Test
    public void testTrigramsNoLetters() {
        // No letters ��� 0 features regardless of trigram mode
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS, true);
        int[] counts = ext.extract("12345");
        assertEquals(0, sum(counts));
    }

    @Test
    public void testTrigramsDisabledByDefault() {
        // Verify default constructor doesn't include trigrams
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        assertFalse(ext.isIncludeTrigrams());

        CharSoupFeatureExtractor extTri = new CharSoupFeatureExtractor(NUM_BUCKETS, true);
        assertTrue(extTri.isIncludeTrigrams());
    }

    @Test
    public void testTrigramHashesDifferFromBigrams() {
        // Verify that hashTrigram produces different values than hashBigram
        // for overlapping inputs (avoids systematic collisions)
        int biHash = CharSoupFeatureExtractor.hashBigram('a', 'b');
        int triHash = CharSoupFeatureExtractor.hashTrigram('a', 'b', 'c');
        assertNotEquals(biHash, triHash);
    }

    // ---- Accumulation tests (clear=false mode for training loops) ----

    @Test
    public void testAccumulateNoClear() {
        // When clear=false, extractFromPreprocessed should add to existing counts
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        String preprocessed = CharSoupFeatureExtractor.preprocess("hello world");

        int[] counts = new int[NUM_BUCKETS];
        ext.extractFromPreprocessed(preprocessed, counts, false);
        int firstSum = sum(counts);
        assertEquals(12, firstSum);

        // Extract again without clearing ��� counts should double
        ext.extractFromPreprocessed(preprocessed, counts, false);
        assertEquals(24, sum(counts));
    }

    @Test
    public void testAccumulateWithClear() {
        // When clear=true, extractFromPreprocessed should zero before extracting
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        String preprocessed = CharSoupFeatureExtractor.preprocess("hello world");

        int[] counts = new int[NUM_BUCKETS];
        ext.extractFromPreprocessed(preprocessed, counts, false);
        assertEquals(12, sum(counts));

        // Extract with clear ��� should be back to 12, not 24
        ext.extractFromPreprocessed(preprocessed, counts, true);
        assertEquals(12, sum(counts));
    }

    // ---- Growing-prefix invariant test ----

    @Test
    public void testGrowingPrefixProducesMoreFeatures() {
        // A larger prefix of text should produce >= bigram features than a
        // smaller prefix. This monotonicity property ensures that longer
        // text chunks always give the model at least as much signal.
        CharSoupFeatureExtractor ext = new CharSoupFeatureExtractor(NUM_BUCKETS);
        String text = "The quick brown fox jumps over the lazy dog near the river";
        String preprocessed = CharSoupFeatureExtractor.preprocess(text);

        int prevSum = 0;
        for (int prefixLen = 10; prefixLen <= preprocessed.length();
                prefixLen = Math.min(prefixLen * 2, preprocessed.length())) {
            String prefix = preprocessed.substring(0, prefixLen);
            int[] counts = ext.extractFromPreprocessed(prefix);
            int currentSum = sum(counts);
            assertTrue(currentSum >= prevSum,
                    "Prefix of " + prefixLen + " chars should produce >= features "
                    + "than shorter prefix: " + currentSum + " vs " + prevSum);
            prevSum = currentSum;
            if (prefixLen == preprocessed.length()) {
                break;
            }
        }
    }

    @Test
    public void testPreprocessIdempotent() {
        // Preprocessing the same text twice must produce identical output.
        // Idempotency is critical for deterministic detection results.
        String raw = "Hello world! Visit https://example.com for more.";
        String pp1 = CharSoupFeatureExtractor.preprocess(raw.substring(0, 20));
        String pp2 = CharSoupFeatureExtractor.preprocess(raw.substring(0, 20));
        assertEquals(pp1, pp2, "Preprocessing must be idempotent");
    }

    private int sum(int[] arr) {
        int s = 0;
        for (int v : arr) s += v;
        return s;
    }
}