// WordTokenizerTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.ArrayList;
import java.util.List;

import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link WordTokenizer}.
 *
 * <p>Covers the letters-only {@code tokenize} entry point and the
 * {@code tokenizeAlphanumeric} variant that also emits digit runs, plus the
 * handling of "transparent" characters (combining marks, tatweel, ZWNJ).
 *
 * <p>All non-ASCII test data is written with Unicode escapes so the tests do
 * not depend on the encoding used to read or compile this source file.
 */
public class WordTokenizerTest {

    /** Four Han ideographs: U+4E2D U+6587 U+5206 U+8BCD. */
    private static final String CJK_FOUR = "\u4E2D\u6587\u5206\u8BCD";

    /** First two ideographs of {@link #CJK_FOUR}: U+4E2D U+6587. */
    private static final String CJK_FIRST_PAIR = "\u4E2D\u6587";

    /** Middle two ideographs of {@link #CJK_FOUR}: U+6587 U+5206. */
    private static final String CJK_MIDDLE_PAIR = "\u6587\u5206";

    /** Last two ideographs of {@link #CJK_FOUR}: U+5206 U+8BCD. */
    private static final String CJK_LAST_PAIR = "\u5206\u8BCD";

    /** Arabic base letters kaf, teh, beh with no diacritics or tatweel. */
    private static final String ARABIC_KTB = "\u0643\u062A\u0628";

    /** Whitespace separates words and tokens are lower-cased. */
    @Test
    public void testSimpleEnglish() {
        List<String> tokens = WordTokenizer.tokenize("Hello World");
        assertEquals(List.of("hello", "world"), tokens);
    }

    /** Punctuation, including an intra-word hyphen, acts as a word boundary. */
    @Test
    public void testPunctuationSplitting() {
        List<String> tokens = WordTokenizer.tokenize("hello, world! foo-bar");
        assertEquals(List.of("hello", "world", "foo", "bar"), tokens);
    }

    /** Upper-case input is folded to lower case. */
    @Test
    public void testCaseFolding() {
        List<String> tokens = WordTokenizer.tokenize("ABC DEF");
        assertEquals(List.of("abc", "def"), tokens);
    }

    /** A run of four ideographs yields its three overlapping bigrams, in order. */
    @Test
    public void testCJKBigrams() {
        List<String> tokens = WordTokenizer.tokenize(CJK_FOUR);
        assertEquals(List.of(CJK_FIRST_PAIR, CJK_MIDDLE_PAIR, CJK_LAST_PAIR), tokens);
    }

    /** Alphabetic words and an embedded ideograph pair are tokenized independently. */
    @Test
    public void testMixedAlphaAndCJK() {
        // "hello" + two ideographs + "world" -> "hello", one bigram, "world"
        List<String> tokens = WordTokenizer.tokenize("hello" + CJK_FIRST_PAIR + "world");
        assertEquals(List.of("hello", CJK_FIRST_PAIR, "world"), tokens);
    }

    /** Null and empty input both produce an empty token list rather than failing. */
    @Test
    public void testNullAndEmpty() {
        assertTrue(WordTokenizer.tokenize(null).isEmpty());
        assertTrue(WordTokenizer.tokenize("").isEmpty());
    }

    /** A URL in the text contributes no tokens. */
    @Test
    public void testURLStripping() {
        List<String> tokens = WordTokenizer.tokenize("visit https://example.com/page today");
        assertEquals(List.of("visit", "today"), tokens);
    }

    /** Composed and decomposed forms of the same text tokenize identically. */
    @Test
    public void testNFCNormalization() {
        // e-acute as a single code point (U+00E9) vs. "e" + combining acute (U+0301)
        String composed = "caf\u00e9";
        String decomposed = "cafe\u0301";
        assertEquals(WordTokenizer.tokenize(composed), WordTokenizer.tokenize(decomposed));
    }

    /** A lone ideograph has no neighbour to pair with, so no bigram is emitted. */
    @Test
    public void testSingleCJKChar() {
        List<String> tokens = WordTokenizer.tokenize("\u4E2D");
        assertTrue(tokens.isEmpty());
    }

    /** The letters-only tokenizer drops digit runs. */
    @Test
    public void testDigitsIgnored() {
        List<String> tokens = WordTokenizer.tokenize("abc 123 def");
        assertEquals(List.of("abc", "def"), tokens);
    }

    // --- tokenizeAlphanumeric tests ---

    /** The alphanumeric tokenizer emits digit runs as tokens of their own. */
    @Test
    public void testAlphanumericIncludesNumbers() {
        List<String> tokens = new ArrayList<>();
        WordTokenizer.tokenizeAlphanumeric("abc 123 def", tokens::add);
        assertEquals(List.of("abc", "123", "def"), tokens);
    }

    /** A digit run inside a word splits it into letter, digit and letter tokens. */
    @Test
    public void testAlphanumericMixed() {
        List<String> tokens = new ArrayList<>();
        WordTokenizer.tokenizeAlphanumeric("hello42world", tokens::add);
        // "hello", "42", "world" - the digit run breaks the word
        assertEquals(List.of("hello", "42", "world"), tokens);
    }

    /** Punctuation and currency symbols are dropped; words and numbers survive. */
    @Test
    public void testAlphanumericPunctuation() {
        List<String> tokens = new ArrayList<>();
        WordTokenizer.tokenizeAlphanumeric("price: $99, qty 5!", tokens::add);
        assertEquals(List.of("price", "99", "qty", "5"), tokens);
    }

    /** Ideographic bigrams and digit runs coexist in alphanumeric mode. */
    @Test
    public void testAlphanumericCJK() {
        // Two ideographs, "123", two ideographs -> bigram, "123", bigram.
        // No bigram is formed across the digit boundary.
        List<String> tokens = new ArrayList<>();
        WordTokenizer.tokenizeAlphanumeric(CJK_FIRST_PAIR + "123" + CJK_LAST_PAIR, tokens::add);
        assertEquals(List.of(CJK_FIRST_PAIR, "123", CJK_LAST_PAIR), tokens);
    }

    /** Null and empty input emit nothing to the consumer in alphanumeric mode. */
    @Test
    public void testAlphanumericNullEmpty() {
        List<String> tokens = new ArrayList<>();
        WordTokenizer.tokenizeAlphanumeric(null, tokens::add);
        assertTrue(tokens.isEmpty());
        WordTokenizer.tokenizeAlphanumeric("", tokens::add);
        assertTrue(tokens.isEmpty());
    }

    // --- Transparent character tests ---

    /** Arabic short-vowel marks do not change the tokens produced. */
    @Test
    public void testArabicDiacriticsStripped() {
        // kaf, teh, beh each followed by fatha (U+064E) vs. the bare letters
        String withDiacritics = "\u0643\u064E\u062A\u064E\u0628\u064E";
        assertEquals(WordTokenizer.tokenize(ARABIC_KTB),
                WordTokenizer.tokenize(withDiacritics));
    }

    /** Arabic tatweel (U+0640, the elongation character) does not change the tokens. */
    @Test
    public void testArabicTatweelStripped() {
        // kaf, tatweel, teh, tatweel, beh vs. the bare letters
        String withTatweel = "\u0643\u0640\u062A\u0640\u0628";
        assertEquals(WordTokenizer.tokenize(ARABIC_KTB),
                WordTokenizer.tokenize(withTatweel));
    }

    /** A zero-width non-joiner inside a word must not split it into two tokens. */
    @Test
    public void testZWNJKeepsWordTogether() {
        // Persian word containing U+200C (ZWNJ) after its first two letters
        String withZWNJ = "\u0645\u06CC\u200C\u062E\u0648\u0627\u0647\u0645";
        List<String> tokens = WordTokenizer.tokenize(withZWNJ);
        assertEquals(1, tokens.size(), "ZWNJ should not be a word boundary");
    }

    /** Hebrew niqqud (vowel points) do not change the tokens produced. */
    @Test
    public void testHebrewNiqqudStripped() {
        // shin, lamed, vav, final mem with points U+05B8, U+05C1, U+05B9 vs. the bare letters
        String withNiqqud = "\u05E9\u05B8\u05C1\u05DC\u05D5\u05B9\u05DD";
        String withoutNiqqud = "\u05E9\u05DC\u05D5\u05DD";
        assertEquals(WordTokenizer.tokenize(withoutNiqqud),
                WordTokenizer.tokenize(withNiqqud));
    }

    /** Diacritic transparency also applies in alphanumeric mode. */
    @Test
    public void testAlphanumericArabicDiacritics() {
        List<String> tokens = new ArrayList<>();
        // Arabic word ending in fatha (U+064E), then a number
        WordTokenizer.tokenizeAlphanumeric(ARABIC_KTB + "\u064E 123", tokens::add);
        // Only the base letters are emitted for the word; the digits follow as their own token.
        assertEquals(List.of(ARABIC_KTB, "123"), tokens);
    }
}