// WordTokenizer.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
/**
* General-purpose word tokenizer that shares the same preprocessing pipeline
* as {@link CharSoupFeatureExtractor}: NFC normalization, URL/email stripping,
* case folding via {@link Character#toLowerCase(int)}.
* <p>
* This tokenizer is designed to replace Lucene's analyzer pipeline in tika-eval.
* It handles both alphabetic and ideographic scripts:
* <ul>
* <li><b>Alphabetic scripts</b>: accumulates letters into words, emits
* on word boundary (non-letter codepoint)</li>
* <li><b>Ideographic characters</b>: emits character bigrams (pairs of adjacent
* ideographic characters), equivalent to Lucene's CJKBigramFilter</li>
* </ul>
* <p>
* Mixed runs (e.g., alphabetic followed by ideographic) are handled correctly:
* the alphabetic word is emitted at the boundary, then ideographic bigrams begin.
*/
public class WordTokenizer {

    /** Static utility class; never instantiated. */
    private WordTokenizer() {
    }

    /**
     * Tokenize raw text with the full preprocessing pipeline (truncation,
     * URL/email stripping, NFC normalization, case folding) and collect the
     * tokens into a list. Numeric runs are not emitted.
     *
     * @param rawText raw input text
     * @return token list: lowercased words for alphabetic scripts, character
     *         bigrams for ideographic runs
     */
    public static List<String> tokenize(String rawText) {
        List<String> tokens = new ArrayList<>();
        tokenize(rawText, tokens::add);
        return tokens;
    }

    /**
     * Tokenize raw text with full preprocessing, pushing each token to the
     * given consumer. Numeric runs are not emitted.
     *
     * @param rawText raw input text
     * @param consumer receives each token as it is produced
     */
    public static void tokenize(String rawText, Consumer<String> consumer) {
        if (rawText != null && !rawText.isEmpty()) {
            tokenizePreprocessed(CharSoupFeatureExtractor.preprocess(rawText), consumer);
        }
    }

    /**
     * Tokenize raw text with full preprocessing, additionally emitting
     * digit-only runs as tokens. Alphabetic words and digit runs are always
     * separate tokens; ideographic runs still produce character bigrams.
     *
     * @param rawText raw input text
     * @param consumer receives each token as it is produced
     */
    public static void tokenizeAlphanumeric(String rawText, Consumer<String> consumer) {
        if (rawText != null && !rawText.isEmpty()) {
            tokenizePreprocessedAlphanumeric(CharSoupFeatureExtractor.preprocess(rawText), consumer);
        }
    }

    /**
     * Tokenize text that has already been preprocessed (NFC normalized,
     * URLs/emails stripped). Emits only alphabetic words and ideographic
     * bigrams — digits act as word boundaries here.
     * <p>
     * Transparent codepoints (Arabic harakat, Hebrew niqqud, tatweel, ZWNJ,
     * ZWJ — see {@link CharSoupFeatureExtractor#isTransparent(int)}) are
     * skipped so base letters stay contiguous within one word.
     * <p>
     * A lone ideograph with no adjacent ideographic neighbor produces no
     * token: bigrams require a pair, mirroring Lucene's CJKBigramFilter.
     *
     * @param text preprocessed text
     * @param consumer receives each token
     */
    static void tokenizePreprocessed(String text, Consumer<String> consumer) {
        StringBuilder word = new StringBuilder();
        // Most recent ideographic codepoint (already case-folded), or -1 if the
        // previous codepoint did not continue an ideographic run.
        int lastIdeograph = -1;
        int pos = 0;
        final int n = text.length();
        while (pos < n) {
            final int cp = text.codePointAt(pos);
            pos += Character.charCount(cp);
            // Diacritics, tatweel, ZWNJ, ZWJ: invisible to tokenization
            if (cp >= 0x0300 && CharSoupFeatureExtractor.isTransparent(cp)) {
                continue;
            }
            if (!Character.isLetter(cp)) {
                // Any non-letter ends the current word and breaks bigram chains
                flush(word, consumer);
                lastIdeograph = -1;
                continue;
            }
            final int folded = Character.toLowerCase(cp);
            if (Character.isIdeographic(cp)) {
                // An ideograph terminates any pending alphabetic word...
                flush(word, consumer);
                // ...and pairs with its ideographic predecessor, if any
                if (lastIdeograph >= 0) {
                    consumer.accept(new String(new int[]{lastIdeograph, folded}, 0, 2));
                }
                lastIdeograph = folded;
            } else {
                lastIdeograph = -1;
                word.appendCodePoint(folded);
            }
        }
        // Emit a word that ran to end-of-text; a trailing unpaired ideograph
        // intentionally emits nothing.
        flush(word, consumer);
    }

    /**
     * Tokenize already-preprocessed text, emitting digit-only runs in
     * addition to alphabetic words and ideographic bigrams. Kept as a
     * separate code path so the alpha-only hot path used by language
     * detection pays nothing for the digit check.
     * <p>
     * Transparent codepoints are skipped exactly as in
     * {@link #tokenizePreprocessed}.
     *
     * @param text preprocessed text
     * @param consumer receives each token
     */
    static void tokenizePreprocessedAlphanumeric(String text, Consumer<String> consumer) {
        StringBuilder token = new StringBuilder();
        int lastIdeograph = -1;
        // True while `token` is accumulating a digit run rather than letters.
        boolean digitRun = false;
        int pos = 0;
        final int n = text.length();
        while (pos < n) {
            final int cp = text.codePointAt(pos);
            pos += Character.charCount(cp);
            // Diacritics, tatweel, ZWNJ, ZWJ: invisible to tokenization
            if (cp >= 0x0300 && CharSoupFeatureExtractor.isTransparent(cp)) {
                continue;
            }
            if (Character.isDigit(cp)) {
                if (!digitRun) {
                    // A digit ends any pending alphabetic word
                    flush(token, consumer);
                    digitRun = true;
                }
                lastIdeograph = -1;
                token.appendCodePoint(cp);
            } else if (Character.isLetter(cp)) {
                if (digitRun) {
                    // A letter ends any pending digit run
                    flush(token, consumer);
                    digitRun = false;
                }
                final int folded = Character.toLowerCase(cp);
                if (Character.isIdeographic(cp)) {
                    flush(token, consumer);
                    if (lastIdeograph >= 0) {
                        consumer.accept(new String(new int[]{lastIdeograph, folded}, 0, 2));
                    }
                    lastIdeograph = folded;
                } else {
                    lastIdeograph = -1;
                    token.appendCodePoint(folded);
                }
            } else {
                // Neither letter nor digit: hard boundary for everything
                flush(token, consumer);
                lastIdeograph = -1;
                digitRun = false;
            }
        }
        flush(token, consumer);
    }

    /**
     * Emit the buffered token (if any) to the consumer and reset the buffer.
     * No-op on an empty buffer.
     */
    private static void flush(StringBuilder buffer, Consumer<String> consumer) {
        if (buffer.length() > 0) {
            consumer.accept(buffer.toString());
            buffer.setLength(0);
        }
    }
}