CharSoupFeatureExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import java.text.Normalizer;
import java.util.regex.Pattern;

/**
 * Extracts character n-gram features from text using the hashing trick (FNV-1a).
 * <p>
 * <strong>WARNING ��� DO NOT CHANGE THIS CLASS WITHOUT RETRAINING THE MODEL.</strong>
 * This class encodes the exact preprocessing and feature-extraction pipeline
 * that was used when {@code langdetect.bin} was trained. Training and inference
 * must be bit-for-bit identical: any change to {@link #isTransparent(int)},
 * {@link #preprocess(String)}, {@link #preprocessNoTruncate(String)},
 * {@link #extractBigrams}, or the FNV-1a hash will silently degrade accuracy
 * because the model weights will no longer correspond to the features being
 * computed. Changes here also affect tika-eval tokenization, which calls
 * {@link #preprocessNoTruncate(String)} and {@link #isTransparent(int)}
 * directly.
 * </p>
 * <h3>Pipeline</h3>
 * <ol>
 *   <li>Truncate input at {@link #MAX_TEXT_LENGTH} chars</li>
 *   <li>Strip URLs and emails (TIKA-2777 bounded patterns)</li>
 *   <li>NFC normalize</li>
 *   <li>Iterate codepoints (surrogate-safe)</li>
 *   <li>Skip transparent characters (see {@link #isTransparent(int)})</li>
 *   <li>Filter: {@link Character#isLetter(int)}</li>
 *   <li>Case fold: {@link Character#toLowerCase(int)}</li>
 *   <li>Emit bigrams (and optionally trigrams) with underscore {@code _} sentinels
 *       at word boundaries</li>
 *   <li>Hash each n-gram via FNV-1a ��� bucket index</li>
 * </ol>
 *
 * <h3>Trigram mode</h3>
 * <p>
 * When {@code includeTrigrams} is enabled, both bigrams and trigrams are hashed
 * into the same bucket vector. Trigrams are more discriminative than bigrams
 * (e.g., "the" vs "th"+"he"), which improves accuracy on very short texts.
 * The tradeoff is more hash collisions in smaller bucket vectors.
 * </p>
 *
 * <h3>Transparent character handling</h3>
 * <p>
 * Certain codepoints are treated as <em>transparent</em> ��� they are skipped entirely
 * during n-gram extraction so that base letters on either side form a contiguous pair.
 * This is critical for Arabic and Hebrew where diacritical marks (harakat, niqqud) are
 * Unicode nonspacing marks ({@code Mn}) that would otherwise break words into isolated
 * single-letter fragments, destroying the bigram signal.
 * </p>
 * <p>See {@link #isTransparent(int)} for the full list of skipped codepoints.</p>
 */
public class CharSoupFeatureExtractor {

    /** Maximum characters to process ��� prevents DoS, more than enough for detection. */
    static final int MAX_TEXT_LENGTH = 100_000;

    /** Underscore sentinel codepoint used for word boundary bigrams. */
    static final int SENTINEL = '_';

    // TIKA-2777: bounded regexes to avoid catastrophic backtracking
    private static final Pattern URL_REGEX =
            Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
    private static final Pattern MAIL_REGEX =
            Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");

    /** Arabic Tatweel (kashida) ��� a typographic stretching character (U+0640). */
    private static final int TATWEEL = 0x0640;

    /** Zero Width Non-Joiner (U+200C) ��� used in Persian/Arabic/Urdu. */
    private static final int ZWNJ = 0x200C;

    /** Zero Width Joiner (U+200D). */
    private static final int ZWJ = 0x200D;

    private final int numBuckets;
    private final boolean includeTrigrams;

    /**
     * Create an extractor with bigrams only.
     *
     * @param numBuckets number of hash buckets (feature vector size)
     */
    public CharSoupFeatureExtractor(int numBuckets) {
        this(numBuckets, false);
    }

    /**
     * Create an extractor with configurable n-gram mode.
     *
     * @param numBuckets      number of hash buckets (feature vector size)
     * @param includeTrigrams if {@code true}, both bigrams and trigrams are
     *                        hashed into the bucket vector; if {@code false},
     *                        only bigrams are used (the default)
     */
    public CharSoupFeatureExtractor(int numBuckets, boolean includeTrigrams) {
        if (numBuckets <= 0) {
            throw new IllegalArgumentException("numBuckets must be positive: " + numBuckets);
        }
        this.numBuckets = numBuckets;
        this.includeTrigrams = includeTrigrams;
    }

    /**
     * Full preprocessing + feature extraction pipeline.
     *
     * @param rawText raw input text (may be {@code null})
     * @return int array of size {@code numBuckets} with bigram counts
     */
    public int[] extract(String rawText) {
        int[] counts = new int[numBuckets];
        if (rawText == null || rawText.isEmpty()) {
            return counts;
        }
        String text = preprocess(rawText);
        extractBigrams(text, counts);
        return counts;
    }

    /**
     * Extract features into a caller-supplied buffer, avoiding allocation.
     * The buffer is zeroed and then filled with bigram counts.
     * <p>
     * Use this in tight training loops to eliminate per-sample GC pressure
     * from allocating 128KB int arrays millions of times.
     *
     * @param rawText raw input text (may be {@code null})
     * @param counts  pre-allocated int array of size {@code numBuckets} (will be zeroed)
     */
    public void extract(String rawText, int[] counts) {
        java.util.Arrays.fill(counts, 0);
        if (rawText == null || rawText.isEmpty()) {
            return;
        }
        String text = preprocess(rawText);
        extractBigrams(text, counts);
    }

    /**
     * Extract features from <em>already-preprocessed</em> text (no NFC, no URL stripping,
     * no truncation). Use this when the text has already been passed through
     * {@link #preprocess(String)} ��� for example, when loading preprocessed data from disk.
     *
     * @param preprocessedText text that has already been through {@link #preprocess(String)}
     * @return int array of size {@code numBuckets} with bigram counts
     */
    public int[] extractFromPreprocessed(String preprocessedText) {
        int[] counts = new int[numBuckets];
        if (preprocessedText == null || preprocessedText.isEmpty()) {
            return counts;
        }
        extractBigrams(preprocessedText, counts);
        return counts;
    }

    /**
     * Extract features from already-preprocessed text into a caller-supplied buffer.
     * Combines the benefits of {@link #extractFromPreprocessed(String)} (skip preprocessing)
     * and {@link #extract(String, int[])} (no allocation).
     * <p>
     * This is the fastest extraction path ��� use it in training loops where text has
     * been preprocessed and written to disk ahead of time.
     *
     * @param preprocessedText text that has already been through {@link #preprocess(String)}
     * @param counts           pre-allocated int array of size {@code numBuckets} (will be zeroed)
     */
    public void extractFromPreprocessed(String preprocessedText, int[] counts) {
        extractFromPreprocessed(preprocessedText, counts, true);
    }

    /**
     * Extract features from already-preprocessed text into a caller-supplied buffer,
     * optionally clearing it first.
     * <p>
     * When {@code clear} is {@code false}, bigram counts are <em>accumulated</em> on
     * top of whatever is already in the buffer. This is useful in training loops
     * where features from multiple sources need to be combined into a single vector.
     *
     * @param preprocessedText text that has already been through {@link #preprocess(String)}
     * @param counts           pre-allocated int array of size {@code numBuckets}
     * @param clear            if {@code true}, zero the array before extracting;
     *                         if {@code false}, accumulate on top of existing counts
     */
    public void extractFromPreprocessed(String preprocessedText, int[] counts, boolean clear) {
        if (clear) {
            java.util.Arrays.fill(counts, 0);
        }
        if (preprocessedText == null || preprocessedText.isEmpty()) {
            return;
        }
        extractBigrams(preprocessedText, counts);
    }

    /**
     * Preprocessing: truncate, strip URLs/emails, NFC normalize.
     * <p>
     * This method is also used by the general word tokenizer so that
     * tika-eval shares the same normalization pipeline.
     * </p>
     *
     * @param rawText raw input
     * @return cleaned, NFC-normalized text
     */
    public static String preprocess(String rawText) {
        String text = rawText;

        // 1. Truncate (for language detection ��� keeps inference fast)
        if (text.length() > MAX_TEXT_LENGTH) {
            text = text.substring(0, MAX_TEXT_LENGTH);
        }

        return preprocessNoTruncate(text);
    }

    /**
     * Preprocessing without the length truncation: strip URLs/emails and
     * NFC-normalize. Used by tika-eval tokenization, which imposes its own
     * {@code maxTokens} limit rather than a character limit.
     * <p>
     * <strong>Important:</strong> {@link #preprocess(String)} delegates to this
     * method after truncating, so any change here affects both language detection
     * and tika-eval tokenization. Also note that
     * {@link org.apache.tika.eval.core.tokens.TikaEvalTokenizer} calls
     * {@link #isTransparent(int)} directly; changes to that method affect
     * tika-eval token boundaries as well.
     * </p>
     *
     * @param rawText raw input
     * @return cleaned, NFC-normalized text
     */
    public static String preprocessNoTruncate(String rawText) {
        // Strip URLs and emails
        String text = URL_REGEX.matcher(rawText).replaceAll(" ");
        text = MAIL_REGEX.matcher(text).replaceAll(" ");

        // NFC normalize
        if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
            text = Normalizer.normalize(text, Normalizer.Form.NFC);
        }

        return text;
    }

    /**
     * Determine whether a codepoint should be treated as transparent (skipped)
     * during bigram extraction and word tokenization.
     * <p>
     * Transparent codepoints are invisible to the bigram/tokenization logic:
     * base letters on either side of a transparent run form a contiguous bigram
     * or remain part of the same word token.
     * </p>
     * <p>The following categories are transparent:</p>
     * <ul>
     *   <li><b>Unicode nonspacing marks (Mn)</b> ��� includes Arabic harakat
     *       (fatha U+064E, damma U+064F, kasra U+0650, shadda U+0651,
     *       sukun U+0652, tanwin U+064B���U+064D, superscript alef U+0670)
     *       and Hebrew niqqud (U+05B0���U+05BD, U+05BF, U+05C1���U+05C2,
     *       U+05C4���U+05C5, U+05C7). Without this, diacritics break Arabic/Hebrew
     *       words into isolated single-letter fragments because
     *       {@link Character#isLetter(int)} returns {@code false} for Mn
     *       codepoints. Stripping them yields clean base-letter bigrams, which
     *       Stripping them preserves clean base-letter bigrams.</li>
     *   <li><b>Arabic Tatweel / Kashida (U+0640)</b> ��� a typographic stretching
     *       character that is classified as a letter but carries no linguistic
     *       information. "������" and "����������" should produce identical bigrams.</li>
     *   <li><b>ZWNJ (U+200C)</b> ��� Zero Width Non-Joiner, used heavily in
     *       Persian/Farsi (e.g., "�����������������") and in Arabic, Urdu, and Kurdish
     *       to control cursive joining. It is <em>not</em> a word boundary;
     *       bigrams should span across it.</li>
     *   <li><b>ZWJ (U+200D)</b> ��� Zero Width Joiner, forces cursive joining.
     *       Also not a word boundary.</li>
     * </ul>
     * <p>
     * A fast guard ({@code cp < 0x0300}) short-circuits the check for ASCII
     * and basic Latin/Greek text, adding zero overhead to the common case.
     * </p>
     *
     * @param cp a Unicode codepoint
     * @return {@code true} if the codepoint should be skipped
     */
    public static boolean isTransparent(int cp) {
        // Fast path: ASCII and Latin-1 Supplement + Latin Extended blocks
        // have no transparent characters. Combining Diacritical Marks start
        // at U+0300; ZWNJ/ZWJ/Tatweel are all above that.
        if (cp < 0x0300) {
            return false;
        }
        return Character.getType(cp) == Character.NON_SPACING_MARK
                || cp == TATWEEL
                || cp == ZWNJ
                || cp == ZWJ;
    }

    /**
     * Iterate codepoints, skip {@linkplain #isTransparent(int) transparent}
     * characters, emit sentinel-bounded bigrams (and optionally trigrams),
     * hash into buckets.
     */
    private void extractBigrams(String text, int[] counts) {
        int prevCp = SENTINEL;  // previous codepoint (for bigrams)
        int prevPrevCp = -1;    // two codepoints back (for trigrams)
        boolean prevWasLetter = false;

        int i = 0;
        int len = text.length();
        while (i < len) {
            int cp = text.codePointAt(i);
            i += Character.charCount(cp);

            // Skip diacritics, tatweel, ZWNJ, ZWJ ��� see isTransparent javadoc
            if (cp >= 0x0300 && isTransparent(cp)) {
                continue;
            }

            if (Character.isLetter(cp)) {
                int lower = Character.toLowerCase(cp);
                if (prevWasLetter) {
                    // mid-word bigram
                    incrementBucket(counts, prevCp, lower);
                    // mid-word trigram
                    if (includeTrigrams && prevPrevCp >= 0) {
                        incrementTrigramBucket(counts, prevPrevCp, prevCp, lower);
                    }
                } else {
                    // word-initial bigram: (sentinel, letter)
                    incrementBucket(counts, SENTINEL, lower);
                    // no trigram yet ��� need at least 2 previous codepoints
                }
                prevPrevCp = prevCp;
                prevCp = lower;
                prevWasLetter = true;
            } else {
                if (prevWasLetter) {
                    // word-final bigram: (letter, sentinel)
                    incrementBucket(counts, prevCp, SENTINEL);
                    // word-final trigram: (prev, letter, sentinel)
                    if (includeTrigrams && prevPrevCp >= 0) {
                        incrementTrigramBucket(counts, prevPrevCp, prevCp, SENTINEL);
                    }
                }
                prevWasLetter = false;
                prevPrevCp = -1;
                // prevCp doesn't matter when prevWasLetter is false
            }
        }

        // End of text: emit final sentinel bigram/trigram if last char was a letter
        if (prevWasLetter) {
            incrementBucket(counts, prevCp, SENTINEL);
            if (includeTrigrams && prevPrevCp >= 0) {
                incrementTrigramBucket(counts, prevPrevCp, prevCp, SENTINEL);
            }
        }
    }

    private void incrementBucket(int[] counts, int cp1, int cp2) {
        int bucket = bucketIndex(cp1, cp2);
        counts[bucket]++;
    }

    private void incrementTrigramBucket(int[] counts, int cp1, int cp2, int cp3) {
        int hash = hashTrigram(cp1, cp2, cp3);
        int bucket = (hash & 0x7FFFFFFF) % numBuckets;
        counts[bucket]++;
    }

    /**
     * Compute the bucket index for a bigram of two codepoints.
     *
     * @param cp1 first codepoint
     * @param cp2 second codepoint
     * @return bucket index in [0, numBuckets)
     */
    int bucketIndex(int cp1, int cp2) {
        int hash = hashBigram(cp1, cp2);
        return (hash & 0x7FFFFFFF) % numBuckets;
    }

    /**
     * FNV-1a 32-bit hash of two codepoints, each fed as 4 little-endian bytes.
     *
     * @param cp1 first codepoint
     * @param cp2 second codepoint
     * @return 32-bit FNV-1a hash
     */
    static int hashBigram(int cp1, int cp2) {
        int hash = 0x811c9dc5; // FNV offset basis
        hash = fnvFeedInt(hash, cp1);
        hash = fnvFeedInt(hash, cp2);
        return hash;
    }

    /**
     * FNV-1a 32-bit hash of three codepoints, each fed as 4 little-endian bytes.
     * <p>
     * Trigrams occupy a different hash space than bigrams naturally because
     * the third codepoint extends the hash chain, so bigram and trigram features
     * won't systematically collide.
     *
     * @param cp1 first codepoint
     * @param cp2 second codepoint
     * @param cp3 third codepoint
     * @return 32-bit FNV-1a hash
     */
    static int hashTrigram(int cp1, int cp2, int cp3) {
        int hash = 0x811c9dc5; // FNV offset basis
        hash = fnvFeedInt(hash, cp1);
        hash = fnvFeedInt(hash, cp2);
        hash = fnvFeedInt(hash, cp3);
        return hash;
    }

    /**
     * Feed a 32-bit int into an FNV-1a hash as 4 little-endian bytes.
     */
    private static int fnvFeedInt(int hash, int value) {
        hash ^= (value & 0xFF);
        hash *= 0x01000193;
        hash ^= ((value >>> 8) & 0xFF);
        hash *= 0x01000193;
        hash ^= ((value >>> 16) & 0xFF);
        hash *= 0x01000193;
        hash ^= ((value >>> 24) & 0xFF);
        hash *= 0x01000193;
        return hash;
    }

    public int getNumBuckets() {
        return numBuckets;
    }

    public boolean isIncludeTrigrams() {
        return includeTrigrams;
    }
}