ResearchFeatureExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.util.Arrays;

import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor;
import org.apache.tika.langdetect.charsoup.CharSoupModel;
import org.apache.tika.langdetect.charsoup.FeatureExtractor;
import org.apache.tika.langdetect.charsoup.ScriptCategory;

/**
 * Research-only feature extractor in which every optional feature family can
 * be switched on or off independently. It keeps the complete set of feature
 * configuration options that were explored during development of the
 * CharSoup model (flat-16k+tri+suf+pre, 2026-02), for ablation studies and
 * model training experiments.
 * <p>
 * Production inference uses the hardcoded
 * {@link org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor}
 * instead. This class exists so that {@link AblationRunner} and
 * {@link Phase2Trainer} can reproduce the exact training conditions.
 * <p>
 * Every feature is hashed (FNV-1a style, one distinct seed per feature
 * family) and counted in a single flat array of {@code numBuckets} buckets.
 */
public class ResearchFeatureExtractor implements FeatureExtractor {

    // Hash seeds, one per feature family, so that identical code point
    // sequences emitted by different families hash differently.
    static final int BIGRAM_BASIS       = 0x811c9dc5;
    static final int TRIGRAM_BASIS      = 0x9f4e3c21;
    static final int SKIP_BASIS         = 0x6d4d3a2b;
    static final int UNIGRAM_BASIS      = 0x2f4a3c17;
    static final int WORD_BASIS         = 0x4a1c7b39;
    static final int SUFFIX_BASIS       = 0x7e2b1a8f;
    static final int SUFFIX4_BASIS      = 0x5c8a1e49;
    static final int PREFIX_BASIS       = 0x3b7e9f12;
    static final int CHAR_UNIGRAM_BASIS = 0x1d4f8c3a;
    static final int FOURGRAM_BASIS     = 0xa3d8f215;
    static final int FIVEGRAM_BASIS     = 0xc7b46e38;

    /** Longest word (in letters, inclusive) that is counted as a word unigram. */
    static final int MAX_WORD_LENGTH = 30;
    /** Shortest word (in letters, inclusive) that is counted as a word unigram. */
    static final int MIN_WORD_LENGTH = 2;
    /** Placeholder code point standing for a word boundary / empty n-gram slot. */
    static final int SENTINEL = '_';

    /** Multiplier applied after each byte is XOR-ed into a hash. */
    private static final int FNV_PRIME = 0x01000193;

    private final int numBuckets;
    private final boolean useTrigrams;
    private final boolean useSkipBigrams;
    private final boolean useSuffixes;
    private final boolean useSuffix4;
    private final boolean usePrefix;
    private final boolean useWordUnigrams;
    private final boolean useCharUnigrams;
    private final boolean use4grams;
    private final boolean use5grams;

    /**
     * Minimal constructor: bigrams + word unigrams + CJK unigrams. Every other
     * optional feature family is disabled.
     *
     * @param numBuckets size of the flat bucket space; must be positive
     */
    public ResearchFeatureExtractor(int numBuckets) {
        this(numBuckets, false, false, false, false, false, true, false, false, false);
    }

    /**
     * Full-config constructor. All features share the same flat bucket space.
     *
     * @param numBuckets      size of the flat bucket space; must be positive
     * @param useTrigrams     emit character trigrams
     * @param useSkipBigrams  emit skip bigrams (first and third letter of a 3-letter window)
     * @param useSuffixes     emit the last three letters of words with at least 3 letters
     * @param useSuffix4      emit the last four letters of words with at least 4 letters
     * @param usePrefix       emit the first three letters of words with at least 3 letters
     * @param useWordUnigrams emit whole-word hashes for words of eligible length
     * @param useCharUnigrams emit single-letter features for letters that are not CJK/kana
     * @param use4grams       emit character 4-grams
     * @param use5grams       emit character 5-grams
     * @throws IllegalArgumentException if {@code numBuckets} is not positive
     */
    public ResearchFeatureExtractor(int numBuckets,
                                    boolean useTrigrams,
                                    boolean useSkipBigrams,
                                    boolean useSuffixes,
                                    boolean useSuffix4,
                                    boolean usePrefix,
                                    boolean useWordUnigrams,
                                    boolean useCharUnigrams,
                                    boolean use4grams,
                                    boolean use5grams) {
        if (numBuckets <= 0) {
            throw new IllegalArgumentException(
                    "numBuckets must be positive: " + numBuckets);
        }
        this.numBuckets = numBuckets;
        this.useTrigrams = useTrigrams;
        this.useSkipBigrams = useSkipBigrams;
        this.useSuffixes = useSuffixes;
        this.useSuffix4 = useSuffix4;
        this.usePrefix = usePrefix;
        this.useWordUnigrams = useWordUnigrams;
        this.useCharUnigrams = useCharUnigrams;
        this.use4grams = use4grams;
        this.use5grams = use5grams;
    }

    /**
     * Preprocesses {@code rawText} and counts its features into a new array.
     *
     * @param rawText text to analyse; {@code null} or empty yields all zeros
     * @return a fresh array of {@code numBuckets} bucket counts
     */
    @Override
    public int[] extract(String rawText) {
        int[] counts = new int[numBuckets];
        if (rawText != null && !rawText.isEmpty()) {
            extractFeatures(CharSoupFeatureExtractor.preprocess(rawText), counts);
        }
        return counts;
    }

    /**
     * Preprocesses {@code rawText} and counts its features into a caller-owned
     * array, which is always zeroed first.
     *
     * @param rawText text to analyse; {@code null} or empty leaves {@code counts} all zero
     * @param counts  destination array, indexed by bucket
     */
    @Override
    public void extract(String rawText, int[] counts) {
        Arrays.fill(counts, 0);
        if (rawText != null && !rawText.isEmpty()) {
            extractFeatures(CharSoupFeatureExtractor.preprocess(rawText), counts);
        }
    }

    /**
     * Counts the features of text that is used as-is, without the
     * preprocessing step applied by {@link #extract(String)}.
     *
     * @param text text to analyse; {@code null} or empty yields all zeros
     * @return a fresh array of {@code numBuckets} bucket counts
     */
    @Override
    public int[] extractFromPreprocessed(String text) {
        int[] counts = new int[numBuckets];
        if (text != null && !text.isEmpty()) {
            extractFeatures(text, counts);
        }
        return counts;
    }

    /**
     * Counts the features of text that is used as-is into a caller-owned array.
     *
     * @param text   text to analyse; {@code null} or empty adds nothing
     * @param counts destination array, indexed by bucket
     * @param clear  when {@code true} the array is zeroed first; otherwise
     *               the new counts are added on top of the existing values
     */
    @Override
    public void extractFromPreprocessed(String text, int[] counts, boolean clear) {
        if (clear) {
            Arrays.fill(counts, 0);
        }
        if (text != null && !text.isEmpty()) {
            extractFeatures(text, counts);
        }
    }

    /**
     * Single pass over the code points of {@code text} that increments the
     * bucket of every enabled feature. A "word" here is a maximal run of
     * letters belonging to the same script family.
     */
    private void extractFeatures(String text, int[] counts) {
        // Most recent lowercase letters of the current word, newest first.
        int last = SENTINEL;
        int back2 = SENTINEL;
        int back3 = SENTINEL;
        int back4 = SENTINEL;
        int lastScript = -1;
        boolean lastWasLetter = false;
        boolean lastWasCjk = false;

        // Running state of the word currently being read.
        int runningWordHash = WORD_BASIS;
        int lettersInWord = 0;
        int scriptOfWord = -1;
        // Last four letters of the word, tail3 being the newest.
        int tail0 = SENTINEL;
        int tail1 = SENTINEL;
        int tail2 = SENTINEL;
        int tail3 = SENTINEL;
        // First three letters of the word.
        int head1 = SENTINEL;
        int head2 = SENTINEL;
        int head3 = SENTINEL;

        final int end = text.length();
        for (int pos = 0; pos < end; ) {
            final int cp = text.codePointAt(pos);
            pos += Character.charCount(cp);

            // Transparent code points are skipped as if they were absent.
            if (cp >= 0x0300 && CharSoupFeatureExtractor.isTransparent(cp)) {
                continue;
            }

            if (!Character.isLetter(cp)) {
                if (lastWasLetter) {
                    if (lastWasCjk && isSpace(cp)) {
                        // A space right after a CJK letter does not reset the
                        // state: the previous letter is kept so that a CJK
                        // letter following the space can still pair with it.
                        lastWasLetter = false;
                        continue;
                    }
                    emitBoundaryEnd(counts, lastScript, last, lastWasCjk,
                            runningWordHash, lettersInWord, scriptOfWord,
                            tail0, tail1, tail2, tail3, head1, head2, head3);
                }
                lastWasLetter = false;
                lastWasCjk = false;
                last = SENTINEL;
                back2 = SENTINEL;
                back3 = SENTINEL;
                back4 = SENTINEL;
                lettersInWord = 0;
                tail0 = SENTINEL;
                tail1 = SENTINEL;
                tail2 = SENTINEL;
                tail3 = SENTINEL;
                head1 = SENTINEL;
                head2 = SENTINEL;
                head3 = SENTINEL;
                continue;
            }

            final int lower = Character.toLowerCase(cp);
            final int script = ScriptCategory.of(lower);
            final boolean cjk = isCjkScript(script);

            // Decide whether this letter continues the current word, bridges
            // a CJK run across a space, or opens a new word.
            final boolean opensWord;
            if (lastWasLetter) {
                opensWord = !sameFamily(script, lastScript);
                if (opensWord) {
                    // Script family changed mid-run: close the previous word.
                    emitBoundaryEnd(counts, lastScript, last, lastWasCjk,
                            runningWordHash, lettersInWord, scriptOfWord,
                            tail0, tail1, tail2, tail3, head1, head2, head3);
                } else {
                    emitBigram(counts, script, last, lower);
                    // Longer n-grams ending at this letter; only for non-CJK
                    // letters with at least two letters already in the window.
                    if (!cjk && back2 != SENTINEL) {
                        if (useTrigrams) {
                            emitTrigram(counts, script, back2, last, lower);
                        }
                        if (useSkipBigrams) {
                            emitSkipBigram(counts, script, back2, lower);
                        }
                        if (use4grams && back3 != SENTINEL) {
                            emit4gram(counts, script, back3, back2, last, lower);
                        }
                        if (use5grams && back4 != SENTINEL) {
                            emit5gram(counts, script, back4, back3, back2, last, lower);
                        }
                    }
                    back4 = back3;
                    back3 = back2;
                    back2 = last;
                    runningWordHash = fnvFeedInt(runningWordHash, lower);
                    lettersInWord++;
                    if (!cjk) {
                        tail0 = tail1;
                        tail1 = tail2;
                        tail2 = tail3;
                        tail3 = lower;
                        // Word-initial n-grams, padded on the left with the
                        // sentinel, are emitted once the word is long enough.
                        switch (lettersInWord) {
                            case 2:
                                head2 = lower;
                                if (useTrigrams) {
                                    emitTrigram(counts, script, SENTINEL, last, lower);
                                }
                                break;
                            case 3:
                                head3 = lower;
                                if (use4grams) {
                                    emit4gram(counts, script, SENTINEL, head1, head2, lower);
                                }
                                break;
                            case 4:
                                if (use5grams) {
                                    emit5gram(counts, script, SENTINEL,
                                            head1, head2, head3, lower);
                                }
                                break;
                            default:
                                break;
                        }
                    }
                }
            } else if (lastWasCjk && cjk && last != SENTINEL) {
                // CJK letter, one space, CJK letter: pair them across the space.
                emitBigram(counts, script, last, lower);
                opensWord = false;
            } else {
                opensWord = true;
            }

            if (opensWord) {
                emitBoundaryStart(counts, script, lower, cjk);
                // The word hash is seeded with the script, then the first letter.
                runningWordHash = fnvFeedInt(fnvFeedByte(WORD_BASIS, script), lower);
                lettersInWord = 1;
                scriptOfWord = script;
                back4 = SENTINEL;
                back3 = SENTINEL;
                back2 = SENTINEL;
                tail0 = SENTINEL;
                tail1 = SENTINEL;
                tail2 = SENTINEL;
                tail3 = lower;
                head1 = lower;
                head2 = SENTINEL;
                head3 = SENTINEL;
            }

            // Ideographs and kana always get a unigram; other letters only
            // when character unigrams are enabled.
            if (isCjkOrKana(lower)) {
                emitUnigram(counts, script, lower);
            } else if (useCharUnigrams) {
                emitCharUnigram(counts, script, lower);
            }

            last = lower;
            lastScript = script;
            lastWasLetter = true;
            lastWasCjk = cjk;
        }

        // Close a word that runs up to the end of the text.
        if (lastWasLetter) {
            emitBoundaryEnd(counts, lastScript, last, lastWasCjk,
                    runningWordHash, lettersInWord, scriptOfWord,
                    tail0, tail1, tail2, tail3, head1, head2, head3);
        }
    }

    /** Emits the (sentinel, first letter) bigram of a new word; nothing for CJK. */
    private void emitBoundaryStart(int[] counts, int script, int lower, boolean cjk) {
        if (cjk) {
            return;
        }
        emitBigram(counts, script, SENTINEL, lower);
    }

    /**
     * Emits every enabled word-final feature for the word that just ended:
     * the closing bigram, the right-padded n-grams, the word unigram, and the
     * suffix/prefix features. Nothing is emitted when the word ended on a CJK
     * letter.
     */
    private void emitBoundaryEnd(int[] counts, int script, int prevCp, boolean cjk,
                                  int wordHash, int wordLen, int wordScript,
                                  int suf0, int suf1, int suf2, int suf3,
                                  int preA, int preB, int preC) {
        if (cjk) {
            return;
        }
        emitBigram(counts, script, prevCp, SENTINEL);
        if (useTrigrams && wordLen >= 2) {
            emitTrigram(counts, script, suf2, suf3, SENTINEL);
        }
        if (use4grams) {
            if (wordLen >= 3) {
                emit4gram(counts, wordScript, suf1, suf2, suf3, SENTINEL);
            } else if (wordLen == 2) {
                // complete 2-letter word: (_, a, b, _)
                emit4gram(counts, wordScript, SENTINEL, suf2, suf3, SENTINEL);
            }
        }
        if (use5grams) {
            if (wordLen >= 4) {
                emit5gram(counts, wordScript, suf0, suf1, suf2, suf3, SENTINEL);
            } else if (wordLen == 3) {
                // complete 3-letter word: (_, a, b, c, _)
                emit5gram(counts, wordScript, SENTINEL, suf1, suf2, suf3, SENTINEL);
            }
        }
        if (useWordUnigrams) {
            emitWordIfEligible(counts, wordHash, wordLen);
        }
        if (useSuffixes && wordLen >= 3) {
            emitSuffix(counts, wordScript, suf1, suf2, suf3);
        }
        if (useSuffix4 && wordLen >= 4) {
            emitSuffix4(counts, wordScript, suf0, suf1, suf2, suf3);
        }
        if (usePrefix && wordLen >= 3) {
            emitPrefix(counts, wordScript, preA, preB, preC);
        }
    }

    /** Increments the bucket selected by the non-negative part of {@code hash}. */
    private void bump(int[] counts, int hash) {
        counts[(hash & 0x7FFFFFFF) % numBuckets]++;
    }

    /** Counts one character bigram. */
    private void emitBigram(int[] counts, int script, int cp1, int cp2) {
        int h = fnvFeedByte(BIGRAM_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        bump(counts, h);
    }

    /** Counts one character trigram. */
    private void emitTrigram(int[] counts, int script, int cp1, int cp2, int cp3) {
        int h = fnvFeedByte(TRIGRAM_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        h = fnvFeedInt(h, cp3);
        bump(counts, h);
    }

    /** Counts one skip bigram (two letters with one letter between them). */
    private void emitSkipBigram(int[] counts, int script, int cp1, int cp2) {
        int h = fnvFeedByte(SKIP_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        bump(counts, h);
    }

    /** Counts one CJK/kana unigram. */
    private void emitUnigram(int[] counts, int script, int cp) {
        int h = fnvFeedByte(UNIGRAM_BASIS, script);
        h = fnvFeedInt(h, cp);
        bump(counts, h);
    }

    /** Counts one single-letter feature for a letter that is not CJK/kana. */
    private void emitCharUnigram(int[] counts, int script, int cp) {
        int h = fnvFeedByte(CHAR_UNIGRAM_BASIS, script);
        h = fnvFeedInt(h, cp);
        bump(counts, h);
    }

    /** Counts one 3-letter word suffix. */
    private void emitSuffix(int[] counts, int script, int cp1, int cp2, int cp3) {
        int h = fnvFeedByte(SUFFIX_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        h = fnvFeedInt(h, cp3);
        bump(counts, h);
    }

    /** Counts one 4-letter word suffix. */
    private void emitSuffix4(int[] counts, int script, int cp1, int cp2, int cp3, int cp4) {
        int h = fnvFeedByte(SUFFIX4_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        h = fnvFeedInt(h, cp3);
        h = fnvFeedInt(h, cp4);
        bump(counts, h);
    }

    /** Counts one character 4-gram. */
    private void emit4gram(int[] counts, int script, int cp1, int cp2, int cp3, int cp4) {
        int h = fnvFeedByte(FOURGRAM_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        h = fnvFeedInt(h, cp3);
        h = fnvFeedInt(h, cp4);
        bump(counts, h);
    }

    /** Counts one character 5-gram. */
    private void emit5gram(int[] counts, int script,
                           int cp1, int cp2, int cp3, int cp4, int cp5) {
        int h = fnvFeedByte(FIVEGRAM_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        h = fnvFeedInt(h, cp3);
        h = fnvFeedInt(h, cp4);
        h = fnvFeedInt(h, cp5);
        bump(counts, h);
    }

    /** Counts one 3-letter word prefix. */
    private void emitPrefix(int[] counts, int script, int cp1, int cp2, int cp3) {
        int h = fnvFeedByte(PREFIX_BASIS, script);
        h = fnvFeedInt(h, cp1);
        h = fnvFeedInt(h, cp2);
        h = fnvFeedInt(h, cp3);
        bump(counts, h);
    }

    /**
     * Counts a whole-word hash, but only for words whose length lies within
     * [{@link #MIN_WORD_LENGTH}, {@link #MAX_WORD_LENGTH}].
     */
    private void emitWordIfEligible(int[] counts, int wordHash, int wordLen) {
        if (wordLen < MIN_WORD_LENGTH || wordLen > MAX_WORD_LENGTH) {
            return;
        }
        bump(counts, wordHash);
    }

    /** Returns whether {@code script} is the Han, Hiragana or Katakana category. */
    private static boolean isCjkScript(int script) {
        return script == ScriptCategory.HAN
                || script == ScriptCategory.HIRAGANA
                || script == ScriptCategory.KATAKANA;
    }

    /**
     * Returns whether two script categories belong to the same word-forming
     * family: either they are equal, or both are CJK categories.
     */
    private static boolean sameFamily(int a, int b) {
        return a == b || (isCjkScript(a) && isCjkScript(b));
    }

    /** Returns whether {@code cp} is a space, a tab, or a Unicode space separator. */
    private static boolean isSpace(int cp) {
        if (cp == ' ' || cp == '\t') {
            return true;
        }
        return Character.getType(cp) == Character.SPACE_SEPARATOR;
    }

    /** Returns whether {@code cp} is ideographic or in the Hiragana/Katakana script. */
    static boolean isCjkOrKana(int cp) {
        if (Character.isIdeographic(cp)) {
            return true;
        }
        switch (Character.UnicodeScript.of(cp)) {
            case HIRAGANA:
            case KATAKANA:
                return true;
            default:
                return false;
        }
    }

    /** Mixes the low 8 bits of {@code b} into {@code hash} (XOR, then multiply). */
    private static int fnvFeedByte(int hash, int b) {
        return (hash ^ (b & 0xFF)) * FNV_PRIME;
    }

    /** Mixes all four bytes of {@code value} into {@code hash}, lowest byte first. */
    private static int fnvFeedInt(int hash, int value) {
        int h = hash;
        for (int shift = 0; shift < Integer.SIZE; shift += Byte.SIZE) {
            h = fnvFeedByte(h, value >>> shift);
        }
        return h;
    }

    /** Returns the size of the flat bucket space this extractor writes into. */
    public int getNumBuckets() {
        return numBuckets;
    }

    /**
     * Returns the enabled optional feature families encoded as a bitwise OR
     * of the {@code CharSoupModel.FLAG_*} constants.
     */
    @Override
    public int getFeatureFlags() {
        int flags = 0;
        flags |= useTrigrams ? CharSoupModel.FLAG_TRIGRAMS : 0;
        flags |= useSkipBigrams ? CharSoupModel.FLAG_SKIP_BIGRAMS : 0;
        flags |= useSuffixes ? CharSoupModel.FLAG_SUFFIXES : 0;
        flags |= useSuffix4 ? CharSoupModel.FLAG_SUFFIX4 : 0;
        flags |= usePrefix ? CharSoupModel.FLAG_PREFIX : 0;
        flags |= useWordUnigrams ? CharSoupModel.FLAG_WORD_UNIGRAMS : 0;
        flags |= useCharUnigrams ? CharSoupModel.FLAG_CHAR_UNIGRAMS : 0;
        flags |= use4grams ? CharSoupModel.FLAG_4GRAMS : 0;
        flags |= use5grams ? CharSoupModel.FLAG_5GRAMS : 0;
        return flags;
    }
}