FeatureExtractor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup;
/**
 * Contract shared by every feature extractor that feeds the bigram language
 * detector.
 * <p>
 * All implementations are required to run the same preprocessing step,
 * {@link CharSoupFeatureExtractor#preprocess(String)}; what they are free to
 * vary is the way features are derived from the preprocessed text and hashed
 * into buckets.
 */
public interface FeatureExtractor {

    /**
     * Runs the complete pipeline: preprocessing followed by feature extraction.
     *
     * @param rawText raw input text (may be {@code null})
     * @return feature counts as an int array of length {@link #getNumBuckets()}
     */
    int[] extract(String rawText);

    /**
     * Runs the complete pipeline, writing the result into a buffer owned by the
     * caller. The buffer is zeroed before any counts are written.
     *
     * @param rawText raw input text (may be {@code null})
     * @param counts  pre-allocated int array of length {@link #getNumBuckets()}
     *                (will be zeroed)
     */
    void extract(String rawText, int[] counts);

    /**
     * Extracts features from text that has already been preprocessed.
     *
     * @param preprocessedText text that was previously passed through
     *                         {@link CharSoupFeatureExtractor#preprocess(String)}
     * @return feature counts as an int array of length {@link #getNumBuckets()}
     */
    int[] extractFromPreprocessed(String preprocessedText);

    /**
     * Extracts features from already-preprocessed text into a buffer owned by
     * the caller.
     *
     * @param preprocessedText text that was previously passed through
     *                         {@link CharSoupFeatureExtractor#preprocess(String)}
     * @param counts           pre-allocated int array of length
     *                         {@link #getNumBuckets()}
     * @param clear            {@code true} to zero the array before extracting;
     *                         {@code false} to accumulate on top of the counts
     *                         already present
     */
    void extractFromPreprocessed(String preprocessedText, int[] counts, boolean clear);

    /**
     * Fills {@code counts} with extracted features and reports how many n-gram
     * emissions were made in total.
     * <p>
     * The returned value is the raw number of individual n-gram tokens
     * processed before bucket hashing. It serves as a script-neutral measure of
     * how much signal the input carries: whitespace-only input yields 0, while
     * ~200 chars of typical Latin or CJK prose yields roughly 400. That makes it
     * the right threshold variable for length-gated confusables, because it is
     * insensitive to padding spaces or punctuation-heavy inputs and it
     * naturally accounts for the higher feature density of CJK text compared
     * with Latin text.
     * <p>
     * This default implementation adds up the feature vector once extraction
     * has finished. That is correct because every emission performs
     * {@code counts[bucket]++}, so the sum equals the total emission count no
     * matter how many hash collisions occurred.
     *
     * @param rawText raw input text (may be {@code null})
     * @param counts  pre-allocated int array of length {@link #getNumBuckets()}
     *                (will be zeroed)
     * @return total n-gram emission count ({@code >= 0})
     */
    default int extractAndCount(String rawText, int[] counts) {
        // Delegate to the buffer-filling overload, then total up the buckets.
        extract(rawText, counts);
        int total = 0;
        for (int bucket = 0; bucket < counts.length; bucket++) {
            total += counts[bucket];
        }
        return total;
    }

    /**
     * Reports the size of the feature vector.
     *
     * @return number of hash buckets (feature vector size)
     */
    int getNumBuckets();

    /**
     * Reports which feature types this extractor emits, expressed as a bitmask
     * of the {@code FLAG_*} constants defined on {@link CharSoupModel}.
     * <p>
     * The value must be identical to the {@code featureFlags} stored in any
     * {@link CharSoupModel} used together with this extractor. If the two
     * differ, the model was trained with a different feature set and will
     * produce garbage scores.
     *
     * @return bitmask of active feature flags
     */
    int getFeatureFlags();
}