GlmScriptCategory.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup;
/**
* Fine-grained Unicode script categories for the generative language model.
*
* <p>Unlike {@link ScriptCategory}, this class:
* <ul>
* <li>Covers all scripts present in the 204-language training set.</li>
* <li>Has <b>no OTHER catch-all</b> ��� {@link #of(int)} returns {@code -1}
* for unrecognized scripts, which are simply ignored rather than
* bucketed together.</li>
* <li>Is independent of {@link ScriptCategory#COUNT}, so the discriminative
* model is unaffected by changes here.</li>
* </ul>
*
* <p>The normalized script distribution (proportion of letters per script)
* provides a strong signal for detecting charset errors and garbled text:
* genuine Japanese text is ~40% Hiragana, ~20% Katakana, ~30% CJK, while
* mojibake produces random codepoints with a very different distribution.
*/
public final class GlmScriptCategory {
// ---- Shared with ScriptCategory (same IDs for first 15) ----
public static final int LATIN = 0;
public static final int CYRILLIC = 1;
public static final int ARABIC = 2;
public static final int HAN = 3;
public static final int HANGUL = 4;
public static final int HIRAGANA = 5;
public static final int KATAKANA = 6;
public static final int DEVANAGARI = 7;
public static final int THAI = 8;
public static final int GREEK = 9;
public static final int HEBREW = 10;
public static final int BENGALI = 11; // also Assamese
public static final int GEORGIAN = 12; // also Mingrelian
public static final int ARMENIAN = 13;
public static final int ETHIOPIC = 14; // Amharic, Tigrinya
// ---- Scripts covered in our model but previously in OTHER ----
public static final int MYANMAR = 15; // Burmese
public static final int TIBETAN = 16;
public static final int KHMER = 17;
public static final int TAMIL = 18;
public static final int TELUGU = 19;
public static final int KANNADA = 20;
public static final int MALAYALAM = 21;
public static final int GUJARATI = 22;
public static final int GURMUKHI = 23; // Punjabi
public static final int ORIYA = 24; // Odia
public static final int SINHALA = 25;
public static final int LAO = 26;
public static final int NKO = 27;
public static final int THAANA = 28; // Dhivehi/Maldivian
public static final int OL_CHIKI = 29; // Santali
// ---- CJK sub-blocks (finer-grained Han) ----
public static final int HAN_EXT_A = 30; // U+3400���U+4DBF
public static final int HAN_EXT_B = 31; // U+20000+ (rare extensions)
public static final int HAN_COMPAT = 32; // U+F900���U+FAFF
public static final int BOPOMOFO = 33; // Traditional Chinese phonetic
/** Total number of categories. No OTHER catch-all. */
public static final int COUNT = 34;
private static final String[] NAMES = {
"LATIN", "CYRILLIC", "ARABIC", "HAN", "HANGUL",
"HIRAGANA", "KATAKANA", "DEVANAGARI", "THAI", "GREEK",
"HEBREW", "BENGALI", "GEORGIAN", "ARMENIAN", "ETHIOPIC",
"MYANMAR", "TIBETAN", "KHMER",
"TAMIL", "TELUGU", "KANNADA", "MALAYALAM",
"GUJARATI", "GURMUKHI", "ORIYA", "SINHALA",
"LAO", "NKO", "THAANA", "OL_CHIKI",
"HAN_EXT_A", "HAN_EXT_B", "HAN_COMPAT", "BOPOMOFO"
};
private GlmScriptCategory() {}
/**
* Map a codepoint to its fine-grained script category.
*
* @param cp a Unicode codepoint (should already be lowercased)
* @return category ID in [0, {@link #COUNT}), or {@code -1} if the
* script is not covered (caller should skip, not bucket)
*/
public static int of(int cp) {
// Fast path: ASCII is Latin
if (cp < 0x0080) {
return LATIN;
}
// Bopomofo ��� check before UnicodeScript dispatch
if ((cp >= 0x3100 && cp <= 0x312F) || (cp >= 0x31A0 && cp <= 0x31BF)) {
return BOPOMOFO;
}
Character.UnicodeScript us = Character.UnicodeScript.of(cp);
if (us == Character.UnicodeScript.HAN) {
return hanSubBlock(cp);
}
return fromUnicodeScript(us);
}
private static int hanSubBlock(int cp) {
if (cp >= 0x3400 && cp <= 0x4DBF) return HAN_EXT_A;
if (cp >= 0xF900 && cp <= 0xFAFF) return HAN_COMPAT;
if (cp >= 0x20000) return HAN_EXT_B;
return HAN;
}
private static int fromUnicodeScript(Character.UnicodeScript us) {
switch (us) {
case LATIN: return LATIN;
case CYRILLIC: return CYRILLIC;
case ARABIC: return ARABIC;
case HANGUL: return HANGUL;
case HIRAGANA: return HIRAGANA;
case KATAKANA: return KATAKANA;
case DEVANAGARI: return DEVANAGARI;
case THAI: return THAI;
case GREEK: return GREEK;
case HEBREW: return HEBREW;
case BENGALI: return BENGALI;
case GEORGIAN: return GEORGIAN;
case ARMENIAN: return ARMENIAN;
case ETHIOPIC: return ETHIOPIC;
case MYANMAR: return MYANMAR;
case TIBETAN: return TIBETAN;
case KHMER: return KHMER;
case TAMIL: return TAMIL;
case TELUGU: return TELUGU;
case KANNADA: return KANNADA;
case MALAYALAM: return MALAYALAM;
case GUJARATI: return GUJARATI;
case GURMUKHI: return GURMUKHI;
case ORIYA: return ORIYA;
case SINHALA: return SINHALA;
case LAO: return LAO;
case NKO: return NKO;
case THAANA: return THAANA;
case OL_CHIKI: return OL_CHIKI;
default: return -1; // unrecognized ��� caller skips
}
}
/** Human-readable name for a category index. */
public static String name(int category) {
if (category >= 0 && category < NAMES.length) {
return NAMES[category];
}
return "UNKNOWN(" + category + ")";
}
}