ScriptCategory.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup;
/**
* Coarse Unicode script categories for language detection.
* <p>
* The full {@link Character.UnicodeScript} enum has ~160 values, far more
* granularity than needed. This class maps scripts into a small set of
* categories that matter for language detection:
* <ul>
* <li>Scripts that cover multiple confusable languages (Latin, Cyrillic, Arabic)</li>
* <li>CJK scripts that need special n-gram treatment (Han, Hiragana, Katakana, Hangul)</li>
* <li>Major Indic and Southeast Asian scripts</li>
* <li>Everything else bucketed into OTHER</li>
* </ul>
* <p>
* The category ID (0���15) is used as a salt byte in feature hashing, ensuring
* that characters from different scripts never collide in the bucket space.
* </p>
*/
public final class ScriptCategory {
public static final int LATIN = 0;
public static final int CYRILLIC = 1;
public static final int ARABIC = 2;
public static final int HAN = 3;
public static final int HANGUL = 4;
public static final int HIRAGANA = 5;
public static final int KATAKANA = 6;
public static final int DEVANAGARI = 7;
public static final int THAI = 8;
public static final int GREEK = 9;
public static final int HEBREW = 10;
public static final int BENGALI = 11;
public static final int GEORGIAN = 12;
public static final int ARMENIAN = 13;
public static final int ETHIOPIC = 14;
public static final int OTHER = 15;
public static final int CANADIAN_ABORIGINAL = 16;
public static final int MYANMAR = 17;
public static final int TIBETAN = 18;
public static final int KHMER = 19;
/** Number of distinct categories. */
public static final int COUNT = 20;
private static final String[] NAMES = {
"LATIN", "CYRILLIC", "ARABIC", "HAN", "HANGUL",
"HIRAGANA", "KATAKANA", "DEVANAGARI", "THAI", "GREEK",
"HEBREW", "BENGALI", "GEORGIAN", "ARMENIAN", "ETHIOPIC", "OTHER",
"CANADIAN_ABORIGINAL", "MYANMAR", "TIBETAN", "KHMER"
};
private ScriptCategory() {
// utility class
}
/**
* Map a codepoint to its coarse script category.
* <p>
* Uses a fast-path for ASCII (Latin) before falling through to
* {@link Character.UnicodeScript#of(int)}.
*
* @param cp a Unicode codepoint (should already be lowercased)
* @return category ID in [0, {@link #COUNT})
*/
public static int of(int cp) {
// Fast path: ASCII is Latin
if (cp < 0x0080) {
return LATIN;
}
Character.UnicodeScript us = Character.UnicodeScript.of(cp);
return fromUnicodeScript(us);
}
/**
* Map a {@link Character.UnicodeScript} to a category.
*/
static int fromUnicodeScript(Character.UnicodeScript us) {
if (us == Character.UnicodeScript.LATIN) return LATIN;
if (us == Character.UnicodeScript.CYRILLIC) return CYRILLIC;
if (us == Character.UnicodeScript.ARABIC) return ARABIC;
if (us == Character.UnicodeScript.HAN) return HAN;
if (us == Character.UnicodeScript.HANGUL) return HANGUL;
if (us == Character.UnicodeScript.HIRAGANA) return HIRAGANA;
if (us == Character.UnicodeScript.KATAKANA) return KATAKANA;
if (us == Character.UnicodeScript.DEVANAGARI) return DEVANAGARI;
if (us == Character.UnicodeScript.THAI) return THAI;
if (us == Character.UnicodeScript.GREEK) return GREEK;
if (us == Character.UnicodeScript.HEBREW) return HEBREW;
if (us == Character.UnicodeScript.BENGALI) return BENGALI;
if (us == Character.UnicodeScript.GEORGIAN) return GEORGIAN;
if (us == Character.UnicodeScript.ARMENIAN) return ARMENIAN;
if (us == Character.UnicodeScript.ETHIOPIC) return ETHIOPIC;
if (us == Character.UnicodeScript.CANADIAN_ABORIGINAL) return CANADIAN_ABORIGINAL;
if (us == Character.UnicodeScript.MYANMAR) return MYANMAR;
if (us == Character.UnicodeScript.TIBETAN) return TIBETAN;
if (us == Character.UnicodeScript.KHMER) return KHMER;
return OTHER;
}
/**
* Human-readable name of a category.
*/
public static String name(int category) {
if (category >= 0 && category < NAMES.length) {
return NAMES[category];
}
return "UNKNOWN(" + category + ")";
}
}