FloresNorm.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup.tools;
import java.util.Map;
import java.util.Set;
/**
* Normalizes FLORES-200 language codes to the canonical codes used by our
* training pipeline.
*
* <p>FLORES-200 uses {@code lang_Script} codes (e.g. {@code zho_Hans},
* {@code arb_Arab}). Normalization is two steps:
* <ol>
* <li>Strip the script suffix ��� {@code zho_Hans} ��� {@code zho} ��� unless
* the code is in {@link #KEEP_SCRIPT_SUFFIX}, where the script variant
* is a genuinely different language from our training data's default
* script for that code.</li>
* <li>Remap FLORES-specific codes to our canonical codes via
* {@link #CODE_REMAP} (e.g. {@code arb} ��� {@code ara},
* {@code cmn} ��� {@code zho}).</li>
* </ol>
*/
public final class FloresNorm {
/**
* FLORES codes where the script suffix must be kept because the
* script-suffixed variant is a different language from our training data:
* <ul>
* <li>{@code ace_Arab} ��� Acehnese in Jawi; our {@code ace} is Latin-script</li>
* <li>{@code arb_Latn} ��� Romanized Arabic; distinct from Arabic-script {@code arb}</li>
* <li>{@code bjn_Arab} ��� Banjar in Jawi; our {@code bjn} is Latin-script</li>
* <li>{@code kas_Deva} ��� Kashmiri in Devanagari; primary written form is Nastaliq</li>
* <li>{@code knc_Latn} ��� Central Kanuri in Latin; traditional script is Arabic</li>
* <li>{@code min_Arab} ��� Minangkabau in Jawi; our {@code min} is Latin-script</li>
* <li>{@code taq_Tfng} ��� Tamasheq in Tifinagh; digital text predominantly Latin</li>
* </ul>
* {@code zho_Hans} and {@code zho_Hant} both normalize to {@code zho}.
*/
public static final Set<String> KEEP_SCRIPT_SUFFIX = Set.of(
"ace_Arab",
"arb_Latn",
"bjn_Arab",
"kas_Deva",
"knc_Latn",
"min_Arab",
"taq_Tfng"
);
/**
* Maps FLORES base codes to the canonical codes used in our model.
* Only entries where the FLORES code differs from our canonical code
* are listed.
*/
public static final Map<String, String> CODE_REMAP = Map.ofEntries(
Map.entry("arb", "ara"), // Modern Standard Arabic ��� Arabic
Map.entry("pes", "fas"), // Western Persian ��� Farsi
Map.entry("zsm", "msa"), // Standard Malay ��� Malay
Map.entry("lvs", "lav"), // Standard Latvian ��� Latvian
Map.entry("azj", "aze"), // North Azerbaijani ��� Azerbaijani
Map.entry("ekk", "est"), // Standard Estonian ��� Estonian
Map.entry("npi", "nep"), // Nepali (individual) ��� Nepali
Map.entry("als", "sqi"), // Tosk Albanian ��� Albanian
Map.entry("ory", "ori"), // Odia ��� Oriya
Map.entry("nor", "nob"), // Norwegian ��� Bokm��l
Map.entry("cmn", "zho"), // Mandarin ��� Chinese
Map.entry("swa", "swh"), // Swahili (macrolanguage) ��� Swahili
Map.entry("yid", "ydd"), // Yiddish ��� Eastern Yiddish
Map.entry("gug", "grn"), // Paraguayan Guaran�� ��� Guaran��
Map.entry("quz", "que"), // Cusco Quechua ��� Quechua
Map.entry("plt", "mlg"), // Plateau Malagasy ��� Malagasy
Map.entry("pbt", "pus"), // Southern Pashto ��� Pashto
Map.entry("uzn", "uzb"), // Northern Uzbek ��� Uzbek
Map.entry("kmr", "kur"), // Kurmanji Kurdish ��� Kurdish
Map.entry("khk", "mon") // Khalkha Mongolian ��� Mongolian
);
/**
* Normalize a FLORES-200 language code to our canonical model code.
* Strips the script suffix then applies {@link #CODE_REMAP}.
*/
public static String normalize(String floresCode) {
String base = KEEP_SCRIPT_SUFFIX.contains(floresCode)
? floresCode
: stripScript(floresCode);
return CODE_REMAP.getOrDefault(base, base);
}
private static String stripScript(String code) {
int underscore = code.indexOf('_');
return underscore >= 0 ? code.substring(0, underscore) : code;
}
private FloresNorm() {}
}