PrepareCorpus.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup.tools;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor;
/**
* Converts a raw MADLAD corpus directory into the three data splits used by
* {@link TrainLanguageModel}:
* <ul>
* <li>{@code pool/} — per-language preprocessed files for training</li>
* <li>{@code dev.txt} — fixed dev set (preprocessed), used for early stopping</li>
* <li>{@code test_raw.txt} — fixed test set (raw), used for post-training eval</li>
* </ul>
*
* <p>This class owns all language-inclusion policy: the
* {@link #EXCLUDED_LANGS} list, the {@link #LANG_MERGE_MAP} aliases, and the
* script-aware sentence-count thresholds ({@link #MIN_SENTENCES_PER_LANG} for
* unique-script languages, {@link #LATIN_MIN_SENTENCES_PER_LANG} for Latin-script
* languages). {@link TrainLanguageModel} calls {@link #prepareData} directly so
* there is a single implementation shared by both entry points.
*
* <p>Usage:
* <pre>
* PrepareCorpus --corpus &lt;dir&gt; --output-dir &lt;dir&gt; [--max-train N]
* [--max-dev N] [--max-test N] [--no-script-filter] [--unk-per-lang N]
* </pre>
*/
public class PrepareCorpus {
/**
* Minimum sentences for unique-script (non-Latin) languages.
* These languages occupy a distinct region of the character space and can
* be identified reliably even with sparse data.
*
* <p>When a positive {@code maxTrainPerLang} is passed to {@code prepareData},
* the effective minimum is capped at {@code maxTrainPerLang / 2}.
*/
static final int MIN_SENTENCES_PER_LANG = 10_000;
/**
* Minimum sentences for Latin-script languages.
* Previously 20k when training on MADLAD (noisy data required more volume).
* Lowered to 10k for Wikipedia data which is cleaner.
*
* <p>Currently equal to {@link #MIN_SENTENCES_PER_LANG}; the two constants are
* kept separate so the thresholds can diverge again. The same
* {@code maxTrainPerLang / 2} cap applies.
*/
static final int LATIN_MIN_SENTENCES_PER_LANG = 10_000;
/** Default per-language cap on test sentences; {@code main} uses it unless {@code --max-test} is given. */
private static final int DEFAULT_MAX_TEST_PER_LANG = 2_000;
/** Default per-language cap on dev sentences; {@code main} uses it unless {@code --max-dev} is given. */
private static final int DEFAULT_MAX_DEV_PER_LANG = 2_000;
/**
* Languages explicitly excluded from the model despite having enough corpus
* data to meet the sentence-count threshold. Each exclusion falls into one
* of two categories:
*
* <ul>
* <li><b>Accuracy interference</b> — adding the language causes a closely
* related majority language to drop significantly in accuracy.</li>
* <li><b>Unacceptable own accuracy</b> — the language's own detection
* accuracy is too low to be useful.</li>
* </ul>
*
* <p>Comments inside the initializer that are not followed by an
* {@code ex.add(...)} call are historical notes about languages that were
* excluded in the past and have since been re-enabled.
*
* See the build documentation for per-language justifications.
*/
static final Set<String> EXCLUDED_LANGS;
static {
Set<String> ex = new HashSet<>();
// Venetian (vec): 72.0% own accuracy; Italian (ita) dropped to 83.6%.
ex.add("vec");
// Waray (war): previously excluded for two reasons: (1) MADLAD audit flag,
// (2) causes short-text confusion with English. Wikipedia source is cleaner;
// retested. POST-TRAINING: verify English (eng) recall is not degraded.
// Alsatian / Alemannic German (gsw): previously excluded for two reasons:
// (1) MADLAD content-quality flags, (2) confusion with German at short lengths.
// Wikipedia source retested. POST-TRAINING: verify German (deu) F1 not degraded.
// Hawaiian (haw): previously excluded due to MADLAD content-quality
// flags. Wikipedia source retested.
// Inuktitut (iku): previously excluded because MADLAD data was heavily
// English-contaminated. Wikipedia source uses Canadian Syllabics script;
// SCRIPT_CONSISTENCY_LANGS entry added to filter Latin contamination.
// Shan (shn): previously excluded due to Latin/English contamination in
// MADLAD. SCRIPT_CONSISTENCY_LANGS entry added (MYANMAR script) to filter it.
// Cornish (cor): Not present in Flores-200; Wikipedia corpus is
// known to contain substantial English code-switching. Re-enabled
// with Wikipedia data. POST-TRAINING: verify English (eng) recall
// is not degraded.
// Tosk Albanian (als): 69.7% own accuracy; Standard Albanian (sqi)
// collapsed to 51.6%.
ex.add("als");
// Madurese (mad): 9.1% own accuracy — essentially random.
ex.add("mad");
// Anaang (anw): 32.5% own accuracy on only 3,036 test sentences.
ex.add("anw");
// Konkani (knn): 46.2% own accuracy. Devanagari script overlaps Marathi.
ex.add("knn");
// Gilaki (glk): 88.6% own accuracy. Script overlaps Persian/Mazanderani.
ex.add("glk");
// Kituba / Monokutuba (mkw): 80.1% own accuracy. Overlaps Kongo/Lingala.
ex.add("mkw");
// Dzongkha (dzo): F1=0.584. Irresolvable collision with Tibetan (bod).
ex.add("dzo");
// Tibetan (bod): F1=0.652. Mutually destructive with Dzongkha (dzo).
ex.add("bod");
// Sabah Malay (msi): F1=0.611. Indistinguishable from msa/ind.
ex.add("msi");
// Meru (meo): F1=0.629. Similar Malay-family confusion to msi.
// NOTE(review): ISO 639-3 "meo" appears to be Kedah Malay (Meru is "mer"),
// which would match the Malay-family remark — confirm and fix the label.
ex.add("meo");
// Eastern Balochi (bgp): F1=0.351 at 20 chars. Heavy overlap with
// Arabic/Urdu/Persian at short lengths.
ex.add("bgp");
// Fiji Hindi (hif): F1=0.808. Persistent confusion with hin.
ex.add("hif");
// Manx (glv): previously excluded due to MADLAD English contamination.
// Wikipedia source retested.
// Old English (ang): F1=0.860. Causes 109 English sentences to be
// misclassified. Not a practical Tika detection target.
ex.add("ang");
// Crimean Tatar (crh): F1=0.727. Persistently confused with Turkish (tur).
ex.add("crh");
// Zazaki / Southern Zaza (zza): F1=0.712. Overlaps Turkish/Kurdish.
ex.add("zza");
// Bosnian (bos): near-random with hrv/srp at character level; F1 ~0.
ex.add("bos");
// Southern Sotho (sot): F1=0.659. Accuracy interference with tsn/nso.
ex.add("sot");
// --- Wikipedia-era drops (March 2026) ---
// Serbo-Croatian (hbs): F1@20=0.113, F1@500=0.849 — never separable from
// hrv/srp/bos at any useful length. 77.6% Pass-2 retention (worst in pool).
ex.add("hbs");
// Maithili (mai): F1@20=0.146, F1@50=0.490. Devanagari script identical to
// hin/nep/mar at short lengths; 73.5% Pass-2 retention signals corpus
// contamination. Collateral damage to Hindi makes inclusion net-negative.
ex.add("mai");
// Komi-Permyak (koi): F1@500=0.765 — permanently confused with Komi-Zyrian
// (kpv) at 22% even at 500 chars. Weaker of the pair; kpv is retained.
ex.add("koi");
// Haitian Creole (hat): formerly dropped for F1@20=0.155 / English confusion.
// Reinstated: length-gated confusables (hat->fra, threshold=400 ngrams) suppress
// hat at short text and fold its probability into French, eliminating the
// English bleed. hat is retained as a full class for longer text.
// (Historical: superseded by the gate-simplification drops below, where hat
// is excluded again.)
// Scots (sco): F1@20=0.258. #1 FP source into English AND #2 destination
// for English recall loss at @20 chars — hurts English in both directions.
// Scots Wikipedia was largely written by a non-Scots speaker modifying
// English articles; training signal is unreliable.
ex.add("sco");
// Quechua (que): F1@500=0.833. Never breaks 0.85 at any length; confusable
// with Aymara (aym) — the two Andean languages bleed into each other.
ex.add("que");
// Aymara (aym): F1@500=0.837. Same structural problem as que; mutually
// confusable Andean pair.
ex.add("aym");
// Picard (pcd): F1@500=0.805. French-adjacent Romance variety; never
// separable from French at any length.
ex.add("pcd");
// Gorontalo (gor): F1@500=0.801, F1@50=0.503. Persistently poor at every
// length — worst recovery rate of the post-Wikipedia drop candidates.
ex.add("gor");
// Cebuano (ceb): Wikipedia corpus was Lsjbot-generated municipality stubs
// (F1@500=0.999 Wikipedia vs 13.9% FLORES-200). Re-enabled with MADLAD data
// (sentences_madlad.txt), which contains genuine Cebuano prose.
// POST-TRAINING: verify tgl/fil confusion is acceptable.
// Zeelandic (zea): F1@500=0.827, below 0.85 threshold. Still bleeds 1.4%
// into Dutch (nld) on FLORES at full sentence length even with the gate.
// West Flemish (vls) and Low Saxon Dutch (nds-nl) are retained — both reach
// F1@500=0.925 and the gate handles short-text confusion adequately.
// (Historical for nds-nl: it is excluded by the gate-simplification drops below.)
ex.add("zea");
// --- v5 drops: bot-generated corpora identified by manual sampling ---
// Method: random-sample audit of Wikipedia corpus pool files, cross-referenced
// against Wikipedia/FLORES F1 gap analysis. Each language below had its training
// data inspected and found to be dominated by templated bot stubs.
// Egyptian Arabic (arz): corpus is overwhelmingly person/place stubs
// (Arabic-script template of the form "... [date] ... [year] ... [place]"; the
// literal example text was lost to encoding damage). FLORES F1=5.3% vs
// Wikipedia F1@500=99.9% — same signature as ceb.
ex.add("arz");
// Buginese (bug): ~95% French municipality stubs
// ("X iyanaritu séuwa komun ri déparetema Y ri Perancis"). Only 8k sentences.
ex.add("bug");
// Bishnupriya Manipuri (bpy): dominated by Brazilian municipality, US county,
// and Indian/Bangladeshi location stubs. 18k sentences, near-zero genuine prose.
ex.add("bpy");
// Malagasy (mlg): Wikipedia corpus dominated by French commune stubs.
// Re-enabled with MADLAD data (sentences_madlad.txt), which contains
// genuine Malagasy prose. POST-TRAINING: verify F1 on FLORES.
// Min Nan Chinese romanized (nan-x-rom): 6/8 random samples are geographic
// stubs (Romanian communes, Bolivian municipalities, German/Iranian villages).
// 455k sentences but overwhelmingly templated.
ex.add("nan-x-rom");
// Newari (new): Indian village stubs dominate
// (Devanagari template of the form "... X ... Y ... Z ..."; the literal example
// text was lost to encoding damage). 20k sentences.
ex.add("new");
// Ladin (lld): all sampled sentences are stubs (mountain ranges, municipalities,
// towns, incomplete game articles). 143k sentences.
ex.add("lld");
// Chechen (che): Wikipedia corpus was overwhelmingly Russian/Mexican village
// stubs. Re-enabled with MADLAD data (sentences_madlad.txt), which contains
// genuine Chechen prose. POST-TRAINING: verify Russian (rus) F1 not degraded.
// Navajo (nav): ~95% species distribution stubs following a template of the
// form "[animal] dah yikah... atah yisdzoh... [scientific name] ..." (the
// diacritics of the literal example were lost to encoding damage). 47k sentences.
ex.add("nav");
// --- gate-simplification drops (v5) ---
// The length-gating mechanism has been removed from CharSoupLanguageDetector.
// Languages that required gating to avoid severe parent-language bleed are
// dropped entirely. Languages with only minor short-text parent bleed (vls,
// ext, rue, bjn, nap) are retained and compete at all lengths.
// (nap was subsequently dropped in the v7 section below.)
// Haitian Creole (hat): was #1 false-positive source into English @20 chars
// without gating. Corpus is mixed (filmography stubs, place stubs, genuine
// prose), weakening the hat signal. Dropping rather than restoring the bleed.
ex.add("hat");
// Low Saxon Dutch (nds-nl): had the most complex dual-confusable behavior —
// permanently paired with nds (Low Saxon German) AND length-gated against nld
// (Dutch). nds-nl text routes to nds via the confusable pair. Simplified out.
ex.add("nds-nl");
// Banyumasan (map-bms): 5.7% bleed into ind and 3.5% into jav at FULL length
// (above any threshold). Gating only addressed short text; the full-length
// bleed remained. Not recoverable without a better corpus.
ex.add("map-bms");
// --- v7 drops: Italian-dialect cluster and Spanish/Catalan bleed (March 2026) ---
// FLORES analysis showed these languages score 0% on independent evaluation
// while actively harming major neighbors as false-positive sources.
// Italian dialects / close relatives (all harm Italian precision):
// Neapolitan (nap): 966/~2000 Wikipedia FP sentences absorbed as Italian.
// Previously retained; FLORES analysis confirmed net-negative inclusion.
ex.add("nap");
// Friulian (fur): 0% FLORES (411/997 predicted as Italian).
ex.add("fur");
// Ligurian (lij): 0% FLORES (308/997 predicted as Italian).
ex.add("lij");
// Lombard (lmo): 0% FLORES (predicted as cat/roh/cos/ita across all sentences).
ex.add("lmo");
// Sicilian (scn): 0% FLORES (988/997 absorbed by Corsican cos).
ex.add("scn");
// Sardinian (srd): 0% FLORES (absorbed by cos/tet/ron/por/spa).
ex.add("srd");
// Piedmontese (pms): not in FLORES; Wikipedia FP source for Italian.
ex.add("pms");
// Tarantino (roa-tara): not in FLORES; 386 Wikipedia FP sentences into Italian.
ex.add("roa-tara");
// Twi (twi): 987/997 FLORES sentences absorbed by Akan (aka). Twi is a dialect
// of Akan; having both is structurally unsound. aka is retained (66.4% FLORES F1).
ex.add("twi");
// Asturian (ast): 0% FLORES (865/997 predicted as Spanish). Dumps 865 FP
// sentences into Spanish, holding Spanish FLORES F1 at 65%.
ex.add("ast");
// Occitan (oci): 0% FLORES (819/997 predicted as Catalan). Dumps 819 FP
// sentences into Catalan, holding Catalan FLORES F1 at 61.8%.
ex.add("oci");
// Publish as read-only so the policy cannot be mutated after class initialization.
EXCLUDED_LANGS = Collections.unmodifiableSet(ex);
}
/**
* ISO 639-3 aliases to merge into a canonical code before processing.
* Sentences from the alias are relabeled to the canonical code and pooled
* together before dedup and splitting.
*
* <p>Keys are alias directory names, values are the canonical codes they are
* folded into. The language names in the per-entry comments are taken from
* ISO 639-3 and are not established by this file — verify before relying on them.
*/
static final Map<String, String> LANG_MERGE_MAP;
static {
Map<String, String> m = new HashMap<>();
m.put("azj", "aze"); // North Azerbaijani -> Azerbaijani
m.put("ekk", "est"); // Standard Estonian -> Estonian
m.put("pes", "fas"); // Iranian Persian -> Persian
m.put("zsm", "msa"); // Standard Malay -> Malay
m.put("nor", "nob"); // Norwegian -> Norwegian Bokmal
m.put("plt", "mlg"); // Plateau Malagasy -> Malagasy
m.put("cmn", "zho"); // Mandarin Chinese -> Chinese
m.put("lvs", "lav"); // Standard Latvian -> Latvian
m.put("gug", "grn"); // Paraguayan Guarani -> Guarani
m.put("quz", "que"); // Cusco Quechua -> Quechua; note "que" is in EXCLUDED_LANGS
m.put("swa", "swh"); // Swahili (macrolanguage) -> Swahili (individual)
m.put("yid", "ydd"); // Yiddish -> Eastern Yiddish
// Publish as read-only so the alias table cannot be mutated after class initialization.
LANG_MERGE_MAP = Collections.unmodifiableMap(m);
}
/**
* Minimum fraction of letter characters that must belong to the expected
* script for a sentence to be retained. Sentences below this threshold
* are considered contaminated and dropped from all splits (pool, dev, test).
*
* <p>The comparison in {@link #filterByScriptConsistency} is inclusive: a
* sentence whose fraction equals the threshold is kept.
*/
static final double SCRIPT_CONSISTENCY_THRESHOLD = 0.80;
/**
* Per-language overrides for {@link #SCRIPT_CONSISTENCY_THRESHOLD}.
* Use when a language naturally mixes its primary script with another
* (e.g. Dhivehi mixes Thaana with Arabic in Islamic text).
*
* <p>Keyed by canonical language code; languages without an entry use the
* default threshold.
*/
static final Map<String, Double> SCRIPT_CONSISTENCY_THRESHOLD_OVERRIDES;
static {
Map<String, Double> m = new HashMap<>();
// Dhivehi (div): Maldivian text naturally mixes Thaana with Arabic
// script for Islamic phrases. 0.80 cuts too aggressively; 0.50 retains
// genuine Thaana-dominant prose while allowing Arabic admixture.
m.put("div", 0.50);
SCRIPT_CONSISTENCY_THRESHOLD_OVERRIDES = Collections.unmodifiableMap(m);
}
/**
* Max fraction of Latin letters allowed for languages in {@link #MAX_LATIN_RATIO_LANGS}.
* The bound is inclusive: a sentence whose Latin fraction equals this value is kept.
*/
static final double MAX_LATIN_RATIO = 0.50;
/**
* Languages that use non-Latin scripts but legitimately mix some Latin.
* Sentences where Latin letters exceed {@link #MAX_LATIN_RATIO} are dropped.
* Applied by {@link #filterByLatinRatio}; all other languages pass through
* that filter unchanged.
*/
private static final Set<String> MAX_LATIN_RATIO_LANGS =
Collections.unmodifiableSet(new HashSet<>(Arrays.asList("jpn", "kor")));
/**
* Maps language codes to the set of Unicode scripts expected for that
* language. Sentences from these languages are filtered by
* {@link #filterByScriptConsistency} to remove Latin/foreign-script contamination
* before training.
*
* <p>Note: {@code jpn} and {@code kor} are intentionally excluded from this map;
* they use {@link #MAX_LATIN_RATIO_LANGS} instead to drop Latin-dominant sentences
* while retaining naturally mixed ones.
* Japanese commonly mixes Latin characters (brand names, loanwords,
* abbreviations) and a strict CJK-consistency filter would destroy nearly half
* the training data. Korean has a similar, though less extreme, mixing pattern.
* These two languages are better handled by the volume of clean training data
* than by a script consistency filter.
*
* <p>Some codes listed here ({@code mai}, {@code bod}, {@code dzo}) are also in
* {@link #EXCLUDED_LANGS}; {@code prepareData} checks the exclusion list before
* filtering, so those entries currently have no effect.
*/
// NOTE(review): no unchecked operation is visible in this declaration or its
// initializer, so this suppression looks unnecessary — confirm and remove.
@SuppressWarnings("unchecked")
private static final Map<String, Set<Character.UnicodeScript>> SCRIPT_CONSISTENCY_LANGS;
static {
Map<String, Set<Character.UnicodeScript>> m = new HashMap<>();
// CJK — all Han-script Chinese variants
for (String lang : new String[]{"zho", "yue", "wuu", "gan", "lzh"}) {
m.put(lang, EnumSet.of(Character.UnicodeScript.HAN));
}
// nan-x-rom (Min Nan) uses romanized Pe̍h-ōe-jī in Wikipedia — no HAN filter
// cdo-x-rom (Eastern Min) uses romanized Foochow Romanized — no HAN filter
// hak-x-rom (Hakka) uses romanized Pha̍k-fa-sṳ — no HAN filter
// Arabic script
for (String lang : new String[]{"ara", "fas", "urd", "pus", "ckb", "uig", "snd"}) {
m.put(lang, EnumSet.of(Character.UnicodeScript.ARABIC));
}
// Cyrillic script
for (String lang : new String[]{
"rus", "ukr", "bul", "bel", "mkd", "srp", "bak", "tat", "sah",
"chv", "bua", "kir", "myv", "mdf", "krc", "ava", "che", "oss",
"kom", "udm", "kjh", "kum", "mrj", "chm", "inh", "kbd", "mon"}) {
m.put(lang, EnumSet.of(Character.UnicodeScript.CYRILLIC));
}
// Devanagari script
for (String lang : new String[]{"hin", "mar", "nep", "san", "bho", "mai"}) {
m.put(lang, EnumSet.of(Character.UnicodeScript.DEVANAGARI));
}
// Indic scripts
m.put("pan", EnumSet.of(Character.UnicodeScript.GURMUKHI));
m.put("ben", EnumSet.of(Character.UnicodeScript.BENGALI));
m.put("asm", EnumSet.of(Character.UnicodeScript.BENGALI));
m.put("tel", EnumSet.of(Character.UnicodeScript.TELUGU));
m.put("kan", EnumSet.of(Character.UnicodeScript.KANNADA));
m.put("mal", EnumSet.of(Character.UnicodeScript.MALAYALAM));
m.put("sin", EnumSet.of(Character.UnicodeScript.SINHALA));
m.put("tam", EnumSet.of(Character.UnicodeScript.TAMIL));
m.put("guj", EnumSet.of(Character.UnicodeScript.GUJARATI));
m.put("ori", EnumSet.of(Character.UnicodeScript.ORIYA));
m.put("sat", EnumSet.of(Character.UnicodeScript.OL_CHIKI));
m.put("mni", EnumSet.of(Character.UnicodeScript.MEETEI_MAYEK));
// Other distinct scripts
m.put("kat", EnumSet.of(Character.UnicodeScript.GEORGIAN));
m.put("hye", EnumSet.of(Character.UnicodeScript.ARMENIAN));
m.put("ell", EnumSet.of(Character.UnicodeScript.GREEK));
m.put("heb", EnumSet.of(Character.UnicodeScript.HEBREW));
m.put("ydd", EnumSet.of(Character.UnicodeScript.HEBREW));
m.put("tha", EnumSet.of(Character.UnicodeScript.THAI));
m.put("khm", EnumSet.of(Character.UnicodeScript.KHMER));
m.put("mya", EnumSet.of(Character.UnicodeScript.MYANMAR));
m.put("ksw", EnumSet.of(Character.UnicodeScript.MYANMAR)); // S'gaw Karen
m.put("amh", EnumSet.of(Character.UnicodeScript.ETHIOPIC));
m.put("tir", EnumSet.of(Character.UnicodeScript.ETHIOPIC));
m.put("lao", EnumSet.of(Character.UnicodeScript.LAO));
m.put("iku", EnumSet.of(Character.UnicodeScript.CANADIAN_ABORIGINAL));
// Myanmar script: Burmese, S'gaw Karen, Shan, Mon, Pa'O Karen
// (Burmese and S'gaw Karen are registered above as mya/ksw.)
for (String lang : new String[]{"shn", "mnw", "blk"}) {
m.put(lang, EnumSet.of(Character.UnicodeScript.MYANMAR));
}
m.put("div", EnumSet.of(Character.UnicodeScript.THAANA));
m.put("chr", EnumSet.of(Character.UnicodeScript.CHEROKEE));
m.put("nqo", EnumSet.of(Character.UnicodeScript.NKO));
m.put("bod", EnumSet.of(Character.UnicodeScript.TIBETAN));
m.put("dzo", EnumSet.of(Character.UnicodeScript.TIBETAN));
// Mingrelian uses Georgian script
m.put("xmf", EnumSet.of(Character.UnicodeScript.GEORGIAN));
// The map itself is read-only; the per-language EnumSet values remain mutable
// but are private to this class.
SCRIPT_CONSISTENCY_LANGS = Collections.unmodifiableMap(m);
}
/**
 * Command-line entry point: parses the options, runs {@code prepareData} and
 * prints a short summary of the produced splits.
 *
 * <p>Recognized options: {@code --corpus <dir>} and {@code --output-dir <dir>}
 * (both required), {@code --max-train N} (0 = unlimited), {@code --max-dev N},
 * {@code --max-test N}, {@code --no-script-filter} and {@code --unk-per-lang N}.
 * An unknown option, an option missing its value, or a non-integer value for a
 * numeric option prints a message plus the usage text to standard error and
 * exits with status 1.
 *
 * @param args command-line arguments
 * @throws IOException if the output directory cannot be created or data
 *                     preparation fails
 */
public static void main(String[] args) throws IOException {
    Path corpusDir = null;
    Path outputDir = null;
    int maxTrainPerLang = 0;
    int maxDevPerLang = DEFAULT_MAX_DEV_PER_LANG;
    int maxTestPerLang = DEFAULT_MAX_TEST_PER_LANG;
    boolean noScriptFilter = false;
    int unkPerLang = 0;
    for (int i = 0; i < args.length; i++) {
        String option = args[i];
        switch (option) {
            case "--corpus":
                corpusDir = Paths.get(requireOptionValue(args, ++i, option));
                break;
            case "--output-dir":
                outputDir = Paths.get(requireOptionValue(args, ++i, option));
                break;
            case "--max-train":
                maxTrainPerLang = parseIntOption(option,
                        requireOptionValue(args, ++i, option));
                break;
            case "--max-dev":
                maxDevPerLang = parseIntOption(option,
                        requireOptionValue(args, ++i, option));
                break;
            case "--max-test":
                maxTestPerLang = parseIntOption(option,
                        requireOptionValue(args, ++i, option));
                break;
            case "--no-script-filter":
                noScriptFilter = true;
                break;
            case "--unk-per-lang":
                unkPerLang = parseIntOption(option,
                        requireOptionValue(args, ++i, option));
                break;
            default:
                System.err.println("Unknown argument: " + option);
                printUsage();
                System.exit(1);
        }
    }
    if (corpusDir == null || outputDir == null) {
        printUsage();
        System.exit(1);
    }
    Files.createDirectories(outputDir);
    System.out.println("=== PrepareCorpus ===");
    System.out.println("Corpus : " + corpusDir);
    System.out.println("Output dir: " + outputDir);
    System.out.println("Script filter: " + (noScriptFilter ? "disabled" : "enabled"));
    System.out.println();
    long start = System.nanoTime();
    int[] counts = prepareData(corpusDir, outputDir,
            maxTrainPerLang, maxDevPerLang, maxTestPerLang,
            noScriptFilter, unkPerLang);
    double elapsed = (System.nanoTime() - start) / 1_000_000_000.0;
    System.out.printf(Locale.US,
            "%nDone: pool=%,d dev=%,d test=%,d [%.1f s]%n",
            counts[0], counts[1], counts[2], elapsed);
    System.out.println("Pool dir : " + outputDir.resolve("pool"));
    System.out.println("Dev file : " + outputDir.resolve("dev.txt"));
    System.out.println("Test file: " + outputDir.resolve("test_raw.txt"));
}

/**
 * Returns {@code args[valueIndex]}, or reports a missing option value and
 * exits with status 1 when the option was the last argument.
 */
private static String requireOptionValue(String[] args, int valueIndex, String option) {
    if (valueIndex >= args.length) {
        System.err.println("Missing value for " + option);
        printUsage();
        System.exit(1);
    }
    return args[valueIndex];
}

/**
 * Parses {@code value} as an int, or reports the malformed value and exits
 * with status 1.
 */
private static int parseIntOption(String option, String value) {
    try {
        return Integer.parseInt(value);
    } catch (NumberFormatException e) {
        System.err.println("Invalid integer for " + option + ": " + value);
        printUsage();
        System.exit(1);
        return 0; // not reached: System.exit does not return
    }
}
/** Writes the one-line command synopsis to standard error. */
private static void printUsage() {
    String synopsis = String.join("",
            "Usage: PrepareCorpus",
            " --corpus <dir> --output-dir <dir>",
            " [--max-train N] [--max-dev N] [--max-test N]",
            " [--no-script-filter] [--unk-per-lang N]");
    System.err.println(synopsis);
}
// ================================================================
// Core preparation logic — called by TrainLanguageModel as well
// ================================================================
/**
 * Builds the pool/dev/test splits from a raw corpus directory with the script
 * filters enabled and no {@code unk} class.
 *
 * <p>Convenience overload that forwards to the seven-argument variant with
 * {@code noScriptFilter = false} and {@code unkPerLang = 0}.
 *
 * @param corpusDir       root directory; each subdirectory is a language code
 * @param prepDir         output directory; receives {@code pool/}, {@code dev.txt},
 *                        {@code test_raw.txt}
 * @param maxTrainPerLang cap on sentences per language (0 = unlimited)
 * @param maxDevPerLang   cap on dev sentences per language
 * @param maxTestPerLang  cap on test sentences per language
 * @return int[3]: {poolCount, devCount, testCount}
 * @throws IOException if reading the corpus or writing a split fails
 */
static int[] prepareData(Path corpusDir, Path prepDir,
        int maxTrainPerLang,
        int maxDevPerLang, int maxTestPerLang)
        throws IOException {
    final boolean scriptFilterDisabled = false;
    final int noUnkClass = 0;
    return prepareData(corpusDir, prepDir,
            maxTrainPerLang, maxDevPerLang, maxTestPerLang,
            scriptFilterDisabled, noUnkClass);
}
/**
 * Builds the pool/dev/test splits with an explicit script-filter switch and
 * no {@code unk} class.
 *
 * <p>Convenience overload that forwards to the seven-argument variant with
 * {@code unkPerLang = 0}.
 *
 * @param corpusDir       root directory; each subdirectory is a language code
 * @param prepDir         output directory for the splits
 * @param maxTrainPerLang cap on sentences per language (0 = unlimited)
 * @param maxDevPerLang   cap on dev sentences per language
 * @param maxTestPerLang  cap on test sentences per language
 * @param noScriptFilter  {@code true} to skip the script-consistency and
 *                        Latin-ratio filters
 * @return int[3]: {poolCount, devCount, testCount}
 * @throws IOException if reading the corpus or writing a split fails
 */
static int[] prepareData(Path corpusDir, Path prepDir,
        int maxTrainPerLang,
        int maxDevPerLang, int maxTestPerLang,
        boolean noScriptFilter)
        throws IOException {
    final int noUnkClass = 0;
    int[] counts = prepareData(corpusDir, prepDir,
            maxTrainPerLang, maxDevPerLang, maxTestPerLang,
            noScriptFilter, noUnkClass);
    return counts;
}
/**
 * Builds the pool/dev/test splits from a raw corpus directory.
 *
 * <p>Processing steps:
 * <ol>
 * <li>Subdirectories of {@code corpusDir} are listed; names starting with
 *     {@code _} are skipped. Directories whose name is a key of
 *     {@link #LANG_MERGE_MAP} are treated as aliases.</li>
 * <li>Alias directories are read first, deduplicated, relabeled to their
 *     canonical code and held in memory.</li>
 * <li>Each canonical language is then handled exactly once, in sorted order:
 *     its own directory (if any) is read and deduplicated, alias sentences are
 *     merged in and the union is deduplicated again. Canonical codes that only
 *     exist through aliases are handled last and truncated to
 *     {@code maxTrainPerLang} when that cap is positive.</li>
 * <li>Languages in {@link #EXCLUDED_LANGS}, or with fewer sentences than the
 *     script-dependent minimum after filtering, are dropped; when
 *     {@code unkPerLang > 0} up to that many of their sentences feed the
 *     {@code unk} class.</li>
 * <li>Surviving languages are split by {@link #writeLanguageSplit}; finally
 *     the {@code unk} class, if any, is split the same way.</li>
 * </ol>
 *
 * <p>Handling every canonical language once matters because
 * {@code writeLanguageSplit} creates the pool file afresh: splitting the same
 * code twice (canonical directory first, alias directory later) would replace
 * the pool file and add a second batch to dev/test.
 *
 * @param corpusDir       root directory; each subdirectory is a language code
 * @param prepDir         output directory; receives {@code pool/}, {@code dev.txt},
 *                        {@code test_raw.txt}
 * @param maxTrainPerLang cap on sentences read per language directory (0 = unlimited);
 *                        a positive value also caps the minimum-size thresholds at
 *                        {@code maxTrainPerLang / 2}
 * @param maxDevPerLang   cap on dev sentences per language
 * @param maxTestPerLang  cap on test sentences per language
 * @param noScriptFilter  {@code true} to skip the script-consistency and
 *                        Latin-ratio filters
 * @param unkPerLang      maximum sentences taken from each dropped language for
 *                        the {@code unk} class (0 = no {@code unk} class)
 * @return int[3]: {poolCount, devCount, testCount}
 * @throws IOException if reading the corpus or writing a split fails
 */
static int[] prepareData(Path corpusDir, Path prepDir,
        int maxTrainPerLang,
        int maxDevPerLang, int maxTestPerLang,
        boolean noScriptFilter, int unkPerLang)
        throws IOException {
    Path poolDir = prepDir.resolve("pool");
    Path devFile = prepDir.resolve("dev.txt");
    Path testFile = prepDir.resolve("test_raw.txt");
    int effectiveUniqueMin = maxTrainPerLang > 0
            ? Math.min(MIN_SENTENCES_PER_LANG, maxTrainPerLang / 2)
            : MIN_SENTENCES_PER_LANG;
    int effectiveLatinMin = maxTrainPerLang > 0
            ? Math.min(LATIN_MIN_SENTENCES_PER_LANG, maxTrainPerLang / 2)
            : LATIN_MIN_SENTENCES_PER_LANG;
    Files.createDirectories(poolDir);

    int totalPool = 0;
    int totalDev = 0;
    int totalTest = 0;
    long[] dupeTotal = {0L}; // single-element holder so the read helper can add to it
    Map<String, Integer> langCounts = new TreeMap<>();
    List<String> dropped = new ArrayList<>();
    // Sentences from dropped languages, relabeled "unk". Alias directories never
    // contribute under their own name; their text arrives merged under the
    // canonical code.
    List<LabeledSentence> unkAccum = new ArrayList<>();

    // Partition the language directories into aliases and canonical codes,
    // each sorted by directory name.
    Map<String, Path> canonicalDirs = new TreeMap<>();
    Map<String, Path> aliasDirs = new TreeMap<>();
    try (DirectoryStream<Path> dirs =
                 Files.newDirectoryStream(corpusDir, Files::isDirectory)) {
        for (Path d : dirs) {
            String dirName = d.getFileName().toString();
            if (dirName.startsWith("_")) {
                continue;
            }
            String canonLang = LANG_MERGE_MAP.getOrDefault(dirName, dirName);
            if (canonLang.equals(dirName)) {
                canonicalDirs.put(dirName, d);
            } else {
                aliasDirs.put(dirName, d);
            }
        }
    }

    // Pass 1: read every alias up front so its sentences are available no matter
    // where the canonical directory falls in the sort order.
    Map<String, List<LabeledSentence>> mergeAccum = new TreeMap<>();
    for (Map.Entry<String, Path> alias : aliasDirs.entrySet()) {
        String aliasName = alias.getKey();
        String canonLang = LANG_MERGE_MAP.get(aliasName);
        List<LabeledSentence> aliasSentences = readDeduped(
                alias.getValue(), aliasName, maxTrainPerLang, dupeTotal);
        List<LabeledSentence> relabeled = new ArrayList<>(aliasSentences.size());
        for (LabeledSentence s : aliasSentences) {
            relabeled.add(new LabeledSentence(canonLang, s.getText()));
        }
        System.out.printf(Locale.US,
                " %s -> %s (%,d sentences)%n",
                aliasName, canonLang, relabeled.size());
        mergeAccum.computeIfAbsent(canonLang, k -> new ArrayList<>())
                .addAll(relabeled);
    }

    // Canonical directories first (sorted), then canonical codes that exist
    // only through aliases.
    List<String> processingOrder = new ArrayList<>(canonicalDirs.keySet());
    for (String canonLang : mergeAccum.keySet()) {
        if (!canonicalDirs.containsKey(canonLang)) {
            processingOrder.add(canonLang);
        }
    }

    try (BufferedWriter devWriter = Files.newBufferedWriter(
                 devFile, StandardCharsets.UTF_8);
         BufferedWriter testWriter = Files.newBufferedWriter(
                 testFile, StandardCharsets.UTF_8)) {
        // Pass 2: each canonical language is filtered and split exactly once.
        for (String lang : processingOrder) {
            Path langDir = canonicalDirs.get(lang);
            List<LabeledSentence> accumulated = mergeAccum.remove(lang);
            List<LabeledSentence> sentences;
            if (langDir != null) {
                sentences = readDeduped(langDir, lang, maxTrainPerLang, dupeTotal);
                if (accumulated != null) {
                    sentences.addAll(accumulated);
                    sentences = dedup(sentences);
                }
            } else {
                // Alias-only language: the merged alias data is the whole corpus.
                sentences = dedup(accumulated);
                if (maxTrainPerLang > 0 && sentences.size() > maxTrainPerLang) {
                    sentences = sentences.subList(0, maxTrainPerLang);
                }
            }

            if (EXCLUDED_LANGS.contains(lang)) {
                dropped.add(lang + "(excluded)");
                if (unkPerLang > 0) {
                    sampleIntoUnk(sentences, unkPerLang, unkAccum);
                }
                continue;
            }
            if (!noScriptFilter) {
                sentences = filterByScriptConsistency(sentences, lang);
                sentences = filterByLatinRatio(sentences, lang);
            }
            int minRequired = isLatinScript(sentences)
                    ? effectiveLatinMin : effectiveUniqueMin;
            if (sentences.size() < minRequired) {
                dropped.add(lang + "(" + sentences.size() + ")");
                if (unkPerLang > 0) {
                    sampleIntoUnk(sentences, unkPerLang, unkAccum);
                }
                continue;
            }

            int[] written = writeLanguageSplit(
                    sentences, lang, poolDir,
                    devWriter, testWriter,
                    maxDevPerLang, maxTestPerLang);
            totalPool += written[0];
            totalDev += written[1];
            totalTest += written[2];
            langCounts.put(lang, sentences.size());
            if (langCounts.size() % 50 == 0) {
                System.out.printf(Locale.US,
                        " Processed %d languages...%n", langCounts.size());
            }
        }

        // Write the unk class last, after all real languages, if requested.
        if (unkPerLang > 0 && !unkAccum.isEmpty()) {
            Collections.shuffle(unkAccum, new Random(42));
            int[] written = writeLanguageSplit(
                    unkAccum, "unk", poolDir,
                    devWriter, testWriter,
                    maxDevPerLang, maxTestPerLang);
            totalPool += written[0];
            totalDev += written[1];
            totalTest += written[2];
            System.out.printf(Locale.US,
                    "unk class: %,d pool + %,d dev + %,d test "
                            + "(from %,d source sentences across excluded langs)%n",
                    written[0], written[1], written[2], unkAccum.size());
        }
    }

    if (dupeTotal[0] > 0) {
        System.out.printf(Locale.US,
                "Deduplicated: removed %,d duplicate sentences%n",
                dupeTotal[0]);
    }
    if (!dropped.isEmpty()) {
        dropped.sort(String::compareTo);
        System.out.println("Dropped " + dropped.size()
                + " low-resource languages: "
                + String.join(", ", dropped));
    }
    System.out.println("Languages included: " + langCounts.size());
    langCounts.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue()
                    .reversed())
            .limit(20)
            .forEach(e -> System.out.printf(Locale.US,
                    " %-12s %,d%n", e.getKey(), e.getValue()));
    if (langCounts.size() > 20) {
        System.out.println(" ... and " + (langCounts.size() - 20) + " more");
    }
    return new int[]{totalPool, totalDev, totalTest};
}

/**
 * Reads one language directory (sampled when {@code maxTrainPerLang > 0}),
 * removes duplicate sentences, adds the number removed to {@code dupeTotal[0]}
 * and logs directories that lose more than a fifth of their sentences.
 */
private static List<LabeledSentence> readDeduped(Path langDir, String dirName,
        int maxTrainPerLang, long[] dupeTotal) throws IOException {
    List<LabeledSentence> raw = new ArrayList<>();
    if (maxTrainPerLang > 0) {
        CorpusReader.readLanguageDirSampled(
                langDir, dirName, maxTrainPerLang, raw);
    } else {
        CorpusReader.readLanguageDir(langDir, dirName, raw);
    }
    int beforeDedup = raw.size();
    List<LabeledSentence> unique = dedup(raw);
    int removed = beforeDedup - unique.size();
    if (removed > 0) {
        dupeTotal[0] += removed;
        if (removed > beforeDedup / 5) {
            System.out.printf(Locale.US,
                    " %s: removed %,d/%,d dupes (%.1f%%)%n",
                    dirName, removed, beforeDedup,
                    100.0 * removed / beforeDedup);
        }
    }
    return unique;
}
// ================================================================
// Helpers
// ================================================================
/**
 * Copies at most {@code maxPerLang} sentences from the front of {@code source}
 * into {@code unkAccum}, relabeling each copy as {@code "unk"}.
 *
 * <p>Selection is positional (list order), not random. A non-positive
 * {@code maxPerLang} or an empty {@code source} adds nothing.
 */
private static void sampleIntoUnk(List<LabeledSentence> source,
        int maxPerLang,
        List<LabeledSentence> unkAccum) {
    int quota = Math.min(source.size(), maxPerLang);
    if (quota <= 0) {
        return;
    }
    for (LabeledSentence picked : source.subList(0, quota)) {
        String text = picked.getText();
        unkAccum.add(new LabeledSentence("unk", text));
    }
}
/**
 * Split one language's sentences into test (raw), dev (preprocessed),
 * and pool (preprocessed, per-language file).
 *
 * <ul>
 * <li>Test : 10% of all sentences, max {@code maxTestPerLang}, raw text</li>
 * <li>Dev : one ninth of what remains after the test split (10% of the total
 *     when the test cap did not bind), max {@code maxDevPerLang}, preprocessed</li>
 * <li>Pool : remainder, preprocessed, written to {@code poolDir/<lang>}</li>
 * </ul>
 *
 * <p>{@code sentences} is shuffled in place with a seed derived from
 * {@code lang}, so the split is reproducible for a given input order. Test and
 * dev lines are written as {@code lang<TAB>text}; pool lines contain only the
 * text. The pool file is created afresh on every call, so a language must be
 * split only once per run. Negative caps are treated as zero.
 *
 * @param sentences      sentences of one language; reordered by this call
 * @param lang           label written to dev/test and used as the pool file name
 * @param poolDir        directory receiving the per-language pool file
 * @param devWriter      open writer for the shared dev file
 * @param testWriter     open writer for the shared test file
 * @param maxDevPerLang  cap on dev sentences
 * @param maxTestPerLang cap on test sentences
 * @return int[3]: {pool, dev, test} sentence counts
 * @throws IOException if writing any of the three outputs fails
 */
static int[] writeLanguageSplit(
        List<LabeledSentence> sentences, String lang,
        Path poolDir,
        BufferedWriter devWriter, BufferedWriter testWriter,
        int maxDevPerLang, int maxTestPerLang)
        throws IOException {
    Random rng = new Random(lang.hashCode() + 42L);
    Collections.shuffle(sentences, rng);
    int total = sentences.size();
    // Clamp at zero: a negative cap would otherwise give a negative count and an
    // out-of-range index in the loops below.
    int testCount = Math.max(0, Math.min(
            (int) (total * 0.1f), maxTestPerLang));
    int devCount = Math.max(0, Math.min(
            (int) ((total - testCount) * 0.1f / 0.9f),
            maxDevPerLang));
    int poolStart = testCount + devCount;

    // Test split keeps the raw text.
    for (int i = 0; i < testCount; i++) {
        testWriter.write(lang);
        testWriter.write('\t');
        testWriter.write(sentences.get(i).getText());
        testWriter.newLine();
    }
    // Dev split is preprocessed.
    for (int i = testCount; i < poolStart; i++) {
        String cleaned = CharSoupFeatureExtractor.preprocess(
                sentences.get(i).getText());
        devWriter.write(lang);
        devWriter.write('\t');
        devWriter.write(cleaned);
        devWriter.newLine();
    }
    // Everything else goes to the per-language pool file, one sentence per line.
    Path poolFile = poolDir.resolve(lang);
    try (BufferedWriter pw = Files.newBufferedWriter(
            poolFile, StandardCharsets.UTF_8)) {
        for (int i = poolStart; i < total; i++) {
            String cleaned = CharSoupFeatureExtractor.preprocess(
                    sentences.get(i).getText());
            pw.write(cleaned);
            pw.newLine();
        }
    }
    return new int[]{total - poolStart, devCount, testCount};
}
/**
 * Returns {@code true} if the language's sample text is predominantly
 * Latin-script (Basic Latin + Latin Extended-A/B, U+0000–U+024F).
 *
 * <p>Inspects the first 20 sentences at most. A letter counts as Latin when
 * its code point is at or below U+024F; the result is {@code true} when more
 * than half of all letters are Latin. Defaults to {@code true} when no letter
 * content is found (including an empty list).
 *
 * <p>Text is scanned by code point rather than by UTF-16 {@code char}, so
 * letters outside the Basic Multilingual Plane are counted as non-Latin
 * letters instead of being skipped as unpaired surrogates.
 *
 * @param sentences sentences of a single language
 * @return whether the sampled letters are mostly in the Latin range
 */
static boolean isLatinScript(List<LabeledSentence> sentences) {
    int latinLetters = 0;
    int totalLetters = 0;
    int limit = Math.min(sentences.size(), 20);
    for (int s = 0; s < limit; s++) {
        String text = sentences.get(s).getText();
        for (int i = 0; i < text.length(); ) {
            int cp = text.codePointAt(i);
            i += Character.charCount(cp);
            if (!Character.isLetter(cp)) {
                continue;
            }
            totalLetters++;
            if (cp <= 0x024F) {
                latinLetters++;
            }
        }
    }
    return totalLetters == 0
            || (double) latinLetters / totalLetters > 0.5;
}
/**
 * Keeps only the sentences whose share of letters in the language's expected
 * script(s) reaches the applicable threshold: the per-language entry in
 * {@link #SCRIPT_CONSISTENCY_THRESHOLD_OVERRIDES} if present, otherwise
 * {@link #SCRIPT_CONSISTENCY_THRESHOLD}.
 *
 * <p>Languages without an entry in {@link #SCRIPT_CONSISTENCY_LANGS} are
 * returned unchanged (the same list instance). Sentences containing no
 * letters at all are kept. When anything is removed, a one-line summary is
 * printed to standard output.
 *
 * <p>The filter inspects the sentence text exactly as stored, code point by
 * code point, and is meant to run before the train/dev/test split so that all
 * three splits see only clean sentences.
 *
 * @param sentences sentences of one language
 * @param lang      canonical language code used to look up scripts and threshold
 * @return the input list if the language is not script-checked, otherwise a
 *         new list with the retained sentences in their original order
 */
static List<LabeledSentence> filterByScriptConsistency(
        List<LabeledSentence> sentences, String lang) {
    Set<Character.UnicodeScript> allowedScripts = SCRIPT_CONSISTENCY_LANGS.get(lang);
    if (allowedScripts == null) {
        return sentences;
    }
    double minShare = SCRIPT_CONSISTENCY_THRESHOLD_OVERRIDES.getOrDefault(
            lang, SCRIPT_CONSISTENCY_THRESHOLD);
    List<LabeledSentence> kept = new ArrayList<>(sentences.size());
    for (LabeledSentence candidate : sentences) {
        int letterCount = 0;
        int inScriptCount = 0;
        for (int cp : candidate.getText().codePoints().toArray()) {
            if (!Character.isLetter(cp)) {
                continue;
            }
            letterCount++;
            if (allowedScripts.contains(Character.UnicodeScript.of(cp))) {
                inScriptCount++;
            }
        }
        double share;
        if (letterCount == 0) {
            share = 1.0;
        } else {
            share = (double) inScriptCount / letterCount;
        }
        if (share >= minShare) {
            kept.add(candidate);
        }
    }
    int removed = sentences.size() - kept.size();
    if (removed > 0) {
        System.out.printf(Locale.US,
                " %s: script-consistency filter removed %,d/%,d sentences (%.1f%%)%n",
                lang, removed, sentences.size(),
                100.0 * removed / sentences.size());
    }
    return kept;
}
/**
 * Removes Latin-dominant sentences from languages listed in
 * {@link #MAX_LATIN_RATIO_LANGS} (currently {@code jpn} and {@code kor}).
 *
 * <p>A sentence is kept when the fraction of its letter code points that
 * belong to the LATIN Unicode script is at most {@link #MAX_LATIN_RATIO};
 * sentences without any letters are kept. Other languages are returned
 * unchanged (the same list instance). When anything is removed, a one-line
 * summary is printed to standard output.
 *
 * @param sentences sentences of one language
 * @param lang      canonical language code
 * @return the input list if the language is not ratio-checked, otherwise a
 *         new list with the retained sentences in their original order
 */
static List<LabeledSentence> filterByLatinRatio(
        List<LabeledSentence> sentences, String lang) {
    boolean ratioChecked = MAX_LATIN_RATIO_LANGS.contains(lang);
    if (!ratioChecked) {
        return sentences;
    }
    List<LabeledSentence> kept = new ArrayList<>(sentences.size());
    for (LabeledSentence candidate : sentences) {
        int letterCount = 0;
        int latinCount = 0;
        for (int cp : candidate.getText().codePoints().toArray()) {
            if (!Character.isLetter(cp)) {
                continue;
            }
            letterCount++;
            if (Character.UnicodeScript.of(cp) == Character.UnicodeScript.LATIN) {
                latinCount++;
            }
        }
        double latinShare;
        if (letterCount == 0) {
            latinShare = 0.0;
        } else {
            latinShare = (double) latinCount / letterCount;
        }
        if (latinShare <= MAX_LATIN_RATIO) {
            kept.add(candidate);
        }
    }
    int removed = sentences.size() - kept.size();
    if (removed > 0) {
        System.out.printf(Locale.US,
                " %s: latin-ratio filter removed %,d/%,d sentences (%.1f%%)%n",
                lang, removed, sentences.size(),
                100.0 * removed / sentences.size());
    }
    return kept;
}
/**
 * Returns the sentences with repeated texts removed, keeping the first
 * occurrence of each and preserving input order.
 *
 * <p>Two sentences count as duplicates when
 * {@code DuplicateChecker.fnv1a64} yields the same value for their text
 * (documented in this file as a 64-bit FNV-1a hash); the labels are not
 * compared. The input list is not modified.
 *
 * @param sentences sentences to deduplicate
 * @return a new mutable list of the unique sentences
 */
static List<LabeledSentence> dedup(List<LabeledSentence> sentences) {
    Set<Long> seenHashes = new HashSet<>();
    List<LabeledSentence> firstOccurrences = new ArrayList<>();
    for (LabeledSentence candidate : sentences) {
        long textHash = DuplicateChecker.fnv1a64(candidate.getText());
        boolean alreadySeen = !seenHashes.add(textHash);
        if (alreadySeen) {
            continue;
        }
        firstOccurrences.add(candidate);
    }
    return firstOccurrences;
}
}