CorpusReader.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup.tools;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Reads MADLAD-format sentence files from a directory tree.
 * <p>
 * Expected structure:
 * <pre>
 * corpusDir/
 * eng/
 * sentences.txt (lineNum\tdocument)
 * deu/
 * sentences.txt
 * ...
 * </pre>
 * <p>
 * The directory name is used as the ISO 639-3 language code.
 * Each sentence file has tab-delimited lines: {@code lineNumber\tdocument text},
 * where the document holds sentences separated by the literal two-character
 * sequence {@code \n} (backslash + n).
 * Files ending in {@code .txt} under each language directory are read.
 */
public class CorpusReader {

    /**
     * Minimum length, in characters after trimming, for a sentence fragment
     * to be kept. Shorter fragments carry too little signal for training.
     */
    private static final int MIN_SENTENCE_CHARS = 20;

    /**
     * When the candidate pool exceeds this multiple of {@code maxPerLang} the
     * reservoir is statistically well-mixed and we stop reading early.
     * Keeps large-language files (e.g. 500k-line English) from dominating I/O
     * when only a small sample is needed.
     */
    private static final int RESERVOIR_EARLY_STOP_FACTOR = 10;

    /**
     * Regex matching the literal two-character sequence backslash + 'n'
     * (NOT a real newline) that separates sentences within a document.
     */
    private static final String SENTENCE_SEPARATOR_REGEX = "\\\\n";

    private CorpusReader() {
        // Static utility class; no instances.
    }

    /**
     * Read all labeled sentences from the corpus directory.
     *
     * @param corpusDir root directory containing per-language subdirectories
     * @return list of labeled sentences
     * @throws IOException if reading fails
     */
    public static List<LabeledSentence> readAll(Path corpusDir) throws IOException {
        return readAll(corpusDir, 0);
    }

    /**
     * Read labeled sentences from the corpus directory, optionally sampling
     * at most {@code maxPerLang} sentences per language using reservoir sampling.
     * This avoids loading the entire corpus into memory when the on-disk corpus
     * is much larger than the training budget.
     *
     * @param corpusDir root directory containing per-language subdirectories
     * @param maxPerLang maximum sentences per language ({@code <= 0} = unlimited)
     * @return list of labeled sentences
     * @throws IOException if reading fails
     */
    public static List<LabeledSentence> readAll(Path corpusDir, int maxPerLang)
            throws IOException {
        List<LabeledSentence> sentences = new ArrayList<>();
        try (DirectoryStream<Path> langDirs = Files.newDirectoryStream(corpusDir,
                Files::isDirectory)) {
            for (Path langDir : langDirs) {
                // The directory name doubles as the ISO 639-3 language label.
                String language = langDir.getFileName().toString();
                if (maxPerLang > 0) {
                    readLanguageDirSampled(langDir, language, maxPerLang, sentences);
                } else {
                    readLanguageDir(langDir, language, sentences);
                }
            }
        }
        return sentences;
    }

    /**
     * Read all sentence files ({@code *.txt}) from a single language directory.
     */
    static void readLanguageDir(Path langDir, String language,
                                List<LabeledSentence> sentences) throws IOException {
        try (DirectoryStream<Path> files = Files.newDirectoryStream(langDir, "*.txt")) {
            for (Path file : files) {
                readSentenceFile(file, language, sentences);
            }
        }
    }

    /**
     * Read sentence files from a language directory using reservoir sampling
     * (Algorithm R) to cap at {@code maxPerLang} sentences. Reading stops as
     * soon as {@code RESERVOIR_EARLY_STOP_FACTOR * maxPerLang} candidates have
     * been seen, which is sufficient to produce a well-mixed reservoir while
     * avoiding a full scan of very large files.
     */
    static void readLanguageDirSampled(Path langDir, String language,
                                       int maxPerLang,
                                       List<LabeledSentence> allSentences)
            throws IOException {
        List<LabeledSentence> reservoir = new ArrayList<>(maxPerLang);
        // Deterministic per language so repeated runs sample the same sentences.
        Random rng = new Random(language.hashCode());
        int count = 0;
        // long arithmetic guards against int overflow for very large maxPerLang.
        final long earlyStop = (long) maxPerLang * RESERVOIR_EARLY_STOP_FACTOR;
        boolean done = false;
        try (DirectoryStream<Path> files = Files.newDirectoryStream(langDir, "*.txt")) {
            for (Path file : files) {
                if (done) {
                    break;
                }
                try (BufferedReader reader = Files.newBufferedReader(file,
                        StandardCharsets.UTF_8)) {
                    String line;
                    while (!done && (line = reader.readLine()) != null) {
                        for (String text : extractSentences(line)) {
                            count++;
                            if (reservoir.size() < maxPerLang) {
                                // Reservoir not yet full: keep every candidate.
                                reservoir.add(new LabeledSentence(language, text));
                            } else {
                                // Replace a random slot with probability maxPerLang/count.
                                int j = rng.nextInt(count);
                                if (j < maxPerLang) {
                                    reservoir.set(j, new LabeledSentence(language, text));
                                }
                            }
                            if (count >= earlyStop) {
                                done = true;
                                break;
                            }
                        }
                    }
                }
            }
        }
        allSentences.addAll(reservoir);
    }

    /**
     * Read a single MADLAD-format sentence file.
     * Each line: {@code lineNumber\tdocument text}
     * Documents contain sentences separated by the literal two-character
     * sequence {@code \n} (backslash + n). Lines without a tab or with
     * empty text are skipped. Sentence fragments shorter than
     * {@link #MIN_SENTENCE_CHARS} characters are also skipped.
     */
    static void readSentenceFile(Path file, String language,
                                 List<LabeledSentence> sentences) throws IOException {
        try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String text : extractSentences(line)) {
                    sentences.add(new LabeledSentence(language, text));
                }
            }
        }
    }

    /**
     * Parse one corpus line into trimmed sentence fragments.
     * A line without a tab yields no fragments; fragments shorter than
     * {@link #MIN_SENTENCE_CHARS} characters after trimming are dropped.
     * Shared by the sampled and unsampled readers so the two paths cannot
     * drift apart.
     */
    private static List<String> extractSentences(String line) {
        List<String> fragments = new ArrayList<>();
        int tab = line.indexOf('\t');
        if (tab < 0) {
            return fragments;
        }
        for (String part : line.substring(tab + 1).split(SENTENCE_SEPARATOR_REGEX)) {
            String text = part.trim();
            if (text.length() >= MIN_SENTENCE_CHARS) {
                fragments.add(text);
            }
        }
        return fragments;
    }
}