// CorpusDiversityAnalyzer.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.charsoup.tools;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor;
import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;
import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor;
/**
* Measures corpus diversity for each language in a flat-file corpus directory.
*
* <p>Three complementary metrics are computed entirely from the training
 * sentences &mdash; no external evaluation set required:
*
* <ol>
* <li><b>Bigram bucket fill %</b>: fraction of the bigram hash table that
* has at least one count after seeing all training sentences. A corpus
* of near-identical stubs reuses the same n-grams over and over and
* fills a small fraction of buckets regardless of corpus size.</li>
* <li><b>Normalised bigram entropy</b>: Shannon entropy of the bigram count
* distribution divided by log2(filled buckets). A perfectly uniform
* distribution scores 1.0; a corpus dominated by a handful of repeated
* patterns scores near 0.</li>
* <li><b>Unique sentence %</b>: fraction of distinct lines. Templated
* corpora have many near- or exact-duplicate sentences.</li>
* </ol>
*
* <p>Languages whose fill% and entropy fall far below the median are flagged
* as potentially low-quality.
*
* <h3>Usage</h3>
* <pre>
* java CorpusDiversityAnalyzer \
* --corpus /path/to/pool_filtered \
* [--max-per-lang 100000] \
* [--flag-below 0.5]
* </pre>
*/
public class CorpusDiversityAnalyzer {

    private static final int DEFAULT_MAX_PER_LANG = 100_000;
    private static final double DEFAULT_FLAG_BELOW = 0.5;

    public static void main(String[] args) throws Exception {
        Path corpus = null;
        int maxPerLang = DEFAULT_MAX_PER_LANG;
        double flagBelow = DEFAULT_FLAG_BELOW;
        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--corpus":
                    corpus = Paths.get(requireValue(args, ++i));
                    break;
                case "--max-per-lang":
                    maxPerLang = Integer.parseInt(requireValue(args, ++i));
                    break;
                case "--flag-below":
                    flagBelow = Double.parseDouble(requireValue(args, ++i));
                    break;
                default:
                    System.err.println("Unknown option: " + args[i]);
                    System.exit(1);
            }
        }
        if (corpus == null) {
            System.err.println("Usage: CorpusDiversityAnalyzer --corpus <dir> "
                    + "[--max-per-lang N] [--flag-below 0.5]");
            System.exit(1);
        }

        List<Path> langPaths = listRegularFiles(corpus);
        System.out.printf(Locale.US, "Analysing %d languages in %s "
                        + "(max %,d sentences each)%n%n",
                langPaths.size(), corpus, maxPerLang);
        System.out.printf(Locale.US,
                "%-14s %10s %10s %8s %10s %10s %s%n",
                "Language", "Sentences", "Unique%",
                "Fill%", "Entropy", "NormEntropy", "Flag");
        System.out.println("-".repeat(80));

        List<LangStats> stats = new ArrayList<>();
        for (Path p : langPaths) {
            stats.add(analyze(p, maxPerLang));
        }
        // Sort by normalised entropy ascending (worst first)
        stats.sort(Comparator.comparingDouble(s -> s.normEntropy));
        for (LangStats s : stats) {
            String flag = (s.fillPct < flagBelow * 100
                    || s.normEntropy < flagBelow) ? " <<< LOW DIVERSITY" : "";
            System.out.printf(Locale.US,
                    "%-14s %,10d %9.1f%% %7.1f%% %9.3f %11.3f %s%n",
                    s.lang, s.sentences, s.uniquePct,
                    s.fillPct, s.entropy, s.normEntropy, flag);
        }
    }

    /**
     * Returns {@code args[i]}, exiting with a usage error when the preceding
     * option has no value. Guards against the
     * {@code ArrayIndexOutOfBoundsException} the original {@code args[++i]}
     * pattern threw for a trailing bare option such as {@code --corpus}.
     */
    private static String requireValue(String[] args, int i) {
        if (i >= args.length) {
            System.err.println("Missing value for option: " + args[i - 1]);
            System.exit(1);
        }
        return args[i];
    }

    // ---- Analysis ----

    /**
     * Streams up to {@code maxPerLang} non-empty sentences of one language
     * file and computes the diversity metrics described in the class javadoc.
     *
     * @param langFile   flat text file, one sentence per line, named after its language
     * @param maxPerLang stop after this many sentences; {@code <= 0} means unlimited
     * @return per-language diversity statistics
     * @throws IOException if the file cannot be read
     */
    static LangStats analyze(Path langFile, int maxPerLang) throws IOException {
        String lang = langFile.getFileName().toString();
        // Probe the first 200 sentences to decide CJK vs non-CJK so that the
        // bucket count matches what the production model would use.
        boolean cjk = probeCjk(langFile, 200);
        int numBuckets = cjk
                ? GenerativeLanguageModel.CJK_BIGRAM_BUCKETS
                : GenerativeLanguageModel.NONCJK_BIGRAM_BUCKETS;
        long[] bigramCounts = new long[numBuckets];
        Set<String> seen = new HashSet<>();   // exact-duplicate detection on raw text
        long sentences = 0;
        long uniqueSentences = 0;
        try (BufferedReader reader = Files.newBufferedReader(
                langFile, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                String text = line.trim();
                if (text.isEmpty()) continue;
                String pp = CharSoupFeatureExtractor.preprocess(text);
                if (pp.isEmpty()) continue;
                if (seen.add(text)) {
                    uniqueSentences++;
                }
                sentences++;
                // Math.floorMod is identical to % for non-negative hashes and,
                // unlike %, can never yield a negative array index should an
                // extractor hash come back negative.
                if (cjk) {
                    GenerativeLanguageModel.extractCjkNgrams(pp,
                            h -> { /* skip unigrams */ },
                            h -> bigramCounts[Math.floorMod(h, numBuckets)]++);
                } else {
                    GenerativeLanguageModel.extractNonCjkNgrams(pp,
                            h -> { /* skip unigrams */ },
                            h -> bigramCounts[Math.floorMod(h, numBuckets)]++,
                            h -> { /* skip trigrams */ });
                }
                if (maxPerLang > 0 && sentences >= maxPerLang) {
                    break;
                }
            }
        }

        // ---- Metrics ----
        long filledBuckets = 0;
        long total = 0;
        for (long c : bigramCounts) {
            if (c > 0) {
                filledBuckets++;
                total += c;
            }
        }
        double fillPct = 100.0 * filledBuckets / numBuckets;

        // Shannon entropy (bits) of the bigram count distribution.
        final double log2 = Math.log(2);
        double entropy = 0.0;
        if (total > 0) {
            for (long c : bigramCounts) {
                if (c > 0) {
                    double p = (double) c / total;
                    entropy -= p * (Math.log(p) / log2);
                }
            }
        }
        // Normalised entropy: H / log2(filledBuckets), in [0, 1].
        double normEntropy = filledBuckets > 1
                ? entropy / (Math.log(filledBuckets) / log2) : 0.0;
        double uniquePct = sentences > 0
                ? 100.0 * uniqueSentences / sentences : 0.0;
        return new LangStats(lang, sentences, uniquePct,
                fillPct, entropy, normEntropy);
    }

    // ---- Helpers ----

    /**
     * Heuristically decides whether a file is predominantly CJK by scanning
     * the letters of its first {@code maxLines} lines: returns true when at
     * least 60% of the letters are CJK ideographs or kana.
     */
    private static boolean probeCjk(Path file, int maxLines) throws IOException {
        long cjk = 0;
        long total = 0;
        int lines = 0;
        try (BufferedReader reader = Files.newBufferedReader(
                file, StandardCharsets.UTF_8)) {
            String line;
            // Check the line budget first so we never read a wasted extra line.
            while (lines < maxLines && (line = reader.readLine()) != null) {
                int i = 0;
                while (i < line.length()) {
                    int cp = line.codePointAt(i);
                    i += Character.charCount(cp);
                    if (Character.isLetter(cp)) {
                        total++;
                        // toLowerCase mirrors the extractor's normalisation
                        // before the CJK/kana classification.
                        if (ScriptAwareFeatureExtractor.isCjkOrKana(
                                Character.toLowerCase(cp))) {
                            cjk++;
                        }
                    }
                }
                lines++;
            }
        }
        return total > 0 && (double) cjk / total >= 0.60;
    }

    /** Lists the regular files directly under {@code dir}, sorted by path. */
    private static List<Path> listRegularFiles(Path dir) throws IOException {
        List<Path> files = new ArrayList<>();
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(
                dir, Files::isRegularFile)) {
            for (Path p : stream) {
                files.add(p);
            }
        }
        Collections.sort(files);
        return files;
    }

    // ---- Result record ----

    /** Immutable per-language diversity metrics, one row of the report. */
    static class LangStats {
        final String lang;          // language code (the corpus file name)
        final long sentences;       // non-empty sentences consumed
        final double uniquePct;     // percentage of distinct sentences
        final double fillPct;       // percentage of bigram buckets with a count
        final double entropy;       // Shannon entropy of bigram counts, in bits
        final double normEntropy;   // entropy / log2(filledBuckets), in [0, 1]

        LangStats(String lang, long sentences, double uniquePct,
                  double fillPct, double entropy, double normEntropy) {
            this.lang = lang;
            this.sentences = sentences;
            this.uniquePct = uniquePct;
            this.fillPct = fillPct;
            this.entropy = entropy;
            this.normEntropy = normEntropy;
        }
    }
}