MarginDiagnostic.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.BufferedReader;
import java.io.FileReader;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;

import org.apache.tika.langdetect.charsoup.CharSoupLanguageDetector;
import org.apache.tika.language.detect.LanguageResult;

/**
 * Quick diagnostic: runs truncated snippets of a single language from a
 * tab-separated file through the detector and prints, per snippet, the top
 * two results with their raw scores, the top result's confidence, and the
 * detector's distribution entropy, to understand margin behaviour.
 *
 * <p>Command-line arguments (all optional, positional):
 * <ol>
 *   <li>path to the tab-separated input file; defaults to
 *       {@code ~/datasets/flores-200/flores200_dev.tsv}</li>
 *   <li>maximum snippet length in {@code char}s; defaults to {@code 20}</li>
 *   <li>language code to keep; defaults to {@code "eng"}</li>
 * </ol>
 *
 * <p>Each input line is split at the first tab. The part of the first column
 * before the first underscore is compared with the language filter; everything
 * after the first tab is used as the text. Lines without a tab are skipped.
 */
public class MarginDiagnostic {

    /**
     * Reads the input file, runs every matching (truncated) line through the
     * detector, prints one row per snippet, and finishes with a summary line.
     *
     * @param args optional positional arguments: input path, truncation
     *             length, language filter (see class documentation)
     * @throws IllegalArgumentException if the truncation length is negative
     * @throws Exception if the truncation length is not an integer, or if
     *                   loading the models or reading the file fails
     */
    public static void main(String[] args) throws Exception {
        String floresPath = args.length > 0
                ? args[0]
                : System.getProperty("user.home") + "/datasets/flores-200/flores200_dev.tsv";
        int truncLen = args.length > 1 ? Integer.parseInt(args[1]) : 20;
        if (truncLen < 0) {
            // Fail up front instead of with a StringIndexOutOfBoundsException mid-run.
            throw new IllegalArgumentException(
                    "truncation length must be >= 0 but was " + truncLen);
        }
        String filterLang = args.length > 2 ? args[2] : "eng";

        CharSoupLanguageDetector det = new CharSoupLanguageDetector();
        det.loadModels();

        System.out.printf(Locale.ROOT, "%-22s  %-6s %7s  %-6s %7s  %-6s %7s  %s%n",
                "SNIPPET", "TOP", "SCORE", "2ND", "SCORE", "CONF", "ENTROPY", "RESULT");
        System.out.println("-".repeat(100));

        int count = 0;
        int unkCount = 0;
        int wrongCount = 0;
        int okCount = 0;
        try (BufferedReader br = new BufferedReader(
                new FileReader(floresPath, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.split("\t", 2);
                if (parts.length < 2) {
                    continue;
                }
                // Keep only the portion of the tag before the first underscore.
                String lang = parts[0].split("_")[0];
                if (!lang.equals(filterLang)) {
                    continue;
                }

                String text = truncate(parts[1], truncLen);

                det.reset();
                det.addText(text);
                List<LanguageResult> results = det.detectAll();
                float entropy = det.getDistributionEntropy();

                LanguageResult best = results.isEmpty() ? null : results.get(0);
                LanguageResult runnerUp = results.size() > 1 ? results.get(1) : null;

                String top = best != null ? best.getLanguage() : "?";
                float topScore = best != null ? best.getRawScore() : 0;
                String topConf = best != null ? best.getConfidence().name() : "?";
                String second = runnerUp != null ? runnerUp.getLanguage() : "-";
                float secScore = runnerUp != null ? runnerUp.getRawScore() : 0;

                String mark;
                if (best == null || top.isEmpty()) {
                    // No result at all, or an empty language string, both mean
                    // the detector did not commit to a language.
                    mark = "UNK";
                    unkCount++;
                } else if (top.equals(filterLang)) {
                    mark = "OK";
                    okCount++;
                } else {
                    mark = "WRONG:" + top;
                    wrongCount++;
                }

                System.out.printf(Locale.ROOT,
                        "%-22s  %-6s %7.4f  %-6s %7.4f  %-6s %7.3f  %s%n",
                        text, top, topScore, second, secScore, topConf, entropy, mark);
                count++;
            }
        }
        System.out.println("-".repeat(100));
        // Avoid printing "NaN%" when no line matched the language filter.
        double okPercent = count == 0 ? 0.0 : 100.0 * okCount / count;
        System.out.printf(Locale.ROOT, "Total: %d  OK: %d (%.1f%%)  UNK: %d  WRONG: %d%n",
                count, okCount, okPercent, unkCount, wrongCount);
    }

    /**
     * Cuts {@code text} down to at most {@code maxChars} UTF-16 code units
     * without splitting a surrogate pair: if the cut would fall between a high
     * and a low surrogate, the high surrogate is dropped as well.
     *
     * @param text     the text to shorten
     * @param maxChars the maximum number of {@code char}s to keep; must be
     *                 non-negative
     * @return {@code text} itself if it is short enough, otherwise a prefix
     */
    private static String truncate(String text, int maxChars) {
        if (text.length() <= maxChars) {
            return text;
        }
        int end = maxChars;
        if (end > 0
                && Character.isHighSurrogate(text.charAt(end - 1))
                && Character.isLowSurrogate(text.charAt(end))) {
            end--;
        }
        return text.substring(0, end);
    }
}