// TextQualityDiagTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;

import org.junit.jupiter.api.Test;

/**
 * Diagnostic test to explore text quality scores for encoding arbitration.
 * Not a regression test — just prints scores for analysis.
 */
public class TextQualityDiagTest {

    /** Upper bound, in UTF-16 code units, on how much of the text the preview line walks. */
    private static final int PREVIEW_LIMIT = 60;

    /**
     * Decodes two fixed byte samples with several candidate charsets and prints
     * character-class statistics for each decoding, then prints the same statistics
     * for three short plain sentences. Makes no assertions; all output goes to stdout.
     */
    @Test
    public void dumpScores() {
        // Arabic text in windows-1256
        Charset windows1256 = Charset.forName("windows-1256");
        String arabicText =
                "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
                "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
                "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
                "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
                "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
                "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646 " +
                "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
                "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
                "\u0628\u0633\u0644\u0627\u0645 \u0648\u0627\u0646\u0633\u062c\u0627\u0645.";
        byte[] arabicBytes = arabicText.getBytes(windows1256);

        // "hello world\r\n" encoded as US-ASCII; every character is 7-bit ASCII, so the
        // bytes are the same ones windows-1252 would produce.
        byte[] helloBytes = "hello world\r\n".getBytes(StandardCharsets.US_ASCII);

        System.out.println("=== Arabic bytes decoded with different charsets ===");
        for (String csName : new String[]{"windows-1256", "x-MacCyrillic", "UTF-8"}) {
            Charset cs = Charset.forName(csName);
            String decoded = CharSoupEncodingDetector.decode(arabicBytes, cs);
            printScores(csName, decoded);
        }

        System.out.println("\n=== 'hello world\\r\\n' decoded with different charsets ===");
        for (String csName : new String[]{"windows-1252", "IBM500"}) {
            Charset cs = Charset.forName(csName);
            String decoded = CharSoupEncodingDetector.decode(helloBytes, cs);
            printScores(csName, decoded);
        }

        // Also try some real-world short text
        System.out.println("\n=== Short real text ===");
        printScores("English sentence", "The quick brown fox jumps over the lazy dog.");
        printScores("French sentence", "Le renard brun rapide saute par-dessus le chien paresseux.");
        printScores("German sentence", "Der schnelle braune Fuchs springt \u00fcber den faulen Hund.");
    }

    /**
     * Classifies every code point of {@code text} into exactly one category and prints
     * the category counts, three derived ratios, and a short escaped preview of the text.
     *
     * @param label name printed in the first column of the statistics line
     * @param text  the decoded text to analyse
     */
    private static void printScores(String label, String text) {
        // Count code points rather than UTF-16 code units so the denominator uses the
        // same unit as the per-category counters; text.length() would count each
        // supplementary character twice and skew every ratio.
        int totalChars = text.codePointCount(0, text.length());
        int letterCount = 0;
        int replacementCount = 0;
        int controlCount = 0;
        int spaceCount = 0;
        int digitCount = 0;
        int punctCount = 0;
        int otherCount = 0;

        for (int i = 0; i < text.length(); ) {
            int cp = text.codePointAt(i);
            i += Character.charCount(cp);

            // The first matching branch wins. The control check precedes the whitespace
            // check, so tab, CR and LF are tallied as control characters, not as spaces.
            // NOTE(review): confirm that counting line breaks as junk is intended.
            if (cp == 0xFFFD) {
                replacementCount++;
            } else if (Character.isISOControl(cp)) {
                // isISOControl covers U+0000..U+001F and U+007F..U+009F, i.e. C0 and C1.
                controlCount++;
            } else if (Character.isLetter(cp)) {
                letterCount++;
            } else if (Character.isWhitespace(cp)) {
                spaceCount++;
            } else if (Character.isDigit(cp)) {
                digitCount++;
            } else if (isPunctuation(cp)) {
                punctCount++;
            } else {
                otherCount++;
            }
        }

        float letterRatio = ratio(letterCount, totalChars);
        float junkRatio = ratio(replacementCount + controlCount, totalChars);
        float nonLetterNonSpaceRatio = ratio(totalChars - letterCount - spaceCount, totalChars);

        System.out.printf(Locale.ROOT,
                "  %-20s len=%3d  letters=%.2f  junk(repl+ctrl)=%.2f  " +
                        "nonLetterNonSpace=%.2f  [L=%d S=%d P=%d D=%d R=%d C=%d O=%d]%n",
                label, totalChars, letterRatio, junkRatio, nonLetterNonSpaceRatio,
                letterCount, spaceCount, punctCount, digitCount,
                replacementCount, controlCount, otherCount);

        System.out.printf(Locale.ROOT, "  %-20s text: %s%n", "", buildPreview(text));
    }

    /**
     * Returns {@code count / total} as a float, or 0 when {@code total} is not positive.
     */
    private static float ratio(int count, int total) {
        return total > 0 ? (float) count / total : 0;
    }

    /**
     * Renders the start of {@code text} for display. Printable ASCII and letters are
     * copied as-is; every other code point is written as a {@code \}{@code uXXXX}-style
     * hex escape with at least four digits.
     */
    private static StringBuilder buildPreview(String text) {
        StringBuilder preview = new StringBuilder();
        // The bound is in UTF-16 code units; a code point that starts before the bound
        // is still emitted whole.
        int end = Math.min(text.length(), PREVIEW_LIMIT);
        for (int i = 0; i < end; ) {
            int cp = text.codePointAt(i);
            i += Character.charCount(cp);
            boolean printableAscii = cp >= 0x20 && cp < 0x7F;
            if (printableAscii || Character.isLetter(cp)) {
                preview.appendCodePoint(cp);
            } else {
                preview.append(String.format(Locale.ROOT, "\\u%04X", cp));
            }
        }
        return preview;
    }

    /**
     * Tells whether {@code cp} belongs to one of the seven punctuation general
     * categories reported by {@link Character#getType(int)}.
     */
    private static boolean isPunctuation(int cp) {
        switch (Character.getType(cp)) {
            case Character.CONNECTOR_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.START_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }
}