CharsetConfusables.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.ml.chardetect;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Charset relationships used for lenient (lenient) evaluation of charset detectors.
 * <p>
 * Two kinds of relationship are modelled:
 *
 * <h3>Symmetric confusable groups</h3>
 * <p>
 * A pair (or larger set) of charsets is <em>symmetrically confusable</em> when
 * a significant fraction of real-world byte sequences are identical under all
 * members, making it unreasonable to penalise a detector for choosing either
 * direction. Examples:
 * <ul>
 *   <li>ISO-8859-1 / ISO-8859-15 / windows-1252 ��� differ only in 8 code points
 *       and the C1 control range (0x80���0x9F)</li>
 *   <li>UTF-16-LE / UTF-16-BE without a BOM</li>
 *   <li>IBM424-ltr / IBM424-rtl ��� same code page, differ only in text-reversal
 *       convention</li>
 *   <li>KOI8-R / KOI8-U ��� share all Cyrillic letters, differ only in four
 *       Ukrainian characters</li>
 * </ul>
 *
 * <h3>Superset / subset chains</h3>
 * <p>
 * Some charsets stand in a strict superset/subset relationship:
 * <ul>
 *   <li>GB2312 ��� GBK ��� GB18030</li>
 *   <li>Big5 ��� Big5-HKSCS</li>
 * </ul>
 * <p>
 * For these, the relationship is <em>directional</em>:
 * <ul>
 *   <li>Predicting the <strong>superset</strong> when the true charset is the
 *       <strong>subset</strong> is always safe ��� the superset decoder handles
 *       all subset byte sequences correctly (e.g. predicting GB18030 for a
 *       GB2312 file).</li>
 *   <li>Predicting the <strong>subset</strong> when the true charset is the
 *       <strong>superset</strong> is <em>not</em> safe ��� the subset decoder
 *       may corrupt characters that exist only in the superset (e.g. predicting
 *       GB2312 for a file that uses GB18030-only characters).</li>
 * </ul>
 *
 * <h3>Summary</h3>
 * <p>
 * Use {@link #isLenientMatch(String, String)} for evaluation: it returns
 * {@code true} when the prediction is acceptable given the above rules.
 * Use {@link #GROUPS} + {@link #buildGroupIndices} +
 * {@link #collapseGroups} for inference-time probability pooling (direction
 * does not matter when merging probability mass).
 * <p>
 * This class is kept in sync with the charset configuration in
 * {@code BuildCharsetTrainingData} (see {@code CHARSET_JAVA} and the
 * language-to-charset mappings).
 */
public final class CharsetConfusables {

    /**
     * All confusable groups (both symmetric and superset chains), used for
     * probability collapsing during inference via {@link #collapseGroups}.
     * Direction does not matter here ��� we just want to pool probability mass
     * among related charsets and award it to the highest-scoring member.
     */
    public static final List<Set<String>> GROUPS;

    /**
     * Symmetric-only confusable groups. Both directions are equally acceptable
     * as lenient matches (neither charset is "safer" than the other).
     */
    public static final List<Set<String>> SYMMETRIC_GROUPS;

    /**
     * Maps each ISO-8859-X charset to its Windows-12XX equivalent.
     *
     * <p>When bytes in the C1 range ({@code 0x80���0x9F}) are present in a
     * probe, the ISO encoding is ruled out ��� those byte values are C1 control
     * characters in every ISO-8859-X standard and never appear in real text.
     * The corresponding Windows-12XX encoding uses that range for printable
     * characters (smart quotes, Euro sign, em-dash, ���) and is the correct
     * attribution.</p>
     *
     * <p>Note that ISO-8859-15 maps to windows-1252 because ISO-8859-15 also
     * leaves 0x80���0x9F as C1 controls; its differences from ISO-8859-1 are
     * all in the 0xA0���0xFF region.</p>
     */
    public static final Map<String, String> ISO_TO_WINDOWS;

    static {
        Map<String, String> m = new HashMap<>();
        m.put("ISO-8859-1",  "windows-1252");
        m.put("ISO-8859-15", "windows-1252");
        m.put("ISO-8859-2",  "windows-1250");
        m.put("ISO-8859-4",  "windows-1257");
        m.put("ISO-8859-5",  "windows-1251");
        m.put("ISO-8859-6",  "windows-1256");
        m.put("ISO-8859-7",  "windows-1253");
        m.put("ISO-8859-8",  "windows-1255");
        m.put("ISO-8859-9",  "windows-1254");
        m.put("ISO-8859-13", "windows-1257");
        ISO_TO_WINDOWS = Collections.unmodifiableMap(m);
    }

    /**
     * Directional superset relationships: key is a charset, value is its
     * immediate superset. A chain is expressed as successive entries:
     * <pre>
     *   GB2312 ��� GBK ��� GB18030
     *   Big5   ��� Big5-HKSCS
     * </pre>
     * Predicting any ancestor of {@code actual} in this map is a lenient match.
     * Predicting any descendant is <em>not</em> a lenient match.
     */
    public static final Map<String, String> SUPERSET_OF;

    private static final Map<String, Set<String>> SYMMETRIC_PEER_MAP;

    static {
        // ----------------------------------------------------------------
        // Symmetric groups
        // ----------------------------------------------------------------
        List<Set<String>> sym = Arrays.asList(
                // Western European: differ only in 8 code points + C1 range
                group("ISO-8859-1", "ISO-8859-15", "windows-1252"),

                // Central / Eastern European
                group("ISO-8859-2", "windows-1250"),

                // Cyrillic: differ only in C1 range
                group("ISO-8859-5", "windows-1251"),

                // KOI8 family: share all 32 basic Russian Cyrillic letters at
                // identical byte positions.  Pure Russian text is byte-identical
                // in both encodings and cannot be distinguished by any byte-level
                // classifier.
                //
                // IMPORTANT ��� this is a LOSSY confusable, unlike most others:
                // KOI8-U reassigns some byte positions that KOI8-R uses for
                // Russian characters.  Specifically, byte 0xA4 is �� (capital)
                // in KOI8-R but �� (Ukrainian small e) in KOI8-U.  Predicting
                // KOI8-U for a KOI8-R file will misrender �� as �� and two other
                // characters.  Predicting KOI8-R for a KOI8-U file will
                // misrender the Ukrainian letters ��, ��, ��, ��.
                //
                // Ukrainian text (containing ��, ��, ��, ��) IS distinguishable:
                // those characters encode to different bytes in each variant.
                // The model can correctly identify KOI8-U from Ukrainian content.
                // The ambiguity is specific to pure-Russian content, where neither
                // encoding can be preferred on byte evidence alone.
                //
                // This group is declared for soft-match evaluation (predicting
                // either variant for a pure-Russian file is pragmatically
                // acceptable), but callers that care about correct rendering of
                // �� should treat KOI8-R/KOI8-U as distinct.
                group("KOI8-R", "KOI8-U"),

                // Arabic
                group("ISO-8859-6", "windows-1256"),

                // Greek
                group("ISO-8859-7", "windows-1253"),

                // Hebrew
                group("ISO-8859-8", "windows-1255"),

                // Turkish / Latin-5
                group("ISO-8859-9", "windows-1254"),

                // Baltic
                group("ISO-8859-4", "ISO-8859-13", "windows-1257"),

                // UTF-16 without BOM: byte-for-byte indistinguishable
                group("UTF-16-LE", "UTF-16-BE"),

                // Hebrew EBCDIC: same IBM424 code page, differ only in text-reversal
                // convention.  Java's Charset registry has "IBM424" but not the
                // directional variants, so our detector returns IBM424 for both.
                group("IBM424-ltr", "IBM424-rtl", "IBM424"),

                // Arabic EBCDIC: same IBM420 code page, differ in reversal convention.
                // Java's Charset registry has "IBM420" but not the directional variants.
                group("IBM420-ltr", "IBM420-rtl", "IBM420"),

                // windows-874 is the training label; Java's canonical name for the
                // same charset is "x-windows-874".  Both refer to the Thai Windows
                // code page (superset of TIS-620).
                group("windows-874", "x-windows-874"),

                // IBM500 vs IBM1047: share 247 of 256 byte mappings. The 9
                // differing positions involve low-byte characters (!, [, ], |)
                // and the LF/NEL line terminator swap ��� all rare in prose text
                // and mostly below 0x80, invisible to stride-1 features.
                // Prose-level training data cannot reliably distinguish them;
                // both decode each other's content correctly for typical text.
                group("IBM500", "IBM1047"),

                // IBM437 vs IBM850: IBM437's distinguishing bytes are box-drawing
                // and graphical characters (0xB0���0xDF range) that never appear
                // in prose text. IBM850 is the preferred Western European DOS
                // code page; IBM437 is not trained as a separate class.
                // This group exists so that evaluation tools and
                // post-processing handle legacy files labelled "IBM437"
                // gracefully by treating IBM850 as an equally acceptable result.
                group("IBM437", "IBM850")
        );
        SYMMETRIC_GROUPS = Collections.unmodifiableList(sym);

        // ----------------------------------------------------------------
        // Directional superset chains
        // ----------------------------------------------------------------
        Map<String, String> supersetOf = new HashMap<>();
        // Simplified Chinese: GB2312 ��� GBK ��� GB18030
        supersetOf.put("GB2312", "GBK");
        supersetOf.put("GBK", "GB18030");
        // Traditional Chinese: Big5 ��� Big5-HKSCS
        supersetOf.put("Big5", "Big5-HKSCS");
        // Japanese Windows: Shift_JIS ��� windows-31j (CP932).
        // Java's Shift_JIS resolves to CP932 internally; training uses cp932
        // bytes. Returning Shift_JIS for a windows-31j file is safe because
        // every Shift_JIS decoder on the JVM actually uses the CP932 mapping.
        supersetOf.put("Shift_JIS", "windows-31j");
        // Korean Windows: EUC-KR ��� x-windows-949 (MS949).
        // MS949 extends EUC-KR with additional Hangul syllables. Returning
        // EUC-KR for an MS949 file is safe for all standard Korean content.
        supersetOf.put("EUC-KR", "x-windows-949");
        // US-ASCII ��� UTF-8: every UTF-8 decoder handles ASCII bytes correctly,
        // so predicting UTF-8 for a US-ASCII file is always safe.
        supersetOf.put("US-ASCII", "UTF-8");
        SUPERSET_OF = Collections.unmodifiableMap(supersetOf);

        // ----------------------------------------------------------------
        // GROUPS = symmetric + one flat group per superset chain
        // (used only for probability collapsing ��� direction irrelevant here)
        // ----------------------------------------------------------------
        List<Set<String>> all = new ArrayList<>(sym);
        all.add(group("GB2312", "GBK", "GB18030")); // flattened chain
        all.add(group("Big5", "Big5-HKSCS"));
        all.add(group("Shift_JIS", "windows-31j"));
        all.add(group("EUC-KR", "x-windows-949"));
        // IBM500/IBM1047 and IBM437/IBM850 are already in sym (symmetric groups),
        // so they are included transitively ��� no extra entries needed here.
        GROUPS = Collections.unmodifiableList(all);

        // ----------------------------------------------------------------
        // Symmetric peer map for fast lookup
        // ----------------------------------------------------------------
        Map<String, Set<String>> peerMap = new HashMap<>();
        for (Set<String> g : SYMMETRIC_GROUPS) {
            for (String cs : g) {
                Set<String> peers = new HashSet<>(g);
                peers.remove(cs);
                peerMap.put(cs, Collections.unmodifiableSet(peers));
            }
        }
        SYMMETRIC_PEER_MAP = Collections.unmodifiableMap(peerMap);
    }

    private CharsetConfusables() {
    }

    /**
     * Return {@code true} if predicting {@code predicted} when the true charset
     * is {@code actual} is an acceptable ("lenient") result.
     * <p>
     * <strong>Symmetric groups:</strong> both directions are acceptable (e.g.
     * predicting ISO-8859-1 for windows-1252 or vice versa).<br>
     * <strong>Superset chains:</strong> only predicting a <em>superset</em> of
     * {@code actual} is acceptable (e.g. predicting GB18030 for GB2312), because
     * the superset decoder can always handle the subset's byte sequences.
     * Predicting a <em>subset</em> is not acceptable because the subset decoder
     * may corrupt extended characters.
     */
    public static boolean isLenientMatch(String actual, String predicted) {
        if (actual.equals(predicted)) {
            return true;
        }
        // Symmetric group?
        Set<String> symPeers = SYMMETRIC_PEER_MAP.get(actual);
        if (symPeers != null && symPeers.contains(predicted)) {
            return true;
        }
        // Superset chain: walk ancestors of actual to see if predicted is one
        String ancestor = SUPERSET_OF.get(actual);
        while (ancestor != null) {
            if (ancestor.equals(predicted)) {
                return true;
            }
            ancestor = SUPERSET_OF.get(ancestor);
        }
        return false;
    }

    /**
     * Return the set of charsets that are symmetrically confusable with
     * {@code charset}, not including {@code charset} itself.
     * Returns an empty set for charsets not in any symmetric group.
     */
    public static Set<String> symmetricPeersOf(String charset) {
        return SYMMETRIC_PEER_MAP.getOrDefault(charset, Collections.emptySet());
    }

    /**
     * Build a per-class group-index array from a label array (e.g. from a
     * {@link org.apache.tika.ml.LinearModel}), using {@link #GROUPS} (both
     * symmetric and superset chains) for probability collapsing in inference.
     *
     * @param labels class labels in model order
     * @return {@code int[numClasses][]} group-index map
     */
    public static int[][] buildGroupIndices(String[] labels) {
        Map<String, Integer> labelIdx = new HashMap<>();
        for (int i = 0; i < labels.length; i++) {
            labelIdx.put(labels[i], i);
        }

        int[][] result = new int[labels.length][];
        boolean[] assigned = new boolean[labels.length];

        for (Set<String> group : GROUPS) {
            List<Integer> members = new ArrayList<>();
            for (String cs : group) {
                Integer idx = labelIdx.get(cs);
                if (idx != null) {
                    members.add(idx);
                }
            }
            if (members.size() >= 2) {
                int[] arr = members.stream().mapToInt(Integer::intValue).toArray();
                for (int idx : arr) {
                    result[idx] = arr;
                    assigned[idx] = true;
                }
            }
        }
        for (int i = 0; i < result.length; i++) {
            if (!assigned[i]) {
                result[i] = new int[]{i};
            }
        }
        return result;
    }

    /**
     * Collapse confusable group probabilities: within each group, sum all
     * members' probabilities and assign the total to the highest-scoring
     * member; the other members get 0.
     *
     * @param probs        raw softmax output (not modified)
     * @param groupIndices result of {@link #buildGroupIndices(String[])}
     * @return new probability array with group probabilities collapsed
     */
    public static float[] collapseGroups(float[] probs, int[][] groupIndices) {
        float[] collapsed = Arrays.copyOf(probs, probs.length);
        boolean[] visited = new boolean[probs.length];

        for (int i = 0; i < probs.length; i++) {
            if (visited[i] || groupIndices[i].length <= 1) {
                continue;
            }
            int[] group = groupIndices[i];
            float sum = 0f;
            int best = group[0];
            for (int idx : group) {
                sum += probs[idx];
                if (probs[idx] > probs[best]) {
                    best = idx;
                }
                visited[idx] = true;
            }
            for (int idx : group) {
                collapsed[idx] = (idx == best) ? sum : 0f;
            }
        }
        return collapsed;
    }

    private static Set<String> group(String... members) {
        return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(members)));
    }
}