CjkEncodingRules.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.ml.chardetect;

import java.nio.charset.Charset;
import java.util.Set;

/**
 * Grammar walkers for CJK multi-byte encodings, used to refine and validate
 * the statistical model's output.
 *
 * <h3>Approach</h3>
 * <p>The byte-ngram model is reliable at distinguishing CJK from non-CJK content
 * but can confuse related CJK encodings ��� most critically Shift_JIS vs EUC-JP,
 * where both are Japanese and language-scoring arbitration cannot help.
 * These grammar walkers provide a hard structural check: if the model nominates
 * a CJK encoding, the walker either confirms it (good byte sequences), weakly
 * confirms it (too few multi-byte chars to be sure), or rejects it outright
 * (invalid byte sequences).</p>
 *
 * <h3>Invocation</h3>
 * <p>Walkers are only invoked when the statistical model has already placed a
 * CJK encoding in its output ��� never unconditionally on every probe.  This
 * avoids false positives on Latin/Arabic/Cyrillic content where some byte
 * values happen to fall in CJK lead-byte ranges.</p>
 *
 * <h3>Confidence scoring</h3>
 * <p>Returns a score on a 0���100 scale (matching ICU4J's convention) so the
 * calling code can decide uniformly:</p>
 * <ul>
 *   <li>0 ��� invalid byte sequences detected; reject this encoding</li>
 *   <li>10 ��� valid grammar but very few multi-byte chars; too little evidence
 *       to prefer the grammar score over the model's</li>
 *   <li>11���100 ��� enough valid multi-byte chars to trust the grammar score</li>
 * </ul>
 *
 * <h3>Common character tables</h3>
 * <p>ICU4J uses hardcoded frequency tables (sorted int[] of common two-byte
 * values) to boost confidence on short probes.  We omit these for now and
 * derive them from the MadLAD corpus if short-probe confidence proves
 * insufficient in practice.  See charset-detection-design.adoc.</p>
 *
 * <p>The grammar-walking approach and confidence formula are inspired by
 * ICU4J's {@code CharsetRecog_mbcs} class (unicode-org/icu).</p>
 */
public final class CjkEncodingRules {

    /**
     * Canonical Java charset names that this class handles.
     * Used by {@link #isCjk(Charset)} to gate walker invocation.
     */
    private static final Set<String> CJK_NAMES = Set.of(
            "Shift_JIS", "windows-31j",
            "EUC-JP",
            "EUC-KR",
            "Big5", "Big5-HKSCS",
            "GB18030", "GBK", "GB2312"
    );

    private CjkEncodingRules() {}

    /**
     * Returns {@code true} if {@code cs} is one of the CJK encodings handled
     * by this class.
     */
    public static boolean isCjk(Charset cs) {
        return CJK_NAMES.contains(cs.name());
    }

    /**
     * Run the grammar walker for {@code cs} against {@code probe} and return
     * a confidence score on the 0���100 scale.
     *
     * <p>Returns {@code -1} if {@code cs} is not a CJK encoding handled here
     * (callers should check {@link #isCjk(Charset)} first).</p>
     */
    public static int match(byte[] probe, Charset cs) {
        switch (cs.name()) {
            case "Shift_JIS":
            case "windows-31j":
                return matchSjis(probe);
            case "EUC-JP":
                return matchEucJp(probe);
            case "EUC-KR":
                return matchEucKr(probe);
            case "Big5":
            case "Big5-HKSCS":
                return matchBig5(probe);
            case "GB18030":
            case "GBK":
            case "GB2312":
                return matchGb18030(probe);
            default:
                return -1;
        }
    }

    // -----------------------------------------------------------------------
    // Shift_JIS / windows-31j (CP932)
    // -----------------------------------------------------------------------

    /**
     * Grammar walker for Shift_JIS / CP932.
     *
     * <p>Byte categories:</p>
     * <ul>
     *   <li>{@code 0x00���0x7F}: single-byte ASCII</li>
     *   <li>{@code 0xA1���0xDF}: single-byte half-width katakana</li>
     *   <li>{@code 0x81���0x9F}, {@code 0xE0���0xFC}: lead byte of a double-byte
     *       character; trail byte must be {@code 0x40���0x7F} or
     *       {@code 0x80���0xFF}</li>
     *   <li>All other values ({@code 0x80}, {@code 0xA0}, {@code 0xFD���0xFF}):
     *       invalid</li>
     * </ul>
     */
    private static int matchSjis(byte[] probe) {
        int doubleByte = 0;
        int bad = 0;
        int i = 0;
        while (i < probe.length) {
            int b = probe[i++] & 0xFF;

            if (b <= 0x7F || (b >= 0xA1 && b <= 0xDF)) {
                continue; // single byte
            }

            if ((b >= 0x81 && b <= 0x9F) || (b >= 0xE0 && b <= 0xFC)) {
                if (i >= probe.length) break;
                int trail = probe[i++] & 0xFF;
                if ((trail >= 0x40 && trail <= 0x7F) || (trail >= 0x80 && trail <= 0xFF)) {
                    doubleByte++;
                } else {
                    bad++;
                }
            } else {
                bad++; // 0x80, 0xA0, 0xFD���0xFF
            }

            if (bail(bad, doubleByte)) break;
        }
        return confidence(doubleByte, bad);
    }

    // -----------------------------------------------------------------------
    // EUC-JP
    // -----------------------------------------------------------------------

    /**
     * Grammar walker for EUC-JP.
     *
     * <p>Byte categories:</p>
     * <ul>
     *   <li>{@code 0x00���0x8D}: single byte</li>
     *   <li>{@code 0xA1���0xFE}: lead of a two-byte char; trail ��� {@code 0xA1}</li>
     *   <li>{@code 0x8E}: SS2 (half-width katakana); second byte ��� {@code 0xA1}</li>
     *   <li>{@code 0x8F}: SS3; second and third bytes both ��� {@code 0xA1}</li>
     *   <li>{@code 0x8E���0x90}, {@code 0xFF}: invalid</li>
     * </ul>
     */
    private static int matchEucJp(byte[] probe) {
        return matchEuc(probe);
    }

    // -----------------------------------------------------------------------
    // EUC-KR
    // -----------------------------------------------------------------------

    /**
     * Grammar walker for EUC-KR.
     * EUC-KR uses the same two-byte structure as EUC-JP but without the SS2/SS3
     * extensions; the shared EUC walker handles both correctly because SS2/SS3
     * sequences that validate grammatically are simply counted as double-byte
     * characters.
     */
    private static int matchEucKr(byte[] probe) {
        return matchEuc(probe);
    }

    /**
     * Shared EUC grammar walker (covers EUC-JP and EUC-KR).
     */
    private static int matchEuc(byte[] probe) {
        int doubleByte = 0;
        int bad = 0;
        int i = 0;
        while (i < probe.length) {
            int b = probe[i++] & 0xFF;

            if (b <= 0x8D) {
                continue; // single byte
            }

            if (b >= 0xA1 && b <= 0xFE) {
                // two-byte char
                if (i >= probe.length) break;
                int trail = probe[i++] & 0xFF;
                if (trail >= 0xA1) {
                    doubleByte++;
                } else {
                    bad++;
                }
            } else if (b == 0x8E) {
                // SS2: two bytes total
                if (i >= probe.length) break;
                int trail = probe[i++] & 0xFF;
                if (trail >= 0xA1) {
                    doubleByte++;
                } else {
                    bad++;
                }
            } else if (b == 0x8F) {
                // SS3: three bytes total
                if (i + 1 >= probe.length) break;
                int b2 = probe[i++] & 0xFF;
                int b3 = probe[i++] & 0xFF;
                if (b2 >= 0xA1 && b3 >= 0xA1) {
                    doubleByte++;
                } else {
                    bad++;
                }
            } else {
                bad++; // 0x8E���0x9F or 0xFF
            }

            if (bail(bad, doubleByte)) break;
        }
        return confidence(doubleByte, bad);
    }

    // -----------------------------------------------------------------------
    // Big5 / Big5-HKSCS
    // -----------------------------------------------------------------------

    /**
     * Grammar walker for Big5 / Big5-HKSCS.
     *
     * <p>Byte categories:</p>
     * <ul>
     *   <li>{@code 0x00���0x7F}, {@code 0xFF}: single byte</li>
     *   <li>{@code 0x81���0xFE}: lead byte; trail must not be
     *       {@code < 0x40}, {@code 0x7F}, or {@code 0xFF}</li>
     * </ul>
     */
    private static int matchBig5(byte[] probe) {
        int doubleByte = 0;
        int bad = 0;
        int i = 0;
        while (i < probe.length) {
            int b = probe[i++] & 0xFF;

            if (b <= 0x7F || b == 0xFF) {
                continue; // single byte
            }

            if (b >= 0x81 && b <= 0xFE) {
                if (i >= probe.length) break;
                int trail = probe[i++] & 0xFF;
                if (trail < 0x40 || trail == 0x7F || trail == 0xFF) {
                    bad++;
                } else {
                    doubleByte++;
                }
            } else {
                bad++; // 0x80
            }

            if (bail(bad, doubleByte)) break;
        }
        return confidence(doubleByte, bad);
    }

    // -----------------------------------------------------------------------
    // GB18030 / GBK / GB2312
    // -----------------------------------------------------------------------

    /**
     * Grammar walker for GB18030 (superset of GBK and GB2312).
     *
     * <p>Byte categories:</p>
     * <ul>
     *   <li>{@code 0x00���0x80}: single byte</li>
     *   <li>{@code 0x81���0xFE}: lead byte of either a two-byte sequence
     *       (trail {@code 0x40���0x7E} or {@code 0x80���0xFE}) or a four-byte
     *       sequence (second {@code 0x30���0x39}, third {@code 0x81���0xFE},
     *       fourth {@code 0x30���0x39})</li>
     * </ul>
     */
    private static int matchGb18030(byte[] probe) {
        int doubleByte = 0;
        int bad = 0;
        int i = 0;
        while (i < probe.length) {
            int b = probe[i++] & 0xFF;

            if (b <= 0x80) {
                continue; // single byte
            }

            if (b >= 0x81 && b <= 0xFE) {
                if (i >= probe.length) break;
                int b2 = probe[i] & 0xFF;

                if ((b2 >= 0x40 && b2 <= 0x7E) || (b2 >= 0x80 && b2 <= 0xFE)) {
                    // two-byte char
                    i++;
                    doubleByte++;
                } else if (b2 >= 0x30 && b2 <= 0x39) {
                    // four-byte char
                    i++;
                    if (i + 1 >= probe.length) break;
                    int b3 = probe[i++] & 0xFF;
                    int b4 = probe[i++] & 0xFF;
                    if (b3 >= 0x81 && b3 <= 0xFE && b4 >= 0x30 && b4 <= 0x39) {
                        doubleByte++;
                    } else {
                        bad++;
                    }
                } else {
                    bad++;
                }
            } else {
                bad++;
            }

            if (bail(bad, doubleByte)) break;
        }
        return confidence(doubleByte, bad);
    }

    // -----------------------------------------------------------------------
    // Shared helpers
    // -----------------------------------------------------------------------

    /**
     * Early-exit condition: bail when there are at least 2 bad characters
     * and bad characters make up more than 1/5 of double-byte characters.
     * Mirrors ICU4J's {@code CharsetRecog_mbcs.match()} bail condition.
     */
    private static boolean bail(int bad, int doubleByte) {
        return bad >= 2 && bad * 5 >= doubleByte;
    }

    /**
     * Minimum confidence returned when all high bytes form valid CJK sequences
     * (bad == 0) but the sample is short (doubleByte ��� 10). A score at or above
     * this value means "structurally clean" and is used by
     * {@link MojibusterEncodingDetector#refineCjkResults} to prefer the CJK
     * charset over single-byte alternatives on short probes.
     *
     * <p>Single-byte encodings have no structural constraints ��� they decode any
     * byte sequence without errors. A CJK multi-byte parser that consumes all
     * high bytes into valid 2-byte sequences IS providing structural evidence
     * even when the sample contains only 1���2 double-byte characters.</p>
     */
    public static final int CLEAN_SHORT_PROBE_CONFIDENCE = 40;

    /**
     * Confidence formula, inspired by ICU4J's {@code CharsetRecog_mbcs.match()}.
     *
     * <ul>
     *   <li>0 double-byte chars ��� 0 (no CJK evidence at all)</li>
     *   <li>bad == 0: all high bytes are valid CJK sequences ���
     *       {@code min(100, 30 + doubleByte * 10)}.
     *       Gives 40 for 1 double-byte char, 50 for 2, 100 for 7+.
     *       These scores are ��� {@link #CLEAN_SHORT_PROBE_CONFIDENCE}, signalling
     *       "structurally clean" to the caller even on very short probes.</li>
     *   <li>Too many bad chars relative to good ��� 0 (wrong encoding)</li>
     *   <li>bad > 0 within tolerance ��� {@code 30 + doubleByte - 20 * bad},
     *       capped at 100</li>
     * </ul>
     */
    static int confidence(int doubleByte, int bad) {
        if (doubleByte == 0) {
            return 0;
        }
        if (bad == 0) {
            // All high bytes consumed as valid sequences ��� structurally clean.
            // Scale with sample size but never return the old floor of 10.
            return Math.min(100, 30 + doubleByte * 10);
        }
        if (doubleByte < 20 * bad) {
            return 0;
        }
        return Math.min(100, 30 + doubleByte - 20 * bad);
    }
}