WideUnicodeDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.ml.chardetect;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Structural analysis for UTF-16 LE/BE and UTF-32 LE/BE based on
 * byte-position patterns. This is an internal component of
 * {@link MojibusterEncodingDetector}'s pipeline ��� not a standalone
 * {@code EncodingDetector}. It intentionally does not handle CJK UTF-16
 * (which falls through to the statistical model) and requires upstream
 * BOM stripping.
 *
 * <h3>UTF-32</h3>
 * <p>Every 4-byte group is decoded as a 32-bit integer in both BE and LE
 * order and checked for Unicode validity (0x000000���0x10FFFF, excluding
 * surrogates). The valid range is only 0.004% of the 32-bit space, so
 * non-UTF-32 data almost always produces out-of-range values immediately.
 * Inspired by ICU4J's {@code CharsetRecog_UTF_32}.</p>
 *
 * <h3>UTF-16</h3>
 * <p>Two phases, each targeting a different script family:
 * <ol>
 *   <li><strong>Null-column</strong> ��� Latin/ASCII BMP content: one byte
 *       column (even or odd positions at stride-2) has a high null rate.
 *       Safe: no legacy encoding produces alternating nulls.</li>
 *   <li><strong>Low-block-prefix</strong> ��� scripts whose UTF-16 high byte
 *       is below {@code 0x20} (Cyrillic 0x04, Arabic 0x06, Hebrew 0x05,
 *       Devanagari 0x09, Bengali 0x09, Thai 0x0E, etc.): the constrained
 *       column has all non-null values below {@code 0x20}, the other column
 *       is more diverse. Safe: Big5/Shift-JIS/GBK lead bytes are always
 *       &ge; 0x81.</li>
 * </ol>
 *
 * <p>CJK Unified (block prefix 0x4E���0x9F) and Hangul (0xAC���0xD7) are
 * intentionally not handled ��� their block prefixes overlap with
 * Big5/Shift-JIS/GBK lead bytes (0x81+) and with ISO-2022-JP JIS row
 * bytes, making structural discrimination unsafe. Those cases fall
 * through to the statistical model.</p>
 *
 * <p>In addition to positive detection, {@link Result} carries surrogate-
 * invalidity flags for each endianness. When no positive detection fires,
 * these flags allow the caller to suppress UTF-16 model predictions for
 * probes that are structurally impossible as UTF-16.</p>
 *
 * <p>All methods are stateless and safe to call from multiple threads.</p>
 */
final class WideUnicodeDetector {

    private WideUnicodeDetector() {}

    /**
     * Result of wide-Unicode structural analysis. Contains a positive detection
     * (if one was made) plus invalidity flags that can be used to suppress
     * model predictions even when no positive detection fires.
     */
    static final class Result {
        /** Positively detected charset, or {@code null} if none. */
        public final Charset charset;
        /** True if the probe contains invalid UTF-16-BE surrogate sequences. */
        public final boolean invalidUtf16Be;
        /** True if the probe contains invalid UTF-16-LE surrogate sequences. */
        public final boolean invalidUtf16Le;

        private Result(Charset charset, boolean invalidUtf16Be, boolean invalidUtf16Le) {
            this.charset = charset;
            this.invalidUtf16Be = invalidUtf16Be;
            this.invalidUtf16Le = invalidUtf16Le;
        }

        static final Result EMPTY = new Result(null, false, false);
    }

    /**
     * Attempt to detect UTF-32 or UTF-16 from structural byte patterns.
     * Also reports UTF-16 invalidity for use as a model suppression signal.
     *
     * @param bytes  BOM-stripped probe bytes
     * @param offset start of the region to analyse
     * @param length number of bytes to analyse
     * @return result with detected charset and invalidity flags
     */
    static Result analyze(byte[] bytes, int offset, int length) {
        if (bytes == null || length < 8) {
            return Result.EMPTY;
        }

        // UTF-32 must be tested before UTF-16: Latin UTF-32 also triggers the
        // UTF-16 null-column check (every other stride-2 pair is (0x00, 0x00)).
        Charset utf32 = tryUtf32(bytes, offset, length);
        if (utf32 != null) {
            return new Result(utf32, false, false);
        }
        return tryUtf16(bytes, offset, length);
    }

    /** Convenience overload for a full array. */
    static Result analyze(byte[] bytes) {
        return bytes == null ? Result.EMPTY : analyze(bytes, 0, bytes.length);
    }

    /**
     * Convenience method that returns only the detected charset.
     * Use {@link #analyze} when you also need the invalidity flags.
     */
    static Charset detect(byte[] bytes) {
        return analyze(bytes).charset;
    }

    // -----------------------------------------------------------------------
    //  UTF-32
    // -----------------------------------------------------------------------

    /**
     * Minimum number of valid codepoints required for detection. At 8 bytes
     * we get exactly 2 groups; the probability that 2 random 4-byte values
     * both fall in 0x000000���0x10FFFF (excluding surrogates) is ~1.6e-11,
     * so 2 is safe against false positives on real-world byte data.
     */
    private static final int UTF32_MIN_VALID = 2;

    /**
     * Checks if every 4-byte group decodes to a valid Unicode codepoint
     * (U+0000���U+10FFFF, excluding surrogates U+D800���U+DFFF).
     *
     * <p>Inspired by ICU4J's {@code CharsetRecog_UTF_32}: the valid codepoint
     * range is tiny (0x000000���0x10FFFF) relative to the 32-bit value space,
     * so random data almost always produces out-of-range values that
     * immediately disqualify it. No null-byte density threshold is needed,
     * which means non-BMP content (emoji, historic scripts) works perfectly.</p>
     */
    private static Charset tryUtf32(byte[] bytes, int offset, int length) {
        int sampleLen = (Math.min(length, 512) / 4) * 4;
        if (sampleLen < 8) {
            return null;
        }
        int groups = sampleLen / 4;
        int validBe = 0, invalidBe = 0;
        int validLe = 0, invalidLe = 0;

        for (int g = 0; g < groups; g++) {
            int base = offset + g * 4;
            int b0 = bytes[base] & 0xFF;
            int b1 = bytes[base + 1] & 0xFF;
            int b2 = bytes[base + 2] & 0xFF;
            int b3 = bytes[base + 3] & 0xFF;

            int cpBe = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
            if (cpBe >= 0 && cpBe <= 0x10FFFF && (cpBe < 0xD800 || cpBe > 0xDFFF)) {
                validBe++;
            } else {
                invalidBe++;
            }

            int cpLe = (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
            if (cpLe >= 0 && cpLe <= 0x10FFFF && (cpLe < 0xD800 || cpLe > 0xDFFF)) {
                validLe++;
            } else {
                invalidLe++;
            }
        }

        if (invalidBe == 0 && validBe >= UTF32_MIN_VALID) {
            return Charset.forName("UTF-32BE");
        }
        if (invalidLe == 0 && validLe >= UTF32_MIN_VALID) {
            return Charset.forName("UTF-32LE");
        }
        return null;
    }

    // -----------------------------------------------------------------------
    //  UTF-16
    // -----------------------------------------------------------------------

    /**
     * Null-column threshold: the null rate in one column must exceed
     * {@code 1 / NULL_DENOM} of pairs. Set to 4 (25%) to avoid false
     * positives on OLE2 and bzip2 which have 12���20% null at one column.
     * Real Latin UTF-16 has >90% null in the null column.
     */
    private static final int NULL_DENOM = 4;

    /**
     * Variety-ratio minimum: the diverse column must have at least this
     * many times more distinct values than the constrained column.
     */
    private static final double VARIETY_RATIO = 2.0;

    /**
     * The constrained column must have fewer than this fraction of pairs
     * as distinct values. Guards against uniformly random data.
     */
    private static final double CONSTRAINED_MAX_RATIO = 0.40;

    /**
     * Upper bound for the low-block-prefix phase. Scripts with UTF-16 high
     * bytes below this value are safely distinguishable from all legacy CJK
     * lead bytes (which start at 0x81).
     */
    private static final int LOW_PREFIX_MAX = 0x20;


    private static Result tryUtf16(byte[] bytes, int offset, int length) {
        int sampleLen = (Math.min(length, 512) / 2) * 2;
        if (sampleLen < 8) {
            return Result.EMPTY;
        }
        int pairs = sampleLen / 2;

        int nullsAtEven = 0;
        int nullsAtOdd = 0;
        int[] countsEven = new int[256];
        int[] countsOdd = new int[256];

        // Surrogate validation
        boolean awaitLowBe = false, awaitLowLe = false;
        boolean invalidBe = false, invalidLe = false;

        for (int p = 0; p < pairs; p++) {
            int even = bytes[offset + p * 2] & 0xFF;
            int odd = bytes[offset + p * 2 + 1] & 0xFF;

            if (even == 0) nullsAtEven++;
            if (odd == 0) nullsAtOdd++;
            countsEven[even]++;
            countsOdd[odd]++;

            // UTF-16BE surrogate validation (high byte = even)
            if (!invalidBe) {
                if (awaitLowBe) {
                    if (even >= 0xDC && even <= 0xDF) {
                        awaitLowBe = false;
                    } else {
                        invalidBe = true;
                    }
                } else {
                    if (even >= 0xD8 && even <= 0xDB) {
                        awaitLowBe = true;
                    } else if (even >= 0xDC && even <= 0xDF) {
                        invalidBe = true;
                    }
                }
            }

            // UTF-16LE surrogate validation (high byte = odd)
            if (!invalidLe) {
                if (awaitLowLe) {
                    if (odd >= 0xDC && odd <= 0xDF) {
                        awaitLowLe = false;
                    } else {
                        invalidLe = true;
                    }
                } else {
                    if (odd >= 0xD8 && odd <= 0xDB) {
                        awaitLowLe = true;
                    } else if (odd >= 0xDC && odd <= 0xDF) {
                        invalidLe = true;
                    }
                }
            }
        }
        if (awaitLowBe) invalidBe = true;
        if (awaitLowLe) invalidLe = true;

        int uniqueEven = countUnique(countsEven);
        int uniqueOdd = countUnique(countsOdd);

        // Phase 1: null-column (Latin/ASCII BMP content)
        boolean highEven = nullsAtEven * NULL_DENOM > pairs;
        boolean highOdd = nullsAtOdd * NULL_DENOM > pairs;
        if (highOdd && !highEven && !invalidLe) {
            return new Result(StandardCharsets.UTF_16LE, invalidBe, false);
        }
        if (highEven && !highOdd && !invalidBe) {
            return new Result(StandardCharsets.UTF_16BE, false, invalidLe);
        }

        // Phase 2: low-block-prefix (Cyrillic, Arabic, Hebrew, Indic, Thai, ���)
        // The constrained column has all non-null values < 0x20.
        // Safe: no legacy CJK lead byte is below 0x81.
        double constrainedMax = pairs * CONSTRAINED_MAX_RATIO;

        // Check LE: odd column is constrained (block-prefix), even is diverse
        if (!invalidLe
                && allNonNullBelow(countsOdd, LOW_PREFIX_MAX)
                && uniqueOdd <= constrainedMax
                && (double) uniqueEven / uniqueOdd >= VARIETY_RATIO
                && hasNonNull(countsOdd)) {
            return new Result(StandardCharsets.UTF_16LE, invalidBe, false);
        }
        // Check BE: even column is constrained, odd is diverse
        if (!invalidBe
                && allNonNullBelow(countsEven, LOW_PREFIX_MAX)
                && uniqueEven <= constrainedMax
                && (double) uniqueOdd / uniqueEven >= VARIETY_RATIO
                && hasNonNull(countsEven)) {
            return new Result(StandardCharsets.UTF_16BE, false, invalidLe);
        }

        return new Result(null, invalidBe, invalidLe);
    }

    // -----------------------------------------------------------------------
    //  Helpers
    // -----------------------------------------------------------------------

    private static int countUnique(int[] counts) {
        int n = 0;
        for (int c : counts) {
            if (c > 0) n++;
        }
        return n;
    }

    /** True if every non-null byte value in {@code counts} is < {@code max}. */
    private static boolean allNonNullBelow(int[] counts, int max) {
        for (int v = max; v < counts.length; v++) {
            if (counts[v] > 0) return false;
        }
        return true;
    }

    /** True if at least one non-null byte value has a positive count. */
    private static boolean hasNonNull(int[] counts) {
        for (int v = 1; v < counts.length; v++) {
            if (counts[v] > 0) return true;
        }
        return false;
    }

}