StructuralEncodingRules.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.ml.chardetect;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
* Fast, rule-based encoding checks that run before the statistical model.
*
* <h3>Pipeline</h3>
* <ol>
* <li>{@link #checkAscii}: no bytes >= 0x80 ��� UTF-8 (ASCII is a subset)</li>
* <li>{@link #detectIso2022}: ISO-2022 escape sequences present ��� ISO-2022-JP,
* ISO-2022-KR, or ISO-2022-CN depending on the designation sequence</li>
* <li>{@link #checkUtf8}: validate UTF-8 multi-byte grammar; returns a
* {@link Utf8Result} indicating whether the bytes are definitively UTF-8,
* definitively not UTF-8, or ambiguous (pass to model).</li>
* </ol>
*
* <p>UTF-16/32 detection is handled upstream by
* {@link org.apache.tika.utils.ByteEncodingHint} and is not repeated here.</p>
*
* <p>IBM424 (EBCDIC Hebrew) is detected via {@link #checkIbm424}: the Hebrew
* letters in this code page occupy bytes 0x41���0x6A, which fall entirely below
* the 0x80 threshold used by the statistical model's feature extractor. The
* EBCDIC space (0x40) vs ASCII space (0x20) frequency ratio provides a cheap
* first-pass EBCDIC gate before the Hebrew letter frequencies are checked.</p>
*
* <p>All methods are stateless and safe to call from multiple threads.</p>
*/
public final class StructuralEncodingRules {
private StructuralEncodingRules() {}
/** ISO-2022 ESC byte. */
private static final int ESC = 0x1B;
/**
* Ratio of valid high bytes required to call UTF-8 "definitive".
* If the sample has more than {@value #MIN_HIGH_BYTE_RATIO_FOR_UTF8} of its
* bytes in multi-byte sequences and they all parse correctly, we trust UTF-8.
*/
private static final double MIN_HIGH_BYTE_RATIO_FOR_UTF8 = 0.01; // at least 1%
// -----------------------------------------------------------------------
// Public API
// -----------------------------------------------------------------------
/**
* Returns {@code true} if {@code bytes} contains no bytes with value
* >= 0x80 (i.e. pure 7-bit ASCII, which is a strict subset of UTF-8).
*/
public static boolean checkAscii(byte[] bytes) {
return checkAscii(bytes, 0, bytes.length);
}
public static boolean checkAscii(byte[] bytes, int offset, int length) {
int end = offset + length;
for (int i = offset; i < end; i++) {
if ((bytes[i] & 0xFF) >= 0x80) {
return false;
}
}
return true;
}
/**
* Detects ISO-2022-JP, ISO-2022-KR, and ISO-2022-CN by scanning for their
* characteristic ESC designation sequences.
*
* <p>All three share the {@code ESC $} ({@code 0x1B 0x24}) prefix, so we
* must read further to distinguish them:</p>
* <pre>
* ISO-2022-JP: ESC $ B (JIS X 0208-1983)
* ESC $ @ (JIS X 0208-1978)
* ESC $ ( D (JIS X 0212 supplementary)
* ISO-2022-KR: ESC $ ) C
* ISO-2022-CN: ESC $ ) A (GB2312)
* ESC $ ) G (CNS 11643 plane 1)
* ESC $ * H (CNS 11643 plane 2)
* </pre>
*
* <p>If {@code ESC $} is found but no recognised third byte follows (or the
* buffer is too short), ISO-2022-JP is returned as the most common default.</p>
*
* @return the detected ISO-2022 charset, or {@code null} if no ISO-2022
* escape sequence is found
*/
public static Charset detectIso2022(byte[] bytes) {
return detectIso2022(bytes, 0, bytes.length);
}
public static Charset detectIso2022(byte[] bytes, int offset, int length) {
int end = offset + length;
for (int i = offset; i < end - 1; i++) {
if ((bytes[i] & 0xFF) != ESC) {
continue;
}
int b1 = bytes[i + 1] & 0xFF;
if (b1 == 0x24) { // ESC $
// Need at least one more byte to classify
if (i + 2 >= end) {
return Charset.forName("ISO-2022-JP"); // ESC $ alone ��� JP default
}
int b2 = bytes[i + 2] & 0xFF;
switch (b2) {
case 0x42: // ESC $ B ��� JIS X 0208-1983
case 0x40: // ESC $ @ ��� JIS X 0208-1978
return Charset.forName("ISO-2022-JP");
case 0x28: // ESC $ ( ��� JIS X 0212 (JP supplementary)
return Charset.forName("ISO-2022-JP");
case 0x29: // ESC $ ) ��� KR or CN depending on b3
if (i + 3 >= end) {
return Charset.forName("ISO-2022-JP"); // can't tell, JP most common
}
int b3paren = bytes[i + 3] & 0xFF;
if (b3paren == 0x43) return Charset.forName("ISO-2022-KR"); // ESC $ ) C
if (b3paren == 0x41 || b3paren == 0x47) return Charset.forName("ISO-2022-CN");
return Charset.forName("ISO-2022-JP");
case 0x2A: // ESC $ * ��� CNS 11643 plane 2
return Charset.forName("ISO-2022-CN");
default:
return Charset.forName("ISO-2022-JP"); // unknown designation ��� JP default
}
}
if (b1 == 0x28) { // ESC ( ��� single-byte designation (e.g. ASCII restore)
// These appear in all ISO-2022 variants and don't help distinguish
continue;
}
}
return null; // no ISO-2022 escape found
}
/**
* Returns {@code true} if HZ-GB-2312 switching sequences are present.
*
* <p>HZ is a 7-bit encoding: it uses {@code ~\{} ({@code 0x7E 0x7B}) to
* enter two-byte GB2312 mode and {@code ~\}} ({@code 0x7E 0x7D}) to return
* to ASCII mode. Like ISO-2022, all bytes are below 0x80, so the model
* would see no features and must be bypassed with this structural check.</p>
*/
public static boolean checkHz(byte[] bytes) {
return checkHz(bytes, 0, bytes.length);
}
public static boolean checkHz(byte[] bytes, int offset, int length) {
int end = offset + length - 1;
for (int i = offset; i < end; i++) {
if ((bytes[i] & 0xFF) == 0x7E) {
int next = bytes[i + 1] & 0xFF;
if (next == 0x7B || next == 0x7D) { // ~{ or ~}
return true;
}
}
}
return false;
}
/**
* Detects IBM424 (EBCDIC Hebrew) by examining the sub-0x80 byte landscape.
*
* <h3>Why this is needed</h3>
* <p>In EBCDIC, the space character is {@code 0x40} (not {@code 0x20} as in
* ASCII). In IBM424 specifically, the 22 Hebrew base letters plus their five
* final forms occupy three byte clusters entirely below {@code 0x80}:</p>
* <pre>
* 0x41���0x49 alef ��� tet (9 letters)
* 0x51���0x59 yod ��� samekh (9 letters)
* 0x62���0x6A ayin ��� tav (9 letters + final-pe, tsadi, etc.)
* </pre>
* <p>The statistical model ignores all bytes below {@code 0x80}, so these
* letters are invisible to it. This structural rule detects them directly.</p>
*
* <h3>Algorithm</h3>
* <ol>
* <li><b>EBCDIC gate:</b> byte {@code 0x40} (EBCDIC space) must appear
* significantly more often than {@code 0x20} (ASCII space). In normal
* Latin text {@code 0x40} is the rare {@code @} character; in any EBCDIC
* text it is the word separator and appears at ~10���20% of bytes.</li>
* <li><b>Hebrew letter gate:</b> the combined frequency of bytes in the
* three Hebrew clusters above must exceed {@value #IBM424_HEBREW_THRESHOLD}
* of the sample length. Genuine Hebrew text has ~65% of its
* printable characters in these ranges. ASCII text with the same byte
* values (upper-case A���I, Q���Y, lower-case b���j) stays well below this
* threshold in practice.</li>
* </ol>
*
* @return {@code true} if the byte stream is almost certainly IBM424
*/
public static boolean checkIbm424(byte[] bytes) {
return checkIbm424(bytes, 0, bytes.length);
}
public static boolean checkIbm424(byte[] bytes, int offset, int length) {
if (length < 8) {
return false;
}
int sample = Math.min(length, 4096);
int end = offset + sample;
int ebcdicSpace = 0; // 0x40 ��� EBCDIC word separator
int asciiSpace = 0; // 0x20 ��� ASCII word separator
int hebrewBytes = 0; // 0x41-0x49, 0x51-0x59, 0x62-0x6A
int highBytes = 0; // bytes >= 0x80
int prev = -1;
for (int i = offset; i < end; i++) {
int b = bytes[i] & 0xFF;
if (b == 0x40) {
// In Shift_JIS, 0x40 appears only as a trail byte after a lead byte
// (0x81���0x9F or 0xE0���0xFC). Discount it as EBCDIC space in that case.
boolean isShiftJisTrail = (prev >= 0x81 && prev <= 0x9F)
|| (prev >= 0xE0 && prev <= 0xFC);
if (!isShiftJisTrail) {
ebcdicSpace++;
}
} else if (b == 0x20) {
asciiSpace++;
} else if ((b >= 0x41 && b <= 0x49)
|| (b >= 0x51 && b <= 0x59)
|| (b >= 0x62 && b <= 0x6A)) {
hebrewBytes++;
}
if (b >= 0x80) {
highBytes++;
}
prev = b;
}
// Gate 1: 0x40 must dominate over 0x20 (EBCDIC vs ASCII whitespace).
// We require 0x40 to be at least 3�� as frequent as 0x20, and appear
// at least 3% of the sample (rules out near-empty / binary content).
boolean ebcdicLikely = ebcdicSpace >= sample * 0.03
&& ebcdicSpace > asciiSpace * 3;
// Gate 2: Hebrew letter density must exceed the threshold.
boolean hebrewDense = hebrewBytes > sample * IBM424_HEBREW_THRESHOLD;
// Gate 3: IBM424 Hebrew has ALL letters below 0x80; IBM420 Arabic EBCDIC
// maps many Arabic letters to bytes >= 0x80 (e.g. ��=0xB1, ��=0xBB, ��=0xBD).
// If the high-byte ratio is substantial, this is IBM420, not IBM424.
boolean lowByteCharset = highBytes < sample * IBM424_MAX_HIGH_BYTE_RATIO;
return ebcdicLikely && hebrewDense && lowByteCharset;
}
/**
* Minimum fraction of bytes in IBM424 Hebrew letter positions (0x41���0x49,
* 0x51���0x59, 0x62���0x6A) required to confirm IBM424. Set conservatively to
* avoid false positives on ASCII text where those same byte values are
* upper-/lower-case Latin letters.
*/
private static final double IBM424_HEBREW_THRESHOLD = 0.12;
/**
* IBM424 (EBCDIC Hebrew) has all letter bytes below 0x80. IBM420 (EBCDIC
* Arabic) maps many Arabic letters to bytes >= 0x80 (lam=0xB1, meem=0xBB,
* etc.), giving it a high-byte ratio of ~40-50%. Reject the IBM424 gate if
* the high-byte ratio exceeds this threshold ��� the probe is IBM420 or another
* high-byte EBCDIC variant, not Hebrew.
*/
private static final double IBM424_MAX_HIGH_BYTE_RATIO = 0.15;
/**
* Detects IBM500 (International EBCDIC / EBCDIC-500) by looking for the
* combination of the EBCDIC space byte and high-byte Latin letter density.
*
* <h3>Why this is needed</h3>
* <p>In IBM500 every Latin letter is encoded as a byte ≥ 0x80:</p>
* <pre>
* 0x81���0x89 a���i (lowercase)
* 0x91���0x99 j���r (lowercase)
* 0xA2���0xA9 s���z (lowercase)
* 0xC1���0xC9 A���I (uppercase)
* 0xD1���0xD9 J���R (uppercase)
* 0xE2���0xE9 S���Z (uppercase)
* </pre>
* <p>At full probe length the statistical model distinguishes IBM500 from
* IBM424 without difficulty. At very short probes (��� 20 bytes) the model
* sees too few bytes to be confident and tends to confuse the two EBCDIC
* code pages. This structural gate fires early ��� before the model ��� using
* the cheap EBCDIC-space dominance check followed by a Latin-letter density
* check.</p>
*
* <h3>Algorithm</h3>
* <ol>
* <li><b>EBCDIC gate:</b> same as {@link #checkIbm424} ��� byte {@code 0x40}
* must dominate over {@code 0x20}. This distinguishes any EBCDIC
* encoding from ASCII/UTF-8/Latin-1 where {@code 0x40} is the rare
* {@code @} character.</li>
* <li><b>Latin letter density:</b> the combined frequency of bytes in the
* six IBM500 Latin-letter clusters above must exceed
* {@value #IBM500_LATIN_THRESHOLD} of the sample. Normal Latin text
* has ~60���70% letter bytes; the threshold is intentionally conservative
* to fire reliably at 20 bytes.</li>
* </ol>
*
* <p>{@link #checkIbm424} should be called first. If it fires the probe is
* IBM424 (Hebrew EBCDIC); only if it does not fire should this method be
* consulted for IBM500.</p>
*
* @return {@code true} if the byte stream is almost certainly IBM500
*/
public static boolean checkIbm500(byte[] bytes) {
return checkIbm500(bytes, 0, bytes.length);
}
public static boolean checkIbm500(byte[] bytes, int offset, int length) {
if (length < 8) {
return false;
}
int sample = Math.min(length, 4096);
int end = offset + sample;
int ebcdicSpace = 0; // 0x40 ��� EBCDIC word separator
int asciiSpace = 0; // 0x20 ��� ASCII word separator
int latinBytes = 0; // bytes in the six IBM500 Latin-letter clusters
int prev = -1;
for (int i = offset; i < end; i++) {
int b = bytes[i] & 0xFF;
if (b == 0x40) {
// Exclude Shift_JIS trail bytes (same guard as checkIbm424)
boolean isShiftJisTrail = (prev >= 0x81 && prev <= 0x9F)
|| (prev >= 0xE0 && prev <= 0xFC);
if (!isShiftJisTrail) {
ebcdicSpace++;
}
} else if (b == 0x20) {
asciiSpace++;
} else if ((b >= 0x81 && b <= 0x89) // a���i
|| (b >= 0x91 && b <= 0x99) // j���r
|| (b >= 0xA2 && b <= 0xA9) // s���z
|| (b >= 0xC1 && b <= 0xC9) // A���I
|| (b >= 0xD1 && b <= 0xD9) // J���R
|| (b >= 0xE2 && b <= 0xE9)) { // S���Z
latinBytes++;
}
prev = b;
}
// Gate 1: EBCDIC space must dominate over ASCII space.
boolean ebcdicLikely = ebcdicSpace >= sample * 0.03
&& ebcdicSpace > asciiSpace * 3;
// Gate 2: Latin letter density must exceed the threshold.
boolean latinDense = latinBytes > sample * IBM500_LATIN_THRESHOLD;
return ebcdicLikely && latinDense;
}
/**
* Minimum fraction of bytes in IBM500 Latin letter positions required to
* confirm IBM500. Latin text has ~60���70% letter bytes; 0.25 is conservative
* enough to fire at 20 bytes while avoiding false positives on binary data.
*/
private static final double IBM500_LATIN_THRESHOLD = 0.25;
/**
* Returns {@code true} if the probe contains at least one CRLF pair
* ({@code 0x0D 0x0A}).
*
* <p>Files originating on Windows use CRLF as the line separator.
* The presence of a {@code 0x0D 0x0A} pair in a probe that is otherwise
* 7-bit ASCII is weak evidence that the file was created on Windows and
* therefore more likely to use a Windows code page (e.g. windows-1252)
* than a Unix-origin ISO-8859-X encoding for any high-byte content
* beyond the probe window.</p>
*
* <p>A bare {@code 0x0D} without a following {@code 0x0A} is <em>not</em>
* counted: classic Mac OS used bare CR as its line ending, and that is a
* different case that does not imply Windows origin.</p>
*/
public static boolean hasCrlfBytes(byte[] bytes) {
return hasCrlfBytes(bytes, 0, bytes.length);
}
public static boolean hasCrlfBytes(byte[] bytes, int offset, int length) {
int end = offset + length;
for (int i = offset; i < end - 1; i++) {
if (bytes[i] == 0x0D && bytes[i + 1] == 0x0A) {
return true;
}
}
return false;
}
/**
* Returns {@code true} if the probe contains any byte in the C1 control
* range {@code 0x80���0x9F}.
*
* <p>In every ISO-8859-X encoding those byte values are C1 control
* characters that never appear in real text. In every Windows-12XX
* encoding they are printable characters (smart quotes, Euro sign,
* em-dash, ���). Their presence is therefore definitive proof that the
* content is <em>not</em> a valid ISO-8859-X encoding and should be
* attributed to the corresponding Windows-12XX variant instead.</p>
*/
public static boolean hasC1Bytes(byte[] bytes) {
return hasC1Bytes(bytes, 0, bytes.length);
}
public static boolean hasC1Bytes(byte[] bytes, int offset, int length) {
int end = offset + length;
for (int i = offset; i < end; i++) {
int v = bytes[i] & 0xFF;
if (v >= 0x80 && v <= 0x9F) {
return true;
}
}
return false;
}
/**
* Returns {@code true} if the probe contains at least one GB18030-specific
* 4-byte sequence.
*
* <h3>GB18030 4-byte structure</h3>
* <pre>
* Byte 1 (lead): 0x81���0xFE
* Byte 2 (second): 0x30���0x39 ��� ASCII digits
* Byte 3 (third): 0x81���0xFE
* Byte 4 (trail): 0x30���0x39 ��� ASCII digits
* </pre>
*
* <p>In GBK and GB2312 all trail bytes are in {@code 0x40���0xFE}, so a digit
* ({@code 0x30���0x39}) in the second or fourth position is impossible. A single
* matching 4-tuple is therefore definitive proof that the content was encoded
* with GB18030 and must be decoded with a GB18030-capable codec to avoid
* replacement characters for the affected code points.</p>
*/
public static boolean hasGb18030FourByteSequence(byte[] bytes) {
return hasGb18030FourByteSequence(bytes, 0, bytes.length);
}
public static boolean hasGb18030FourByteSequence(byte[] bytes, int offset, int length) {
int end = offset + length - 3;
for (int i = offset; i < end; i++) {
int b0 = bytes[i] & 0xFF;
if (b0 < 0x81 || b0 > 0xFE) {
continue;
}
int b1 = bytes[i + 1] & 0xFF;
if (b1 < 0x30 || b1 > 0x39) {
continue;
}
int b2 = bytes[i + 2] & 0xFF;
if (b2 < 0x81 || b2 > 0xFE) {
continue;
}
int b3 = bytes[i + 3] & 0xFF;
if (b3 < 0x30 || b3 > 0x39) {
continue;
}
return true;
}
return false;
}
/** @deprecated Use {@link #detectIso2022} which distinguishes JP/KR/CN. */
@Deprecated
public static boolean checkIso2022Jp(byte[] bytes) {
return detectIso2022(bytes) != null;
}
/**
* Validates the UTF-8 byte grammar of the sample and returns one of three
* outcomes:
* <ul>
* <li>{@link Utf8Result#DEFINITIVE_UTF8}: all multi-byte sequences are
* valid <em>and</em> the sample contains enough high bytes to be
* informative. Use UTF-8.</li>
* <li>{@link Utf8Result#NOT_UTF8}: at least one invalid byte sequence was
* found. Remove UTF-8 from the candidate set.</li>
* <li>{@link Utf8Result#AMBIGUOUS}: the sample is structurally valid UTF-8
* but contains very few high bytes (almost pure ASCII), so validity is
* uninformative. Pass to the model.</li>
* </ul>
*/
public static Utf8Result checkUtf8(byte[] bytes) {
return checkUtf8(bytes, 0, bytes.length);
}
public static Utf8Result checkUtf8(byte[] bytes, int offset, int length) {
int highByteCount = 0;
int i = offset;
int end = offset + length;
while (i < end) {
int b = bytes[i] & 0xFF;
if (b < 0x80) {
i++;
continue;
}
highByteCount++;
// Determine expected continuation count from the lead byte
int seqLen;
if (b >= 0xF8) {
// 5-/6-byte sequences are not valid Unicode
return Utf8Result.NOT_UTF8;
} else if (b >= 0xF0) {
seqLen = 4;
} else if (b >= 0xE0) {
seqLen = 3;
} else if (b >= 0xC0) {
seqLen = 2;
} else {
// 0x80���0xBF is a continuation byte without a lead ��� invalid
return Utf8Result.NOT_UTF8;
}
// Overlong 2-byte sequence (C0 or C1 lead)
if (seqLen == 2 && b <= 0xC1) {
return Utf8Result.NOT_UTF8;
}
// Check that the right number of continuation bytes follow
for (int k = 1; k < seqLen; k++) {
if (i + k >= end) {
// Truncated sequence at end of sample ��� treat as ambiguous
// (the sample just ran out, not necessarily bad data)
break;
}
int cb = bytes[i + k] & 0xFF;
if (cb < 0x80 || cb > 0xBF) {
return Utf8Result.NOT_UTF8;
}
}
// Validate scalar value ranges for 3- and 4-byte sequences
if (seqLen == 3) {
int cp = ((b & 0x0F) << 12)
| ((i + 1 < end ? bytes[i + 1] & 0xFF : 0) & 0x3F) << 6
| ((i + 2 < end ? bytes[i + 2] & 0xFF : 0) & 0x3F);
// Overlong encoding (< U+0800) or surrogate pair range
if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
return Utf8Result.NOT_UTF8;
}
} else if (seqLen == 4) {
int cp = ((b & 0x07) << 18)
| ((i + 1 < end ? bytes[i + 1] & 0xFF : 0) & 0x3F) << 12
| ((i + 2 < end ? bytes[i + 2] & 0xFF : 0) & 0x3F) << 6
| ((i + 3 < end ? bytes[i + 3] & 0xFF : 0) & 0x3F);
// Overlong or above U+10FFFF
if (cp < 0x10000 || cp > 0x10FFFF) {
return Utf8Result.NOT_UTF8;
}
}
i += seqLen;
}
// Grammar is valid. Was there enough evidence?
double highRatio = length > 0 ? (double) highByteCount / length : 0.0;
if (highRatio >= MIN_HIGH_BYTE_RATIO_FOR_UTF8) {
return Utf8Result.DEFINITIVE_UTF8;
}
return Utf8Result.AMBIGUOUS;
}
// -----------------------------------------------------------------------
// Result type
// -----------------------------------------------------------------------
/**
* Outcome of the UTF-8 structural check.
*/
public enum Utf8Result {
/** Sample is structurally valid UTF-8 with enough high bytes to be sure. */
DEFINITIVE_UTF8,
/** Sample contains at least one invalid UTF-8 sequence. */
NOT_UTF8,
/**
* Sample is structurally valid but nearly all-ASCII. Cannot confirm or
* deny; pass to the statistical model.
*/
AMBIGUOUS;
public boolean isDefinitive() {
return this != AMBIGUOUS;
}
public Charset toCharset() {
if (this == DEFINITIVE_UTF8) {
return StandardCharsets.UTF_8;
}
throw new IllegalStateException("Only DEFINITIVE_UTF8 has a Charset: " + this);
}
}
}