1"""Early detection of escape-sequence-based encodings (ISO-2022, HZ-GB-2312, UTF-7).
2
3These encodings use ESC (0x1B), tilde (~), or plus (+) sequences to switch
4character sets. They must be detected before binary detection (ESC is a control
5byte) and before ASCII detection (HZ-GB-2312 and UTF-7 use only printable ASCII
6bytes plus their respective shift markers).
7
8Note: ``from __future__ import annotations`` is intentionally omitted because
9this module is compiled with mypyc, which does not support PEP 563 string
10annotations.
11"""
12
13from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult
14
15
def _has_valid_hz_regions(data: bytes) -> bool:
    """Check that at least one ``~{...~}`` region contains valid GB2312 byte pairs.

    In HZ-GB-2312 GB mode, characters are encoded as pairs of bytes in the
    0x21-0x7E range, so a region qualifies when it is a non-empty,
    even-length run of such bytes.

    In ASCII mode ``~~`` is an escaped literal tilde (RFC 1843).  A ``~{``
    whose tilde is the second half of such a pair is therefore plain text
    (``~`` followed by ``{``), not a shift into GB mode, and is skipped.

    :param data: The raw byte data to examine.
    :returns: ``True`` if a well-formed GB region is found, ``False`` otherwise.
    """
    tilde = 0x7E
    start = 0
    while True:
        begin = data.find(b"~{", start)
        if begin == -1:
            return False
        # Count the tildes immediately before the candidate marker, never
        # looking back past ``start`` (which may sit right after a ``~}``
        # terminator).  An odd count means this tilde closes a ``~~`` escape.
        run_start = begin
        while run_start > start and data[run_start - 1] == tilde:
            run_start -= 1
        if (begin - run_start) % 2 == 1:
            start = begin + 2
            continue
        end = data.find(b"~}", begin + 2)
        if end == -1:
            # No terminator anywhere after this point, so no later region
            # can be closed either.
            return False
        region = data[begin + 2 : end]
        # Must be non-empty, even length, and all bytes in GB2312 range
        if (
            len(region) >= 2
            and len(region) % 2 == 0
            and all(0x21 <= b <= 0x7E for b in region)
        ):
            return True
        start = end + 2
40
41
# Base64 alphabet used inside UTF-7 shifted sequences (+<Base64>-).  A byte's
# position in this string is its 6-bit value.
_B64_CHARS: bytes = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

# Decode table: each Base64 byte -> its 6-bit value (0-63), taken from the
# byte's position in the alphabet.
_B64_DECODE: dict[int, int] = dict(zip(_B64_CHARS, range(len(_B64_CHARS))))

# The same bytes as a set, for fast membership tests.
_UTF7_BASE64: frozenset[int] = frozenset(_B64_DECODE)
48
49
def _is_valid_utf7_b64(b64_bytes: bytes) -> bool:
    """Check if base64 bytes decode to valid UTF-16BE with correct padding.

    A valid UTF-7 shifted sequence must:
    1. Contain at least 3 Base64 characters (18 bits, enough for one 16-bit
       UTF-16 code unit).
    2. Have zero-valued trailing padding bits: *every* bit left over after
       the last complete 16-bit code unit must be zero.  Depending on the
       length that remainder can be wider than one sextet, so it is checked
       as a whole rather than only in the final character.
    3. Decode to valid UTF-16BE — no lone surrogates.

    This rejects accidental ``+<alphanum>-`` patterns found in URLs, MIME
    boundaries, hex-encoded hashes (e.g. SHA-1 git refs), and other ASCII data.

    The caller (``_has_valid_utf7_sequences``) already checks ``b64_len >= 3``
    before calling this function, so *b64_bytes* is always at least 3 bytes.

    :param b64_bytes: The Base64 characters of one shifted sequence, without
        the leading ``+`` or any terminator.
    :returns: ``True`` if the sequence passes all checks, ``False`` otherwise.
    :raises KeyError: If *b64_bytes* contains a byte outside the Base64
        alphabet (the caller only passes alphabet bytes).
    """
    # ``bit_buf`` holds only the bits not yet consumed by a code unit; it is
    # masked after each extraction so it stays small no matter how long the
    # run is (an ever-growing integer would make long runs quadratic).
    bit_buf = 0
    bit_count = 0
    pending_high = False  # True while a high surrogate awaits its low half
    for c in b64_bytes:
        bit_buf = (bit_buf << 6) | _B64_DECODE[c]
        bit_count += 6
        if bit_count < 16:
            continue
        bit_count -= 16
        code_unit = bit_buf >> bit_count
        bit_buf &= (1 << bit_count) - 1
        # Lone surrogates (unpaired 0xD800-0xDFFF code units) are illegal in
        # well-formed UTF-16 and cannot appear in real UTF-7 text.  This
        # catches hex-encoded hashes and other accidental base64-like runs.
        if 0xD800 <= code_unit <= 0xDBFF:  # high surrogate
            if pending_high:
                return False  # consecutive high surrogates
            pending_high = True
        elif 0xDC00 <= code_unit <= 0xDFFF:  # low surrogate
            if not pending_high:
                return False  # lone low surrogate
            pending_high = False
        elif pending_high:
            return False  # high surrogate not followed by low surrogate
    # Whatever remains in the accumulator is padding and must be all zero.
    if bit_buf:
        return False
    # A trailing high surrogate with no low half is also ill-formed.
    return not pending_high
109
110
def _is_embedded_in_base64(data: bytes, pos: int) -> bool:
    """Return True if the ``+`` at *pos* is embedded in a base64 stream.

    Walks backward from *pos*, skipping CR/LF, and counts consecutive base64
    characters (including ``=`` for padding). If 4 or more are found, the
    ``+`` is likely part of a PEM certificate, email attachment, or similar
    base64 blob rather than a real UTF-7 shift character.

    The walk stops as soon as the fourth character is seen: the answer is
    already decided, and scanning the rest of a large blob for every ``+``
    in it would make the caller quadratic.

    :param data: The raw byte data being examined.
    :param pos: Index of the ``+`` byte in *data*.
    :returns: ``True`` if at least 4 base64/padding characters directly
        precede *pos* (ignoring CR/LF), ``False`` otherwise.
    """
    required = 4
    count = 0
    i = pos - 1
    while i >= 0:
        b = data[i]
        i -= 1
        if b == 0x0A or b == 0x0D:  # skip newlines
            continue
        # 0x3D is '=' (base64 padding); anything else ends the run.
        if b not in _UTF7_BASE64 and b != 0x3D:
            return False
        count += 1
        if count >= required:
            return True
    return False
133
134
def _has_valid_utf7_sequences(data: bytes) -> bool:
    """Check that *data* contains at least one valid UTF-7 shifted sequence.

    A valid shifted sequence is ``+<base64 chars>`` terminated by either an
    explicit ``-`` or any non-Base64 character (per RFC 2152). The base64
    portion must decode to valid UTF-16BE with correct zero-padding bits.
    The sequence ``+-`` is a literal plus sign and is **not** counted.

    :param data: The raw byte data to examine.
    :returns: ``True`` as soon as one candidate passes every guard and the
        UTF-16BE validation, ``False`` if no ``+`` in *data* does.
    """
    plus = 0x2B  # '+'
    minus = 0x2D  # '-'
    size = len(data)
    start = 0
    while True:
        shift_pos = data.find(plus, start)
        if shift_pos == -1:
            return False
        pos = shift_pos + 1  # skip the '+'
        if pos < size:
            following = data[pos]
            # +- is a literal plus, not a shifted sequence
            if following == minus:
                start = pos + 1
                continue
            # Guard A: '+' as the first base64 character has value 62, which
            # puts the first code unit in U+F800-U+FBFF (private use,
            # compatibility and presentation forms) — rare in real text.
            # This catches patterns like "C++20" and "++row".  Skip past ALL
            # consecutive '+' characters so the next '+' in a run like
            # ``++`` or ``+++`` is not re-examined as a new shift character.
            if following == plus:
                while pos < size and data[pos] == plus:
                    pos += 1
                start = pos
                continue
        # Guard B: if the '+' is embedded in a base64 stream (PEM, email
        # attachment, etc.), it's not a real UTF-7 shift character.
        if _is_embedded_in_base64(data, shift_pos):
            start = pos
            continue
        # Collect consecutive Base64 characters
        i = pos
        while i < size and data[i] in _UTF7_BASE64:
            i += 1
        if i - pos >= 3:
            b64_data = data[pos:i]
            # Guard C: reject base64 blocks with no uppercase letters.
            # UTF-7 encodes UTF-16BE code points, and the high byte for
            # virtually every script (Latin Extended, Cyrillic, Arabic, CJK,
            # …) produces uppercase base64 characters.  Sequences without
            # any uppercase like "row", "foo", "pos" are almost always
            # variable names or English words that accidentally follow a
            # '+'.  Comparing against ``lower()`` is used instead of
            # ``islower()`` because ``islower()`` is False for blocks with
            # no letters at all (e.g. "+120" or a phone number), which would
            # let digit-only blocks through.  Out of 71,510 real UTF-7
            # base64 blocks in the test corpus, only 4 lack uppercase
            # letters (0.006%).
            has_uppercase = b64_data != b64_data.lower()
            # Accept if base64 content is valid UTF-16BE (padding bits check
            # prevents false positives). Terminator can be '-', any
            # non-Base64 byte, or end of data — all per RFC 2152.
            if has_uppercase and _is_valid_utf7_b64(b64_data):
                return True
        # ``i`` is never before ``pos``, so this always moves past the '+'.
        start = i
193
194
def detect_escape_encoding(data: bytes) -> DetectionResult | None:
    """Detect ISO-2022, HZ-GB-2312, and UTF-7 from escape/tilde/plus sequences.

    Checks run in a fixed order — ISO-2022-JP variants, ISO-2022-KR,
    HZ-GB-2312, then UTF-7 — and the first match is returned with
    ``DETERMINISTIC_CONFIDENCE``.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if an escape encoding is found, or ``None``.
    """
    has_esc = b"\x1b" in data
    has_tilde = b"~" in data
    has_plus = b"+" in data

    # None of the three marker bytes present: nothing here can match.
    if not (has_esc or has_tilde or has_plus):
        return None

    if has_esc:
        # ISO-2022-JP-2004: JIS X 0213 designations are unique to this variant.
        if b"\x1b$(O" in data or b"\x1b$(P" in data or b"\x1b$(Q" in data:
            return DetectionResult(
                encoding="iso2022_jp_2004",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ja",
            )

        # ISO-2022-JP-EXT: JIS X 0201 Kana designation is unique to this variant.
        if b"\x1b(I" in data:
            return DetectionResult(
                encoding="iso2022_jp_ext",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ja",
            )

        # ISO-2022-JP base: JIS X 0208/0201/0212 designations.
        if (
            b"\x1b$B" in data
            or b"\x1b$@" in data
            or b"\x1b(J" in data
            or b"\x1b$(D" in data  # JIS X 0212-1990 (JP-1/JP-2/JP-EXT)
        ):
            # SI/SO (0x0E / 0x0F) shift controls -> JP-EXT
            if b"\x0e" in data and b"\x0f" in data:
                return DetectionResult(
                    encoding="iso2022_jp_ext",
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language="ja",
                )
            # Default to JP-2: a strict superset of JP and JP-1 that
            # decodes all base sequences correctly.
            return DetectionResult(
                encoding="iso2022_jp_2",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ja",
            )

        # ISO-2022-KR: ESC sequence for KS C 5601
        if b"\x1b$)C" in data:
            return DetectionResult(
                encoding="iso2022_kr",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ko",
            )

    # HZ-GB-2312: tilde escapes for GB2312
    # Require valid GB2312 byte pairs (0x21-0x7E range) between ~{ and ~} markers.
    if has_tilde and b"~{" in data and b"~}" in data and _has_valid_hz_regions(data):
        return DetectionResult(
            encoding="hz",
            confidence=DETERMINISTIC_CONFIDENCE,
            language="zh",
        )

    # UTF-7: plus-sign shifts into Base64-encoded Unicode.
    # UTF-7 is a 7-bit encoding (RFC 2152): every byte must be in 0x00-0x7F.
    # Data with any byte > 0x7F cannot be UTF-7.  ``isascii()`` performs that
    # check in one pass without materialising each byte as an int.
    if has_plus and data.isascii() and _has_valid_utf7_sequences(data):
        return DetectionResult(
            encoding="utf-7",
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    return None