Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/escape.py: 8%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

119 statements  

1"""Early detection of escape-sequence-based encodings (ISO-2022, HZ-GB-2312, UTF-7). 

2 

3These encodings use ESC (0x1B), tilde (~), or plus (+) sequences to switch 

4character sets. They must be detected before binary detection (ESC is a control 

5byte) and before ASCII detection (HZ-GB-2312 and UTF-7 use only printable ASCII 

6bytes plus their respective shift markers). 

7 

8Note: ``from __future__ import annotations`` is intentionally omitted because 

9this module is compiled with mypyc, which does not support PEP 563 string 

10annotations. 

11""" 

12 

13from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult 

14 

15 

def _has_valid_hz_regions(data: bytes) -> bool:
    """Report whether any ``~{``...``~}`` span holds plausible GB2312 pairs.

    Inside HZ-GB-2312 GB mode every character is a pair of bytes, each in
    the 0x21-0x7E range. A span qualifies when the bytes between the two
    markers are non-empty, even in number, and all within that range.

    Spans are scanned left to right; scanning stops with ``False`` as soon
    as an opening marker has no closing marker after it.

    :param data: The raw byte data to examine.
    :returns: ``True`` if at least one qualifying span exists.
    """
    opener = data.find(b"~{", 0)
    while opener != -1:
        body_start = opener + 2
        closer = data.find(b"~}", body_start)
        if closer == -1:
            # An unterminated region can never validate; nothing left to scan.
            break
        payload = data[body_start:closer]
        size = len(payload)
        # Non-empty and even implies a length of at least two bytes.
        if (
            size
            and size % 2 == 0
            and not any(b < 0x21 or b > 0x7E for b in payload)
        ):
            return True
        # Resume the search just past the closing marker.
        opener = data.find(b"~{", closer + 2)
    return False

40 

41 

# Base64 alphabet used inside UTF-7 shifted sequences (+<Base64>-)
_B64_CHARS: bytes = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
_UTF7_BASE64: frozenset[int] = frozenset(_B64_CHARS)

# Lookup table mapping each Base64 byte to its 6-bit value (0-63).
_B64_DECODE: dict[int, int] = {c: i for i, c in enumerate(_B64_CHARS)}


def _is_valid_utf7_b64(b64_bytes: bytes) -> bool:
    """Check if base64 bytes decode to valid UTF-16BE with correct padding.

    A valid UTF-7 shifted sequence must:
    1. Contain at least 3 Base64 characters (18 bits, enough for one 16-bit
       UTF-16 code unit).
    2. Have zero-valued trailing padding bits: *every* bit left over after
       the last complete 16-bit code unit must be zero, even when those bits
       span more than one Base64 character.
    3. Decode to valid UTF-16BE — no lone surrogates.

    This rejects accidental ``+<alphanum>-`` patterns found in URLs, MIME
    boundaries, hex-encoded hashes (e.g. SHA-1 git refs), and other ASCII data.

    The caller (``_has_valid_utf7_sequences``) already checks ``b64_len >= 3``
    before calling this function, so *b64_bytes* is always at least 3 bytes.

    :param b64_bytes: The Base64 characters following a ``+`` shift; every
        byte must be a member of the Base64 alphabet.
    :returns: ``True`` if the run is a well-formed UTF-7 shifted sequence.
    :raises KeyError: If *b64_bytes* contains a byte outside the alphabet.
    """
    num_bytes = (len(b64_bytes) * 6) // 8
    raw = bytearray(num_bytes)
    bit_buf = 0
    bit_count = 0
    out_idx = 0
    for c in b64_bytes:
        bit_buf = (bit_buf << 6) | _B64_DECODE[c]
        bit_count += 6
        if bit_count >= 8:
            bit_count -= 8
            raw[out_idx] = (bit_buf >> bit_count) & 0xFF
            out_idx += 1
            # Keep only the bits not yet emitted so the accumulator stays
            # small; otherwise it grows with the input and every shift
            # becomes proportional to the run length.
            bit_buf &= (1 << bit_count) - 1

    # Padding check. The leftover consists of the sub-byte remainder still
    # in ``bit_buf`` plus, when an odd number of bytes was produced, the
    # final byte that does not complete a 16-bit code unit.
    if bit_buf:
        return False
    if num_bytes % 2 and raw[num_bytes - 1]:
        return False

    # Validate the complete code units as UTF-16BE. Lone surrogates
    # (unpaired 0xD800-0xDFFF code units) are illegal in well-formed UTF-16
    # and cannot appear in real UTF-7 text. This catches hex-encoded hashes
    # and other accidental base64-like sequences.
    prev_high = False
    for i in range(0, num_bytes - 1, 2):
        code_unit = (raw[i] << 8) | raw[i + 1]
        if 0xD800 <= code_unit <= 0xDBFF:  # high surrogate
            if prev_high:
                return False  # consecutive high surrogates
            prev_high = True
        elif 0xDC00 <= code_unit <= 0xDFFF:  # low surrogate
            if not prev_high:
                return False  # lone low surrogate
            prev_high = False
        else:
            if prev_high:
                return False  # high surrogate not followed by low surrogate
            prev_high = False
    # A trailing high surrogate with no partner is also ill-formed.
    return not prev_high

109 

110 

def _is_embedded_in_base64(data: bytes, pos: int) -> bool:
    """Return True if the ``+`` at *pos* is embedded in a base64 stream.

    Walks backward from *pos*, skipping CR/LF, and counts consecutive base64
    characters (including ``=`` for padding). If 4 or more are found, the
    ``+`` is likely part of a PEM certificate, email attachment, or similar
    base64 blob rather than a real UTF-7 shift character.

    The walk stops as soon as the fourth character is seen. ``+`` is itself
    a base64 character, so without the early exit every ``+`` inside a large
    blob would rescan the whole prefix.

    :param data: The raw byte data being examined.
    :param pos: Index of the ``+`` byte within *data*.
    :returns: ``True`` if at least 4 base64 characters directly precede
        *pos* (ignoring CR/LF).
    """
    count = 0
    i = pos - 1
    while i >= 0:
        b = data[i]
        if b == 0x0A or b == 0x0D:  # skip newlines
            i -= 1
            continue
        # 0x3D is '=' (base64 padding); anything else must be in the alphabet.
        if b != 0x3D and b not in _UTF7_BASE64:
            break
        count += 1
        if count >= 4:
            return True
        i -= 1
    return False

133 

134 

def _has_valid_utf7_sequences(data: bytes) -> bool:
    """Report whether *data* holds at least one plausible UTF-7 shift run.

    A shift run is ``+`` followed by Base64 characters and ended by ``-``,
    by any non-Base64 byte, or by the end of the data (RFC 2152). The run
    must be at least 3 characters long and pass ``_is_valid_utf7_b64``.
    ``+-`` is a literal plus sign and never counts.

    :param data: The raw byte data to examine.
    :returns: ``True`` on the first run that passes every guard, ``False``
        once no ``+`` remains.
    """
    size = len(data)
    plus = ord("+")
    minus = ord("-")
    cursor = 0
    while True:
        shift_pos = data.find(plus, cursor)
        if shift_pos == -1:
            return False
        run_start = shift_pos + 1  # first byte after the '+'
        if run_start < size:
            following = data[run_start]
            if following == minus:
                # "+-" is an escaped literal plus, not a shift.
                cursor = run_start + 1
                continue
            if following == plus:
                # Guard A: a run that opens with '+' would decode to code
                # units in 0xF800-0xFBFF, which is not expected in real
                # text; this is what "C++20" or "++row" look like. Skip the
                # entire streak of '+' so none of them is retried as a
                # shift character.
                run_start += 1
                while run_start < size and data[run_start] == plus:
                    run_start += 1
                cursor = run_start
                continue
        # Guard B: a '+' sitting inside a base64 blob (PEM, attachment, ...)
        # is payload, not a shift character.
        if _is_embedded_in_base64(data, shift_pos):
            cursor = run_start
            continue
        # Measure the run of Base64 characters that follows the '+'.
        run_end = run_start
        while run_end < size and data[run_end] in _UTF7_BASE64:
            run_end += 1
        # Whatever happens below, the next search resumes after this run.
        cursor = run_end
        if run_end - run_start < 3:
            # Too short to carry even one 16-bit code unit.
            continue
        payload = data[run_start:run_end]
        # Guard C: skip runs whose letters are all lowercase ("row", "foo",
        # "pos", ...): these are almost always identifiers or English words
        # that happen to follow a '+'. bytes.islower() is True only when
        # the run has at least one lowercase letter and no uppercase one;
        # digits, '+' and '/' are ignored, so a run without any letters is
        # not filtered here. Out of 71,510 real UTF-7 base64 blocks in the
        # test corpus, only 4 lack uppercase letters (0.006%).
        if payload.islower():
            continue
        # Accept when the run decodes to well-formed UTF-16BE with zero
        # padding bits.
        if _is_valid_utf7_b64(payload):
            return True

193 

194 

def detect_escape_encoding(data: bytes) -> DetectionResult | None:
    """Detect ISO-2022, HZ-GB-2312, and UTF-7 from escape/tilde/plus sequences.

    Checks run in a fixed order and the first match wins: the ISO-2022
    family (only when an ESC byte is present), then HZ-GB-2312, then UTF-7.
    Every positive result carries ``DETERMINISTIC_CONFIDENCE``.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if an escape encoding is found, or ``None``.
    """
    if b"\x1b" in data:
        # ISO-2022-JP-2004: JIS X 0213 designations are unique to this variant.
        if b"\x1b$(O" in data or b"\x1b$(P" in data or b"\x1b$(Q" in data:
            return DetectionResult(
                encoding="iso2022_jp_2004",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ja",
            )

        # ISO-2022-JP-EXT: JIS X 0201 Kana designation is unique to this variant.
        if b"\x1b(I" in data:
            return DetectionResult(
                encoding="iso2022_jp_ext",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ja",
            )

        # ISO-2022-JP base: JIS X 0208/0201/0212 designations.
        if (
            b"\x1b$B" in data
            or b"\x1b$@" in data
            or b"\x1b(J" in data
            or b"\x1b$(D" in data  # JIS X 0212-1990 (JP-1/JP-2/JP-EXT)
        ):
            # SO (0x0E) and SI (0x0F) shift controls both present -> JP-EXT
            if b"\x0e" in data and b"\x0f" in data:
                return DetectionResult(
                    encoding="iso2022_jp_ext",
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language="ja",
                )
            # Default to JP-2: a strict superset of JP and JP-1 that
            # decodes all base sequences correctly.
            return DetectionResult(
                encoding="iso2022_jp_2",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ja",
            )

        # ISO-2022-KR: ESC sequence for KS C 5601
        if b"\x1b$)C" in data:
            return DetectionResult(
                encoding="iso2022_kr",
                confidence=DETERMINISTIC_CONFIDENCE,
                language="ko",
            )

    # HZ-GB-2312: tilde escapes for GB2312.
    # Require valid GB2312 byte pairs (0x21-0x7E range) between ~{ and ~}
    # markers. The two cheap substring tests gate the region scan.
    if b"~{" in data and b"~}" in data and _has_valid_hz_regions(data):
        return DetectionResult(
            encoding="hz",
            confidence=DETERMINISTIC_CONFIDENCE,
            language="zh",
        )

    # UTF-7: plus-sign shifts into Base64-encoded Unicode.
    # UTF-7 is a 7-bit encoding (RFC 2152): every byte must be in 0x00-0x7F,
    # so data containing any byte > 0x7F cannot be UTF-7. bytes.isascii()
    # performs exactly that test.
    if b"+" in data and data.isascii() and _has_valid_utf7_sequences(data):
        return DetectionResult(
            encoding="utf-7",
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    return None