"""Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16- and UTF-32-encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

import unicodedata

from chardet.pipeline import ASCII_TEXT_BYTES, DETERMINISTIC_CONFIDENCE, DetectionResult

# How many bytes to sample for pattern analysis
_SAMPLE_SIZE = 4096

# Minimum bytes needed for reliable pattern detection
_MIN_BYTES_UTF32 = 16  # 4 full code units
_MIN_BYTES_UTF16 = 10  # 5 full code units

# Minimum fraction of null bytes in the expected position for UTF-16.
# CJK-heavy UTF-16 text (Chinese, Japanese, Korean) can have as few as
# ~4.5% null bytes in the expected position, since CJK codepoints have
# non-zero high bytes. The validation step (decode + text-quality check)
# prevents false positives from binary files at this lower threshold.
_UTF16_MIN_NULL_FRACTION = 0.03
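# Illustrative data point: "中文" (U+4E2D U+6587) is 4E 2D 65 87 in UTF-16-BE,
# with no null bytes at all; in CJK text the positional nulls come mostly
# from interspersed ASCII such as spaces, newlines, and punctuation.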

# Minimum text-quality score to accept a UTF-16 candidate when both
# endiannesses show null-byte patterns. A score of 0.5 corresponds to
# roughly 50% letters with no ASCII bonus (or ~40% with whitespace
# present) — sufficient to distinguish real text from coincidental byte
# patterns.
_MIN_TEXT_QUALITY = 0.5

# Minimum fraction of printable characters for a decoded sample to be
# considered text rather than binary data.
_MIN_PRINTABLE_FRACTION = 0.7

# Maximum null fraction (in the candidate null-byte position) below which
# the data is checked for a null-separator pattern. If the null fraction
# is below this AND all non-null bytes are printable ASCII, the candidate
# is rejected as a null-separator false positive rather than real UTF-16.
# Real Latin UTF-16 has ~50% nulls; CJK UTF-16 has fewer but non-ASCII
# non-null bytes. 15% is generous — separator data is typically 1-5%.
_NULL_SEPARATOR_MAX_FRACTION = 0.15

# ASCII_TEXT_BYTES plus the null byte — used by the null-separator guard
# to check whether non-null bytes are all printable ASCII.
_NULL_SEPARATOR_ALLOWED: bytes = b"\x00" + ASCII_TEXT_BYTES
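# Note: bytes.translate(None, delete) removes every byte listed in *delete*,
# so in the guard below an empty result means the sample contained nothing
# but nulls and ASCII text bytes.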


def _is_null_separator_pattern(data: bytes, null_frac: float) -> bool:
    """Return True if the data looks like ASCII with null-byte separators.

    :param data: The raw byte sample to examine.
    :param null_frac: The positional null fraction for this UTF-16 candidate
        (i.e. fraction of null bytes in even positions for BE, or odd
        positions for LE) — not the total null fraction across all bytes.

    Checks two conditions:

    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``.
    2. Every non-null byte is printable ASCII or common whitespace.

    When both conditions are met, the nulls are likely field separators
    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
    """
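    # Illustrative: typical ``find -print0`` output (long ASCII paths, one
    # null per record) has a positional null fraction of only a few percent
    # and every non-null byte printable, so both conditions hold and the
    # sample is rejected as a UTF-16 candidate.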

    if null_frac >= _NULL_SEPARATOR_MAX_FRACTION:
        return False
    return not data.translate(None, _NULL_SEPARATOR_ALLOWED)


def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
    """Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    """
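    # Example (illustrative, not a doctest):
    #     detect_utf1632_patterns("hello world".encode("utf-16-le"))
    # returns a utf-16-le result: every odd-position byte of the sample is
    # null, so only the LE candidate survives, and the decoded text passes
    # the validation checks.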

    sample = data[:_SAMPLE_SIZE]

    if len(sample) < _MIN_BYTES_UTF16:
        return None

    # Check UTF-32 first (more specific pattern)
    result = _check_utf32(sample)
    if result is not None:
        return result

    # Then check UTF-16
    return _check_utf16(sample)


def _check_utf32(data: bytes) -> DetectionResult | None:
    """Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
    """
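    # Byte-layout examples: "A" (U+0041) is 00 00 00 41 in UTF-32-BE and
    # 41 00 00 00 in UTF-32-LE; the non-BMP U+1F600 (GRINNING FACE) is
    # 00 01 F6 00 in UTF-32-BE, so its second byte is non-zero. This is why
    # the second-byte test below requires only a majority (> 0.5) of units,
    # not all of them.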

    # Trim to a multiple of 4 bytes (like _check_utf16 trims to even length)
    trimmed_len = len(data) - (len(data) % 4)
    if trimmed_len < _MIN_BYTES_UTF32:
        return None
    data = data[:trimmed_len]

    num_units = trimmed_len // 4

    # UTF-32-BE: first byte of each 4-byte unit must be 0x00
    be_first_null = sum(1 for i in range(0, len(data), 4) if data[i] == 0)
    # Second byte is 0x00 for BMP characters (the vast majority of text)
    be_second_null = sum(1 for i in range(0, len(data), 4) if data[i + 1] == 0)

    if be_first_null == num_units and be_second_null / num_units > 0.5:
        try:
            text = data.decode("utf-32-be")
            if _looks_like_text(text):
                return DetectionResult(
                    encoding="utf-32-be",
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )
        except UnicodeDecodeError:
            pass

    # UTF-32-LE: last byte of each 4-byte unit must be 0x00
    le_last_null = sum(1 for i in range(3, len(data), 4) if data[i] == 0)
    # Third byte is 0x00 for BMP characters
    le_third_null = sum(1 for i in range(2, len(data), 4) if data[i] == 0)

    if le_last_null == num_units and le_third_null / num_units > 0.5:
        try:
            text = data.decode("utf-32-le")
            if _looks_like_text(text):
                return DetectionResult(
                    encoding="utf-32-le",
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )
        except UnicodeDecodeError:
            pass

    return None


def _check_utf16(data: bytes) -> DetectionResult | None:
    """Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes. For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, common
    control characters), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Text in non-UTF-16 single-byte encodings essentially never contains
    null bytes, so even a small null-byte fraction in alternating positions
    is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    """
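    # Byte-layout example: "Hi" is 00 48 00 69 in UTF-16-BE and 48 00 69 00
    # in UTF-16-LE, so for ASCII-heavy text one parity of positions is
    # almost entirely null and both endianness checks below can fire.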

    sample_len = min(len(data), _SAMPLE_SIZE)
    sample_len -= sample_len % 2
    if sample_len < _MIN_BYTES_UTF16:  # pragma: no cover - caller checks length
        return None

    num_units = sample_len // 2

    # Count null bytes in even positions (UTF-16-BE high byte for ASCII)
    be_null_count = sum(1 for i in range(0, sample_len, 2) if data[i] == 0)
    # Count null bytes in odd positions (UTF-16-LE high byte for ASCII)
    le_null_count = sum(1 for i in range(1, sample_len, 2) if data[i] == 0)

    be_frac = be_null_count / num_units
    le_frac = le_null_count / num_units

    candidates: list[tuple[str, float]] = []
    if le_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
        data[:sample_len], le_frac
    ):
        candidates.append(("utf-16-le", le_frac))
    if be_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
        data[:sample_len], be_frac
    ):
        candidates.append(("utf-16-be", be_frac))

    if not candidates:
        return None

    # If only one candidate, validate and return
    if len(candidates) == 1:
        encoding = candidates[0][0]
        try:
            text = data[:sample_len].decode(encoding)
            if _looks_like_text(text):
                return DetectionResult(
                    encoding=encoding,
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )
        except UnicodeDecodeError:
            pass
        return None

    # Both candidates matched (common for Latin-heavy text where every other
    # byte is null). Decode both and pick the one with higher text quality.
    best_encoding: str | None = None
    best_quality = -1.0

    for encoding, _ in candidates:
        try:
            text = data[:sample_len].decode(encoding)
        except UnicodeDecodeError:
            continue
        quality = _text_quality(text)
        if quality > best_quality:
            best_quality = quality
            best_encoding = encoding

    if best_encoding is not None and best_quality >= _MIN_TEXT_QUALITY:
        return DetectionResult(
            encoding=best_encoding,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    return None


def _looks_like_text(text: str) -> bool:
    """Quick check: is the decoded text mostly printable characters?"""
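    # For example, "héllo\n" passes (every character printable or allowed
    # whitespace), while a run of C0 controls such as "\x00\x01\x02" fails.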

    if not text:
        return False
    sample = text[:500]
    printable = sum(1 for c in sample if c.isprintable() or c in "\n\r\t")
    return printable / len(sample) > _MIN_PRINTABLE_FRACTION


def _text_quality(text: str, limit: int = 500) -> float:
    """Score how much *text* looks like real human-readable content.

    Returns a score in the range [-1.0, ~1.6). Higher values indicate
    more natural text. The practical maximum is 1.5 for all-ASCII-letter
    input (1.6 is approached as the sample grows with all ASCII letters
    plus some whitespace). A score of -1.0 means the content is almost
    certainly not valid text (too many control characters or combining
    marks).

    Scoring factors:

    * Base score: ratio of Unicode letters (category ``L*``) to sample length.
    * ASCII bonus: additional 0.5x weight for ASCII letters. This is the
      primary signal for disambiguating endianness — correct decoding of
      Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
    * Space bonus: +0.1 when the sample contains at least one whitespace
      character and is longer than 20 characters.
    * Rejection: returns -1.0 if >10% control characters or >20% combining
      marks (category ``M*``).
    """
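    # Worked example: "Hello world" (n=11) scores 10/11 for letters plus an
    # ASCII bonus of (10/11) * 0.5, with no space bonus since n <= 20,
    # giving roughly 1.36.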

    sample = text[:limit]
    n = len(sample)
    if n == 0:  # pragma: no cover - callers always pass non-empty text
        return -1.0

    letters = 0
    marks = 0
    spaces = 0
    controls = 0
    ascii_letters = 0

    for c in sample:
        cat = unicodedata.category(c)
        if cat[0] == "L":
            letters += 1
            if ord(c) < 128:
                ascii_letters += 1
        elif cat[0] == "M":
            marks += 1
        elif cat == "Zs" or c in "\n\r\t":
            spaces += 1
        elif cat[0] == "C":
            controls += 1

    # Reject data with many control characters or combining marks
    if controls / n > 0.1:
        return -1.0
    if marks / n > 0.2:
        return -1.0

    score = letters / n
    # ASCII letters strongly indicate correct endianness
    score += (ascii_letters / n) * 0.5
    # Real text usually contains some whitespace
    if n > 20 and spaces > 0:
        score += 0.1

    return score