Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/utf1632.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

115 statements  

1"""Stage 1a+: UTF-16/UTF-32 detection for data without BOM. 

2 

3This stage runs after BOM detection but before binary detection. 

4UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns 

5that would otherwise cause binary detection to reject the data. 

6 

7Note: ``from __future__ import annotations`` is intentionally omitted because 

8this module is compiled with mypyc, which does not support PEP 563 string 

9annotations. 

10""" 

11 

12import unicodedata 

13 

14from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult 

15 

# How many bytes to sample for pattern analysis.
_SAMPLE_SIZE = 4096

# Minimum bytes needed for reliable pattern detection.
_MIN_BYTES_UTF32 = 16  # 4 full code units
_MIN_BYTES_UTF16 = 10  # 5 full code units

# Minimum fraction of null bytes in the expected position for UTF-16.
# CJK-heavy UTF-16 text (Chinese, Japanese, Korean) can have as few as
# ~4.5% null bytes in the expected position, since CJK codepoints have
# non-zero high bytes. The validation step (decode + text quality check)
# prevents false positives from binary files at this lower threshold.
_UTF16_MIN_NULL_FRACTION = 0.03

# Minimum text-quality score to accept a UTF-16 candidate when both
# endiannesses show null-byte patterns. A score of 0.5 corresponds to
# roughly 50% letters with no ASCII bonus (or ~40% with whitespace
# present) — sufficient to distinguish real text from coincidental byte
# patterns.
_MIN_TEXT_QUALITY = 0.5

# Minimum fraction of printable characters for a decoded sample to be
# considered text rather than binary data.
_MIN_PRINTABLE_FRACTION = 0.7

40 

41 

def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
    """Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    """
    sample = data[:_SAMPLE_SIZE]
    if len(sample) < _MIN_BYTES_UTF16:
        # Too little data for even the cheapest (UTF-16) pattern check.
        return None

    # UTF-32 has the more specific 4-byte structure, so it gets first shot;
    # only when it does not match do we fall through to the UTF-16 check.
    utf32_hit = _check_utf32(sample)
    if utf32_hit is not None:
        return utf32_hit
    return _check_utf16(sample)

62 

63 

def _check_utf32(data: bytes) -> DetectionResult | None:
    """Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):

    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:

    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00

    :param data: Sampled bytes to inspect.
    :returns: A validated :class:`DetectionResult`, or ``None``.
    """
    # Trim to a multiple of 4 bytes (like _check_utf16 trims to even length)
    trimmed_len = len(data) - (len(data) % 4)
    if trimmed_len < _MIN_BYTES_UTF32:
        return None
    data = data[:trimmed_len]

    num_units = trimmed_len // 4

    # The BE and LE checks are mirror images of one another: one byte
    # offset that must be null in *every* unit, plus a second offset that
    # is null for BMP characters (the vast majority of real text). A
    # data-driven loop removes the duplicated branch bodies; BE is listed
    # first to preserve the original check order.
    for always_null_offset, bmp_null_offset, encoding in (
        (0, 1, "utf-32-be"),
        (3, 2, "utf-32-le"),
    ):
        always_null = sum(
            1 for i in range(always_null_offset, trimmed_len, 4) if data[i] == 0
        )
        bmp_null = sum(
            1 for i in range(bmp_null_offset, trimmed_len, 4) if data[i] == 0
        )
        # Every unit must carry the mandatory null, and most units should
        # look like BMP characters before we spend time decoding.
        if always_null == num_units and bmp_null / num_units > 0.5:
            try:
                text = data.decode(encoding)
            except UnicodeDecodeError:
                continue
            if _looks_like_text(text):
                return DetectionResult(
                    encoding=encoding,
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )

    return None

118 

119 

def _check_utf16(data: bytes) -> DetectionResult | None:
    """Check for UTF-16 via null-byte patterns in alternating positions.

    Each BMP character occupies two bytes in UTF-16. Characters whose
    code-point high byte is 0x00 (Latin letters, digits, basic punctuation,
    many control structures) put a null in one of the two bytes of every
    code unit, and even non-Latin scripts (Arabic, CJK, Cyrillic, etc.)
    leave nulls in a noticeable fraction of units.

    Single-byte encodings never contain null bytes, so even a small
    null-byte fraction in alternating positions is a strong UTF-16 signal.

    When both byte orders show the pattern (common for Latin-heavy text
    where every other byte is null), the sample is decoded both ways and
    the variant with the higher text-quality score wins.
    """
    usable = min(len(data), _SAMPLE_SIZE)
    usable -= usable % 2
    if usable < _MIN_BYTES_UTF16:  # pragma: no cover - caller checks length
        return None

    units = usable // 2

    # Nulls at even offsets are the BE high bytes of ASCII-range characters;
    # nulls at odd offsets are the LE high bytes.
    even_nulls = sum(data[i] == 0 for i in range(0, usable, 2))
    odd_nulls = sum(data[i] == 0 for i in range(1, usable, 2))

    # LE is appended first so that on an exact quality tie below it wins,
    # matching the original evaluation order.
    candidates: list[tuple[str, float]] = []
    if odd_nulls / units >= _UTF16_MIN_NULL_FRACTION:
        candidates.append(("utf-16-le", odd_nulls / units))
    if even_nulls / units >= _UTF16_MIN_NULL_FRACTION:
        candidates.append(("utf-16-be", even_nulls / units))

    if not candidates:
        return None

    if len(candidates) == 1:
        # Exactly one byte order matched: validate by decoding.
        only_encoding = candidates[0][0]
        try:
            decoded = data[:usable].decode(only_encoding)
        except UnicodeDecodeError:
            return None
        if not _looks_like_text(decoded):
            return None
        return DetectionResult(
            encoding=only_encoding,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    # Both byte orders matched: decode each and keep the higher-quality one.
    winner: str | None = None
    winner_quality = -1.0

    for encoding, _fraction in candidates:
        try:
            decoded = data[:usable].decode(encoding)
        except UnicodeDecodeError:
            continue
        score = _text_quality(decoded)
        if score > winner_quality:
            winner_quality = score
            winner = encoding

    if winner is None or winner_quality < _MIN_TEXT_QUALITY:
        return None
    return DetectionResult(
        encoding=winner,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
    )

198 

199 

def _looks_like_text(text: str) -> bool:
    """Heuristic check that decoded *text* is mostly printable characters."""
    if not text:
        return False
    window = text[:500]
    # Tab/newline/carriage-return are not "printable" per str.isprintable()
    # but are perfectly normal in text files, so count them as good too.
    good = sum(1 for ch in window if ch.isprintable() or ch in "\n\r\t")
    return good / len(window) > _MIN_PRINTABLE_FRACTION

207 

208 

209def _text_quality(text: str, limit: int = 500) -> float: 

210 """Score how much *text* looks like real human-readable content. 

211 

212 Returns a score in the range [-1.0, ~1.6). Higher values indicate 

213 more natural text. The practical maximum is 1.5 for all-ASCII-letter 

214 input (1.6 approaches as sample size grows with all ASCII letters plus 

215 whitespace). A score of -1.0 means the content is almost certainly not 

216 valid text (too many control characters or combining marks). 

217 

218 Scoring factors: 

219 

220 * Base score: ratio of Unicode letters (category ``L*``) to sample length. 

221 * ASCII bonus: additional 0.5x weight for ASCII letters. This is the 

222 primary signal for disambiguating endianness — correct decoding of 

223 Latin-heavy text produces ASCII letters, wrong decoding produces CJK. 

224 * Space bonus: +0.1 when the sample contains at least one whitespace 

225 character and is longer than 20 characters. 

226 * Rejection: returns -1.0 if >10% control characters or >20% combining 

227 marks (category ``M*``). 

228 """ 

229 sample = text[:limit] 

230 n = len(sample) 

231 if n == 0: # pragma: no cover - callers always pass non-empty text 

232 return -1.0 

233 

234 letters = 0 

235 marks = 0 

236 spaces = 0 

237 controls = 0 

238 ascii_letters = 0 

239 

240 for c in sample: 

241 cat = unicodedata.category(c) 

242 if cat[0] == "L": 

243 letters += 1 

244 if ord(c) < 128: 

245 ascii_letters += 1 

246 elif cat[0] == "M": 

247 marks += 1 

248 elif cat == "Zs" or c in "\n\r\t": 

249 spaces += 1 

250 elif cat[0] == "C": 

251 controls += 1 

252 

253 # Reject data with many control characters or combining marks 

254 if controls / n > 0.1: 

255 return -1.0 

256 if marks / n > 0.2: 

257 return -1.0 

258 

259 score = letters / n 

260 # ASCII letters strongly indicate correct endianness 

261 score += (ascii_letters / n) * 0.5 

262 # Real text usually contains some whitespace 

263 if n > 20 and spaces > 0: 

264 score += 0.1 

265 

266 return score