1"""Stage 1a+: UTF-16/UTF-32 detection for data without BOM.
2
3This stage runs after BOM detection but before binary detection.
4UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
5that would otherwise cause binary detection to reject the data.
6
7Note: ``from __future__ import annotations`` is intentionally omitted because
8this module is compiled with mypyc, which does not support PEP 563 string
9annotations.
10"""
11
12import unicodedata
13
14from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult
15
# How many bytes to sample for pattern analysis
_SAMPLE_SIZE: int = 4096

# Minimum bytes needed for reliable pattern detection
_MIN_BYTES_UTF32: int = 16  # 4 full code units
_MIN_BYTES_UTF16: int = 10  # 5 full code units

# Minimum fraction of null bytes in the expected position for UTF-16.
# CJK-heavy UTF-16 text (Chinese, Japanese, Korean) can have as few as
# ~4.5% null bytes in the expected position, since CJK codepoints have
# non-zero high bytes. The validation step (decode + text quality check)
# prevents false positives from binary files at this lower threshold.
_UTF16_MIN_NULL_FRACTION: float = 0.03

# Minimum text-quality score to accept a UTF-16 candidate when both
# endiannesses show null-byte patterns. A score of 0.5 corresponds to
# roughly 50% letters with no ASCII bonus (or ~40% with whitespace
# present) — sufficient to distinguish real text from coincidental byte
# patterns.
_MIN_TEXT_QUALITY: float = 0.5

# Minimum fraction of printable characters for a decoded sample to be
# considered text rather than binary data.
_MIN_PRINTABLE_FRACTION: float = 0.7
40
41
def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
    """Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    """
    head = data[:_SAMPLE_SIZE]

    # Too little data for any reliable pattern analysis.
    if len(head) < _MIN_BYTES_UTF16:
        return None

    # UTF-32's 4-byte structure is stricter, so give it the first shot;
    # fall back to the UTF-16 check only when it finds nothing.
    utf32_result = _check_utf32(head)
    if utf32_result is not None:
        return utf32_result

    return _check_utf16(head)
62
63
def _check_utf32(data: bytes) -> DetectionResult | None:
    """Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
    """
    # Work on a whole number of 4-byte code units (mirrors the even-length
    # trim performed by _check_utf16).
    usable = len(data) - len(data) % 4
    if usable < _MIN_BYTES_UTF32:
        return None
    data = data[:usable]
    num_units = usable // 4

    # Each layout: (codec name, offset of the always-null byte within a
    # unit, offset of the byte that is null for BMP characters). BE is
    # tried first, matching the historical check order.
    layouts = (
        ("utf-32-be", 0, 1),
        ("utf-32-le", 3, 2),
    )
    for encoding, required_off, bmp_off in layouts:
        # The structurally-required byte must be null in *every* unit.
        required_nulls = sum(data[i] == 0 for i in range(required_off, usable, 4))
        if required_nulls != num_units:
            continue
        # The BMP-position byte must be null in a majority of units
        # (the vast majority of real text is BMP characters).
        bmp_nulls = sum(data[i] == 0 for i in range(bmp_off, usable, 4))
        if bmp_nulls / num_units <= 0.5:
            continue
        try:
            decoded = data.decode(encoding)
        except UnicodeDecodeError:
            continue
        if _looks_like_text(decoded):
            return DetectionResult(
                encoding=encoding,
                confidence=DETERMINISTIC_CONFIDENCE,
                language=None,
            )

    return None
118
119
def _check_utf16(data: bytes) -> DetectionResult | None:
    """Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes. For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    """
    usable = min(len(data), _SAMPLE_SIZE)
    usable -= usable % 2
    if usable < _MIN_BYTES_UTF16:  # pragma: no cover - caller checks length
        return None

    num_units = usable // 2

    # Nulls at even offsets are the high byte of a BE unit (for ASCII);
    # nulls at odd offsets are the high byte of an LE unit.
    even_nulls = sum(data[i] == 0 for i in range(0, usable, 2))
    odd_nulls = sum(data[i] == 0 for i in range(1, usable, 2))

    # Collect endiannesses whose null fraction clears the threshold.
    # LE is listed first so it wins quality ties, matching historical order.
    candidates: list[tuple[str, float]] = []
    for encoding, fraction in (
        ("utf-16-le", odd_nulls / num_units),
        ("utf-16-be", even_nulls / num_units),
    ):
        if fraction >= _UTF16_MIN_NULL_FRACTION:
            candidates.append((encoding, fraction))

    if not candidates:
        return None

    if len(candidates) == 1:
        # Unambiguous: validate by decoding and checking printability.
        encoding = candidates[0][0]
        try:
            decoded = data[:usable].decode(encoding)
        except UnicodeDecodeError:
            return None
        if not _looks_like_text(decoded):
            return None
        return DetectionResult(
            encoding=encoding,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    # Both endiannesses matched (common for Latin-heavy text where every
    # other byte is null). Decode each and keep the one that reads most
    # like real text.
    best_encoding: str | None = None
    best_quality = -1.0
    for encoding, _ in candidates:
        try:
            decoded = data[:usable].decode(encoding)
        except UnicodeDecodeError:
            continue
        quality = _text_quality(decoded)
        if quality > best_quality:
            best_encoding, best_quality = encoding, quality

    if best_encoding is None or best_quality < _MIN_TEXT_QUALITY:
        return None
    return DetectionResult(
        encoding=best_encoding,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
    )
198
199
def _looks_like_text(text: str) -> bool:
    """Quick check: is decoded text mostly printable characters.

    Examines at most the first 500 characters; tab, CR and LF count as
    printable even though ``str.isprintable`` rejects them.
    """
    if not text:
        return False
    head = text[:500]
    printable_count = 0
    for ch in head:
        if ch.isprintable() or ch in "\n\r\t":
            printable_count += 1
    return printable_count / len(head) > _MIN_PRINTABLE_FRACTION
207
208
209def _text_quality(text: str, limit: int = 500) -> float:
210 """Score how much *text* looks like real human-readable content.
211
212 Returns a score in the range [-1.0, ~1.6). Higher values indicate
213 more natural text. The practical maximum is 1.5 for all-ASCII-letter
214 input (1.6 approaches as sample size grows with all ASCII letters plus
215 whitespace). A score of -1.0 means the content is almost certainly not
216 valid text (too many control characters or combining marks).
217
218 Scoring factors:
219
220 * Base score: ratio of Unicode letters (category ``L*``) to sample length.
221 * ASCII bonus: additional 0.5x weight for ASCII letters. This is the
222 primary signal for disambiguating endianness — correct decoding of
223 Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
224 * Space bonus: +0.1 when the sample contains at least one whitespace
225 character and is longer than 20 characters.
226 * Rejection: returns -1.0 if >10% control characters or >20% combining
227 marks (category ``M*``).
228 """
229 sample = text[:limit]
230 n = len(sample)
231 if n == 0: # pragma: no cover - callers always pass non-empty text
232 return -1.0
233
234 letters = 0
235 marks = 0
236 spaces = 0
237 controls = 0
238 ascii_letters = 0
239
240 for c in sample:
241 cat = unicodedata.category(c)
242 if cat[0] == "L":
243 letters += 1
244 if ord(c) < 128:
245 ascii_letters += 1
246 elif cat[0] == "M":
247 marks += 1
248 elif cat == "Zs" or c in "\n\r\t":
249 spaces += 1
250 elif cat[0] == "C":
251 controls += 1
252
253 # Reject data with many control characters or combining marks
254 if controls / n > 0.1:
255 return -1.0
256 if marks / n > 0.2:
257 return -1.0
258
259 score = letters / n
260 # ASCII letters strongly indicate correct endianness
261 score += (ascii_letters / n) * 0.5
262 # Real text usually contains some whitespace
263 if n > 20 and spaces > 0:
264 score += 0.1
265
266 return score