1"""Stage 1a+: UTF-16/UTF-32 detection for data without BOM.
2
3This stage runs after BOM detection but before binary detection.
4UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
5that would otherwise cause binary detection to reject the data.
6
7Note: ``from __future__ import annotations`` is intentionally omitted because
8this module is compiled with mypyc, which does not support PEP 563 string
9annotations.
10"""
11
12import unicodedata
13
14from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult
15
# How many bytes to sample for pattern analysis
_SAMPLE_SIZE: int = 4096

# Minimum bytes needed for reliable pattern detection
_MIN_BYTES_UTF32: int = 16  # 4 full code units
_MIN_BYTES_UTF16: int = 10  # 5 full code units

# Minimum fraction of null bytes in the expected position for UTF-16.
# CJK-heavy UTF-16 text (Chinese, Japanese, Korean) can have as few as
# ~4.5% null bytes in the expected position, since CJK codepoints have
# non-zero high bytes. The validation step (decode + text quality check)
# prevents false positives from binary files at this lower threshold.
_UTF16_MIN_NULL_FRACTION: float = 0.03

# Minimum text-quality score to accept a UTF-16 candidate when both
# endiannesses show null-byte patterns. A score of 0.5 corresponds to
# roughly 50% letters with no ASCII bonus (or ~40% with whitespace
# present) — sufficient to distinguish real text from coincidental byte
# patterns.
_MIN_TEXT_QUALITY: float = 0.5

# Minimum fraction of printable characters for a decoded sample to be
# considered text rather than binary data.
_MIN_PRINTABLE_FRACTION: float = 0.7
40
41
def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
    """Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    """
    head = data[:_SAMPLE_SIZE]

    # Too little data for any reliable pattern analysis.
    if len(head) < _MIN_BYTES_UTF16:
        return None

    # UTF-32's 4-byte structure is stricter, so give it the first shot;
    # fall back to the UTF-16 check only when it finds nothing.
    utf32_result = _check_utf32(head)
    if utf32_result is not None:
        return utf32_result

    return _check_utf16(head)
62
63
def _check_utf32(data: bytes) -> DetectionResult | None:
    """Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
    """
    # Work on a whole number of 4-byte code units (mirrors the even-length
    # trim performed by _check_utf16).
    usable = len(data) - len(data) % 4
    if usable < _MIN_BYTES_UTF32:
        return None
    data = data[:usable]
    num_units = usable // 4

    # Each layout: (codec name, offset of the always-null byte within a
    # unit, offset of the byte that is null for BMP characters). BE is
    # tried first, matching the historical check order.
    layouts = (
        ("utf-32-be", 0, 1),
        ("utf-32-le", 3, 2),
    )
    for encoding, required_off, bmp_off in layouts:
        # The structurally-required byte must be null in *every* unit.
        required_nulls = sum(data[i] == 0 for i in range(required_off, usable, 4))
        if required_nulls != num_units:
            continue
        # The BMP-position byte must be null in a majority of units
        # (the vast majority of real text is BMP characters).
        bmp_nulls = sum(data[i] == 0 for i in range(bmp_off, usable, 4))
        if bmp_nulls / num_units <= 0.5:
            continue
        try:
            decoded = data.decode(encoding)
        except UnicodeDecodeError:
            continue
        if _looks_like_text(decoded):
            return DetectionResult(
                encoding=encoding,
                confidence=DETERMINISTIC_CONFIDENCE,
                language=None,
            )

    return None
118
119
def _check_utf16(data: bytes) -> DetectionResult | None:
    """Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes. For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    """
    usable = min(len(data), _SAMPLE_SIZE)
    usable -= usable % 2
    if usable < _MIN_BYTES_UTF16:  # pragma: no cover - caller checks length
        return None

    num_units = usable // 2

    # Nulls at even offsets are the high byte of a BE unit (for ASCII);
    # nulls at odd offsets are the high byte of an LE unit.
    even_nulls = sum(data[i] == 0 for i in range(0, usable, 2))
    odd_nulls = sum(data[i] == 0 for i in range(1, usable, 2))

    # Collect endiannesses whose null fraction clears the threshold.
    # LE is listed first so it wins quality ties, matching historical order.
    candidates: list[tuple[str, float]] = []
    for encoding, fraction in (
        ("utf-16-le", odd_nulls / num_units),
        ("utf-16-be", even_nulls / num_units),
    ):
        if fraction >= _UTF16_MIN_NULL_FRACTION:
            candidates.append((encoding, fraction))

    if not candidates:
        return None

    if len(candidates) == 1:
        # Unambiguous: validate by decoding and checking printability.
        encoding = candidates[0][0]
        try:
            decoded = data[:usable].decode(encoding)
        except UnicodeDecodeError:
            return None
        if not _looks_like_text(decoded):
            return None
        return DetectionResult(
            encoding=encoding,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    # Both endiannesses matched (common for Latin-heavy text where every
    # other byte is null). Decode each and keep the one that reads most
    # like real text.
    best_encoding: str | None = None
    best_quality = -1.0
    for encoding, _ in candidates:
        try:
            decoded = data[:usable].decode(encoding)
        except UnicodeDecodeError:
            continue
        quality = _text_quality(decoded)
        if quality > best_quality:
            best_encoding, best_quality = encoding, quality

    if best_encoding is None or best_quality < _MIN_TEXT_QUALITY:
        return None
    return DetectionResult(
        encoding=best_encoding,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
    )
198
199
def _looks_like_text(text: str) -> bool:
    """Quick check: is decoded text mostly printable characters.

    Examines at most the first 500 characters; tab, CR and LF count as
    printable even though ``str.isprintable`` rejects them.
    """
    if not text:
        return False
    head = text[:500]
    printable_count = 0
    for ch in head:
        if ch.isprintable() or ch in "\n\r\t":
            printable_count += 1
    return printable_count / len(head) > _MIN_PRINTABLE_FRACTION
207
208
209def _text_quality(text: str, limit: int = 500) -> float:
210 """Score how much *text* looks like real human-readable content.
211
212 Returns a score in the range [-1.0, ~1.6). Higher values indicate
213 more natural text. The practical maximum is 1.5 for all-ASCII-letter
214 input (1.6 approaches as sample size grows with all ASCII letters plus
215 whitespace). A score of -1.0 means the content is almost certainly not
216 valid text (too many control characters or combining marks).
217
218 Scoring factors:
219
220 * Base score: ratio of Unicode letters (category ``L*``) to sample length.
221 * ASCII bonus: additional 0.5x weight for ASCII letters. This is the
222 primary signal for disambiguating endianness — correct decoding of
223 Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
224 * Space bonus: +0.1 when the sample contains at least one whitespace
225 character and is longer than 20 characters.
226 * Rejection: returns -1.0 if >10% control characters or >20% combining
227 marks (category ``M*``).
228 """
229 sample = text[:limit]
230 n = len(sample)
231 if n == 0: # pragma: no cover - callers always pass non-empty text
232 return -1.0
233
234 letters = 0
235 marks = 0
236 spaces = 0
237 controls = 0
238 ascii_letters = 0
239
240 for c in sample:
241 cat = unicodedata.category(c)
242 if cat[0] == "L":
243 letters += 1
244 if ord(c) < 128:
245 ascii_letters += 1
246 elif cat[0] == "M":
247 marks += 1
248 elif cat == "Zs" or c in "\n\r\t":
249 spaces += 1
250 elif cat[0] == "C":
251 controls += 1
252
253 # Reject data with many control characters or combining marks
254 if controls / n > 0.1:
255 return -1.0
256 if marks / n > 0.2:
257 return -1.0
258
259 score = letters / n
260 # ASCII letters strongly indicate correct endianness
261 score += (ascii_letters / n) * 0.5
262 # Real text usually contains some whitespace
263 if n > 20 and spaces > 0:
264 score += 0.1
265
266 return score