1"""Stage 1a+: UTF-16/UTF-32 detection for data without BOM.
2
3This stage runs after BOM detection but before binary detection.
4UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
5that would otherwise cause binary detection to reject the data.
6
7Note: ``from __future__ import annotations`` is intentionally omitted because
8this module is compiled with mypyc, which does not support PEP 563 string
9annotations.
10"""
11
12import unicodedata
13
14from chardet.pipeline import ASCII_TEXT_BYTES, DETERMINISTIC_CONFIDENCE, DetectionResult
15
16# How many bytes to sample for pattern analysis
17_SAMPLE_SIZE = 4096
18
19# Minimum bytes needed for reliable pattern detection
20_MIN_BYTES_UTF32 = 16 # 4 full code units
21_MIN_BYTES_UTF16 = 10 # 5 full code units
22
23# Minimum fraction of null bytes in the expected position for UTF-16.
24# CJK-heavy UTF-16 text (Chinese, Japanese, Korean) can have as few as
25# ~4.5% null bytes in the expected position, since CJK codepoints have
26# non-zero high bytes. The validation step (decode + text quality check)
27# prevents false positives from binary files at this lower threshold.
28_UTF16_MIN_NULL_FRACTION = 0.03
29
30# Minimum text-quality score to accept a UTF-16 candidate when both
31# endiannesses show null-byte patterns. A score of 0.5 corresponds to
32# roughly 50% letters with no ASCII bonus (or ~40% with whitespace
33# present) — sufficient to distinguish real text from coincidental byte
34# patterns.
35_MIN_TEXT_QUALITY = 0.5
36
37# Minimum fraction of printable characters for a decoded sample to be
38# considered text rather than binary data.
39_MIN_PRINTABLE_FRACTION = 0.7
40
41# Maximum null fraction (in the candidate null-byte position) below which
42# the data is checked for a null-separator pattern. If the null fraction
43# is below this AND all non-null bytes are printable ASCII, the candidate
44# is rejected as a null-separator false positive rather than real UTF-16.
45# Real Latin UTF-16 has ~50% nulls; CJK UTF-16 has fewer but non-ASCII
46# non-null bytes. 15% is generous — separator data is typically 1-5%.
47_NULL_SEPARATOR_MAX_FRACTION = 0.15
48
49# ASCII_TEXT_BYTES plus the null byte — used by the null-separator guard
50# to check whether non-null bytes are all printable ASCII.
51_NULL_SEPARATOR_ALLOWED: bytes = b"\x00" + ASCII_TEXT_BYTES
52
53
54def _is_null_separator_pattern(data: bytes, null_frac: float) -> bool:
55 """Return True if the data looks like ASCII with null byte separators.
56
57 :param data: The raw byte sample to examine.
58 :param null_frac: The positional null fraction for this UTF-16 candidate
59 (i.e. fraction of null bytes in even positions for BE, or odd positions
60 for LE) — not the total null fraction across all bytes.
61
62 Checks two conditions:
63 1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
64 2. Every non-null byte is printable ASCII or common whitespace
65
66 When both conditions are met, the nulls are likely field separators
67 (e.g. ``find -print0``), not UTF-16 encoding artifacts.
68 """
69 if null_frac >= _NULL_SEPARATOR_MAX_FRACTION:
70 return False
71 return not data.translate(None, _NULL_SEPARATOR_ALLOWED)
72
73
74def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
75 """Detect UTF-32 or UTF-16 encoding from null-byte patterns.
76
77 UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.
78
79 :param data: The raw byte data to examine.
80 :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
81 """
82 sample = data[:_SAMPLE_SIZE]
83
84 if len(sample) < _MIN_BYTES_UTF16:
85 return None
86
87 # Check UTF-32 first (more specific pattern)
88 result = _check_utf32(sample)
89 if result is not None:
90 return result
91
92 # Then check UTF-16
93 return _check_utf16(sample)
94
95
96def _check_utf32(data: bytes) -> DetectionResult | None:
97 """Check for UTF-32 encoding based on 4-byte unit structure.
98
99 For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
100 - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
101 - UTF-32-LE: the last byte of each 4-byte unit is always 0x00
102
103 For BMP characters (U+0000 to U+FFFF), additionally:
104 - UTF-32-BE: the second byte is also 0x00
105 - UTF-32-LE: the third byte is also 0x00
106 """
107 # Trim to a multiple of 4 bytes (like _check_utf16 trims to even length)
108 trimmed_len = len(data) - (len(data) % 4)
109 if trimmed_len < _MIN_BYTES_UTF32:
110 return None
111 data = data[:trimmed_len]
112
113 num_units = trimmed_len // 4
114
115 # UTF-32-BE: first byte of each 4-byte unit must be 0x00
116 be_first_null = sum(1 for i in range(0, len(data), 4) if data[i] == 0)
117 # Second byte is 0x00 for BMP characters (the vast majority of text)
118 be_second_null = sum(1 for i in range(0, len(data), 4) if data[i + 1] == 0)
119
120 if be_first_null == num_units and be_second_null / num_units > 0.5:
121 try:
122 text = data.decode("utf-32-be")
123 if _looks_like_text(text):
124 return DetectionResult(
125 encoding="utf-32-be",
126 confidence=DETERMINISTIC_CONFIDENCE,
127 language=None,
128 )
129 except UnicodeDecodeError:
130 pass
131
132 # UTF-32-LE: last byte of each 4-byte unit must be 0x00
133 le_last_null = sum(1 for i in range(3, len(data), 4) if data[i] == 0)
134 # Third byte is 0x00 for BMP characters
135 le_third_null = sum(1 for i in range(2, len(data), 4) if data[i] == 0)
136
137 if le_last_null == num_units and le_third_null / num_units > 0.5:
138 try:
139 text = data.decode("utf-32-le")
140 if _looks_like_text(text):
141 return DetectionResult(
142 encoding="utf-32-le",
143 confidence=DETERMINISTIC_CONFIDENCE,
144 language=None,
145 )
146 except UnicodeDecodeError:
147 pass
148
149 return None
150
151
152def _check_utf16(data: bytes) -> DetectionResult | None:
153 """Check for UTF-16 via null-byte patterns in alternating positions.
154
155 UTF-16 encodes each BMP character as two bytes. For characters whose
156 code-point high byte is 0x00 (Latin, digits, basic punctuation, many
157 control structures), one of the two bytes in each unit will be a null.
158 Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
159 fraction of code units still contain at least one null byte.
160
161 Non-UTF-16 single-byte encodings never contain null bytes, so even a
162 small null-byte fraction in alternating positions is a strong signal.
163
164 When both endiannesses show null-byte patterns (e.g., Latin text where
165 every other byte is null), we disambiguate by decoding both ways and
166 comparing text-quality scores.
167 """
168 sample_len = min(len(data), _SAMPLE_SIZE)
169 sample_len -= sample_len % 2
170 if sample_len < _MIN_BYTES_UTF16: # pragma: no cover - caller checks length
171 return None
172
173 num_units = sample_len // 2
174
175 # Count null bytes in even positions (UTF-16-BE high byte for ASCII)
176 be_null_count = sum(1 for i in range(0, sample_len, 2) if data[i] == 0)
177 # Count null bytes in odd positions (UTF-16-LE high byte for ASCII)
178 le_null_count = sum(1 for i in range(1, sample_len, 2) if data[i] == 0)
179
180 be_frac = be_null_count / num_units
181 le_frac = le_null_count / num_units
182
183 candidates: list[tuple[str, float]] = []
184 if le_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
185 data[:sample_len], le_frac
186 ):
187 candidates.append(("utf-16-le", le_frac))
188 if be_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
189 data[:sample_len], be_frac
190 ):
191 candidates.append(("utf-16-be", be_frac))
192
193 if not candidates:
194 return None
195
196 # If only one candidate, validate and return
197 if len(candidates) == 1:
198 encoding = candidates[0][0]
199 try:
200 text = data[:sample_len].decode(encoding)
201 if _looks_like_text(text):
202 return DetectionResult(
203 encoding=encoding,
204 confidence=DETERMINISTIC_CONFIDENCE,
205 language=None,
206 )
207 except UnicodeDecodeError:
208 pass
209 return None
210
211 # Both candidates matched (common for Latin-heavy text where every other
212 # byte is null). Decode both and pick the one with higher text quality.
213 best_encoding: str | None = None
214 best_quality = -1.0
215
216 for encoding, _ in candidates:
217 try:
218 text = data[:sample_len].decode(encoding)
219 except UnicodeDecodeError:
220 continue
221 quality = _text_quality(text)
222 if quality > best_quality:
223 best_quality = quality
224 best_encoding = encoding
225
226 if best_encoding is not None and best_quality >= _MIN_TEXT_QUALITY:
227 return DetectionResult(
228 encoding=best_encoding,
229 confidence=DETERMINISTIC_CONFIDENCE,
230 language=None,
231 )
232
233 return None
234
235
236def _looks_like_text(text: str) -> bool:
237 """Quick check: is decoded text mostly printable characters."""
238 if not text:
239 return False
240 sample = text[:500]
241 printable = sum(1 for c in sample if c.isprintable() or c in "\n\r\t")
242 return printable / len(sample) > _MIN_PRINTABLE_FRACTION
243
244
245def _text_quality(text: str, limit: int = 500) -> float:
246 """Score how much *text* looks like real human-readable content.
247
248 Returns a score in the range [-1.0, ~1.6). Higher values indicate
249 more natural text. The practical maximum is 1.5 for all-ASCII-letter
250 input (1.6 approaches as sample size grows with all ASCII letters plus
251 whitespace). A score of -1.0 means the content is almost certainly not
252 valid text (too many control characters or combining marks).
253
254 Scoring factors:
255
256 * Base score: ratio of Unicode letters (category ``L*``) to sample length.
257 * ASCII bonus: additional 0.5x weight for ASCII letters. This is the
258 primary signal for disambiguating endianness — correct decoding of
259 Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
260 * Space bonus: +0.1 when the sample contains at least one whitespace
261 character and is longer than 20 characters.
262 * Rejection: returns -1.0 if >10% control characters or >20% combining
263 marks (category ``M*``).
264 """
265 sample = text[:limit]
266 n = len(sample)
267 if n == 0: # pragma: no cover - callers always pass non-empty text
268 return -1.0
269
270 letters = 0
271 marks = 0
272 spaces = 0
273 controls = 0
274 ascii_letters = 0
275
276 for c in sample:
277 cat = unicodedata.category(c)
278 if cat[0] == "L":
279 letters += 1
280 if ord(c) < 128:
281 ascii_letters += 1
282 elif cat[0] == "M":
283 marks += 1
284 elif cat == "Zs" or c in "\n\r\t":
285 spaces += 1
286 elif cat[0] == "C":
287 controls += 1
288
289 # Reject data with many control characters or combining marks
290 if controls / n > 0.1:
291 return -1.0
292 if marks / n > 0.2:
293 return -1.0
294
295 score = letters / n
296 # ASCII letters strongly indicate correct endianness
297 score += (ascii_letters / n) * 0.5
298 # Real text usually contains some whitespace
299 if n > 20 and spaces > 0:
300 score += 0.1
301
302 return score