1"""Stage 1d: UTF-8 structural validation.
2
3Note: ``from __future__ import annotations`` is intentionally omitted because
4this module is compiled with mypyc, which does not support PEP 563 string
5annotations.
6"""
7
8from chardet.pipeline import DetectionResult
9
# Confidence curve parameters for UTF-8 detection.
# Even a small fraction of valid multi-byte sequences is strong evidence, so
# the reported confidence starts at _BASE_CONFIDENCE and ramps linearly up to
# _MAX_CONFIDENCE as the share of multi-byte bytes in the input grows.
_BASE_CONFIDENCE = 0.80
_MAX_CONFIDENCE = 0.99
# Scale factor for the multi-byte byte ratio: mb_ratio * 6 reaches 1.0 (and
# therefore saturates the confidence ramp) at 1/6, i.e. ~17% multi-byte
# content.
_MB_RATIO_SCALE = 6
17
18
def detect_utf8(data: bytes) -> DetectionResult | None:
    """Validate UTF-8 byte structure.

    Walks ``data`` once and checks every non-ASCII byte against the
    well-formed UTF-8 byte sequences: valid lead bytes (0xC2-0xF4),
    continuation bytes in 0x80-0xBF, no overlong encodings, no UTF-16
    surrogates (U+D800-U+DFFF) and no code points above U+10FFFF.

    A sequence cut off by the end of ``data`` (e.g. from ``max_bytes``
    slicing) is tolerated, but only when the bytes that *are* present form a
    valid prefix of a well-formed sequence. The truncated tail is not counted
    as a multi-byte sequence.

    Returns a result only if multi-byte sequences are found (pure ASCII
    is handled by the ASCII stage).

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` for UTF-8 whose confidence lies
        between ``_BASE_CONFIDENCE`` and ``_MAX_CONFIDENCE``, or ``None`` if
        the data is empty, structurally invalid, or contains no complete
        multi-byte sequence.
    """
    if not data:
        return None

    i = 0
    length = len(data)
    multibyte_sequences = 0
    multibyte_bytes = 0

    while i < length:
        byte = data[i]

        if byte < 0x80:
            i += 1
            continue

        # Allowed range for the byte right after the lead byte. It defaults to
        # the generic continuation range and is narrowed for the four lead
        # bytes that would otherwise admit ill-formed sequences.
        second_min = 0x80
        second_max = 0xBF

        # Determine expected sequence length from leading byte.
        # 0xC0-0xC1 are overlong 2-byte encodings of ASCII, so we start at 0xC2.
        if 0xC2 <= byte <= 0xDF:
            seq_len = 2
        elif 0xE0 <= byte <= 0xEF:
            seq_len = 3
            if byte == 0xE0:
                # Second byte must be >= 0xA0 (prevents overlong 3-byte).
                second_min = 0xA0
            elif byte == 0xED:
                # Second byte must be <= 0x9F (prevents UTF-16 surrogates
                # U+D800-U+DFFF).
                second_max = 0x9F
        elif 0xF0 <= byte <= 0xF4:
            seq_len = 4
            if byte == 0xF0:
                # Second byte must be >= 0x90 (prevents overlong 4-byte).
                second_min = 0x90
            elif byte == 0xF4:
                # Second byte must be <= 0x8F (prevents code points above
                # U+10FFFF).
                second_max = 0x8F
        else:
            # Invalid start byte (0x80-0xC1, 0xF5-0xFF)
            return None

        # The final sequence may be cut short by the end of the data; only the
        # bytes that are actually present can be inspected.
        truncated = i + seq_len > length
        end = length if truncated else i + seq_len

        # Validate the second byte against its (possibly narrowed) range and
        # any remaining bytes against the generic continuation range. This
        # runs for truncated tails as well, so a tail that can never become
        # valid UTF-8 (e.g. a lead byte followed by ASCII) is rejected.
        if i + 1 < end and not (second_min <= data[i + 1] <= second_max):
            return None
        for j in range(i + 2, end):
            if not (0x80 <= data[j] <= 0xBF):
                return None

        # Truncated final sequence (e.g. from max_bytes slicing) — treat as
        # valid since the bytes seen so far are structurally correct, but do
        # not count it as a complete multi-byte sequence.
        if truncated:
            break

        multibyte_sequences += 1
        multibyte_bytes += seq_len
        i += seq_len

    # Pure ASCII — let the ASCII detector handle it
    if multibyte_sequences == 0:
        return None

    # Confidence scales with the proportion of multi-byte bytes in the data.
    # Even a small amount of valid multi-byte UTF-8 is strong evidence.
    mb_ratio = multibyte_bytes / length
    confidence_range = _MAX_CONFIDENCE - _BASE_CONFIDENCE
    confidence = min(
        _MAX_CONFIDENCE,
        _BASE_CONFIDENCE + confidence_range * min(mb_ratio * _MB_RATIO_SCALE, 1.0),
    )
    return DetectionResult(encoding="utf-8", confidence=confidence, language=None)