Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/utf8.py: 12%

1"""Stage 1d: UTF-8 structural validation.

3Note: ``from __future__ import annotations`` is intentionally omitted because

4this module is compiled with mypyc, which does not support PEP 563 string

5annotations.

6"""

8from chardet.pipeline import DetectionResult

# Confidence curve parameters for UTF-8 detection.
# Even a small fraction of valid multi-byte sequences is strong evidence,
# so the curve starts high and ramps linearly up to a cap.

# Confidence reported when the multi-byte ratio is (close to) zero.
_BASE_CONFIDENCE = 0.80
# Upper bound on the reported confidence.
_MAX_CONFIDENCE = 0.99

# Scale factor for the multi-byte byte ratio: mb_ratio * 6 saturates the
# confidence ramp at ~17% multi-byte content (1 / 6 of the bytes).
_MB_RATIO_SCALE = 6

def detect_utf8(data: bytes) -> DetectionResult | None:
    """Validate UTF-8 byte structure.

    Returns a result only if multi-byte sequences are found (pure ASCII
    is handled by the ASCII stage).

    A multi-byte sequence cut short by the end of *data* (e.g. from
    max_bytes slicing) is tolerated, but the bytes of it that are present
    must still be well-formed. The partial sequence is not counted as
    evidence for UTF-8.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` for UTF-8, or ``None`` if *data*
        is empty, contains a malformed sequence, or contains no complete
        multi-byte sequence.
    """
    if not data:
        return None

    i = 0
    length = len(data)
    multibyte_sequences = 0
    multibyte_bytes = 0

    while i < length:
        byte = data[i]

        if byte < 0x80:
            i += 1
            continue

        # Determine expected sequence length from leading byte.
        # 0xC0-0xC1 are overlong 2-byte encodings of ASCII, so we start at 0xC2.
        if 0xC2 <= byte <= 0xDF:
            seq_len = 2
        elif 0xE0 <= byte <= 0xEF:
            seq_len = 3
        elif 0xF0 <= byte <= 0xF4:
            seq_len = 4
        else:
            # Invalid start byte (0x80-0xC1, 0xF5-0xFF)
            return None

        # The final sequence may be truncated (e.g. from max_bytes slicing).
        # Only the bytes actually present can be checked, but they must be
        # checked: a lead byte followed by a non-continuation byte is invalid
        # no matter where the buffer ends.
        truncated = i + seq_len > length
        end = length if truncated else i + seq_len

        # Validate continuation bytes (must be 0x80-0xBF)
        for k in range(i + 1, end):
            if not (0x80 <= data[k] <= 0xBF):
                return None

        # Reject overlong encodings, surrogates and out-of-range code points.
        # Each of these is decided by the second byte alone, so the check
        # applies as soon as that byte is available.
        if i + 1 < end:
            second = data[i + 1]
            # 0xE0: second byte must be >= 0xA0 (prevents overlong 3-byte)
            if byte == 0xE0 and second < 0xA0:
                return None
            # 0xED: second byte must be <= 0x9F (prevents UTF-16 surrogates
            # U+D800-U+DFFF)
            if byte == 0xED and second > 0x9F:
                return None
            # 0xF0: second byte must be >= 0x90 (prevents overlong 4-byte)
            if byte == 0xF0 and second < 0x90:
                return None
            # 0xF4: second byte must be <= 0x8F (prevents codepoints above
            # U+10FFFF)
            if byte == 0xF4 and second > 0x8F:
                return None

        if truncated:
            # Well-formed so far but incomplete: stop scanning without
            # counting it as a multi-byte sequence.
            break

        multibyte_sequences += 1
        multibyte_bytes += seq_len
        i += seq_len

    # Pure ASCII — let the ASCII detector handle it
    if multibyte_sequences == 0:
        return None

    # Confidence scales with the proportion of multi-byte bytes in the data.
    # Even a small amount of valid multi-byte UTF-8 is strong evidence.
    mb_ratio = multibyte_bytes / length
    confidence_range = _MAX_CONFIDENCE - _BASE_CONFIDENCE
    confidence = min(
        _MAX_CONFIDENCE,
        _BASE_CONFIDENCE + confidence_range * min(mb_ratio * _MB_RATIO_SCALE, 1.0),
    )
    return DetectionResult(encoding="utf-8", confidence=confidence, language=None)