Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/utf8.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

48 statements  

"""Stage 1d: UTF-8 structural validation.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

7 

8from chardet.pipeline import DetectionResult 

9 

# Tuning knobs for the UTF-8 confidence curve.
# Valid multi-byte sequences are strong evidence even when they make up only
# a small share of the input, so the curve starts high and tops out just
# below certainty.
_BASE_CONFIDENCE = 0.8
_MAX_CONFIDENCE = 0.99
# Multiplier applied to the multi-byte byte ratio: with a factor of 6 the
# confidence ramp reaches its ceiling once roughly 17% (1/6) of the bytes
# belong to multi-byte sequences.
_MB_RATIO_SCALE = 6

17 

18 

def detect_utf8(data: bytes) -> DetectionResult | None:
    """Validate UTF-8 byte structure.

    Returns a result only if multi-byte sequences are found (pure ASCII
    is handled by the ASCII stage).

    A multi-byte sequence cut off by the end of *data* (e.g. from
    ``max_bytes`` slicing) is tolerated, but only when the bytes that are
    present form a valid prefix of a UTF-8 sequence.  The incomplete
    sequence is not counted as evidence.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` for UTF-8 with a confidence between
        ``_BASE_CONFIDENCE`` and ``_MAX_CONFIDENCE``, or ``None`` if the data
        is empty, contains no complete multi-byte sequence, or is not
        structurally valid UTF-8.
    """
    if not data:
        return None

    i = 0
    length = len(data)
    multibyte_sequences = 0
    multibyte_bytes = 0

    while i < length:
        byte = data[i]

        if byte < 0x80:
            i += 1
            continue

        # Determine expected sequence length from leading byte.
        # 0xC0-0xC1 are overlong 2-byte encodings of ASCII, so we start at 0xC2.
        if 0xC2 <= byte <= 0xDF:
            seq_len = 2
        elif 0xE0 <= byte <= 0xEF:
            seq_len = 3
        elif 0xF0 <= byte <= 0xF4:
            seq_len = 4
        else:
            # Invalid start byte (0x80-0xC1, 0xF5-0xFF)
            return None

        # Allowed range for the first continuation byte.  It is normally
        # 0x80-0xBF, narrowed for the lead bytes that could otherwise encode
        # overlong forms, surrogates or out-of-range code points.
        second_min = 0x80
        second_max = 0xBF
        if byte == 0xE0:
            # Prevents overlong 3-byte encodings.
            second_min = 0xA0
        elif byte == 0xED:
            # Prevents UTF-16 surrogates U+D800-U+DFFF.
            second_max = 0x9F
        elif byte == 0xF0:
            # Prevents overlong 4-byte encodings.
            second_min = 0x90
        elif byte == 0xF4:
            # Prevents code points above U+10FFFF.
            second_max = 0x8F

        # A final sequence may be truncated (e.g. from max_bytes slicing).
        # Clamp the range to the bytes actually present so that they are
        # still validated before the truncation is forgiven.
        end = i + seq_len
        truncated = end > length
        if truncated:
            end = length

        if i + 1 < end and not (second_min <= data[i + 1] <= second_max):
            return None
        # Remaining continuation bytes must be 0x80-0xBF.
        for j in range(i + 2, end):
            if not (0x80 <= data[j] <= 0xBF):
                return None

        if truncated:
            # The partial sequence is a valid prefix; stop scanning without
            # counting it as a multi-byte sequence.
            break

        multibyte_sequences += 1
        multibyte_bytes += seq_len
        i += seq_len

    # Pure ASCII — let the ASCII detector handle it
    if multibyte_sequences == 0:
        return None

    # Confidence scales with the proportion of multi-byte bytes in the data.
    # Even a small amount of valid multi-byte UTF-8 is strong evidence.
    mb_ratio = multibyte_bytes / length
    confidence_range = _MAX_CONFIDENCE - _BASE_CONFIDENCE
    confidence = min(
        _MAX_CONFIDENCE,
        _BASE_CONFIDENCE + confidence_range * min(mb_ratio * _MB_RATIO_SCALE, 1.0),
    )
    return DetectionResult(encoding="utf-8", confidence=confidence, language=None)