Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/markup.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

65 statements  

1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263).""" 

2 

3from __future__ import annotations 

4 

5import re 

6 

7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult, PipelineContext 

8from chardet.pipeline.structural import compute_structural_score 

9from chardet.registry import REGISTRY, lookup_encoding 

10 

# Base encoding -> Windows superset encoding.
#
# Charset labels found in markup frequently name the strict standard
# encoding while the bytes actually use a vendor superset: Japanese pages
# nearly always say "Shift_JIS" yet contain CP932 extensions, and Korean
# pages say "EUC-KR" yet contain CP949/UHC sequences. Whenever a declared
# label resolves to one of the keys below, the mapped value is tested as a
# potentially better structural match.
_MARKUP_SUPERSET_PROMOTIONS: dict[str, str] = {
    "shift_jis_2004": "cp932",
    "euc_kr": "cp949",
}

# Number of leading bytes searched for a declaration (and decoded when a
# declared encoding is validated).
_SCAN_LIMIT = 4096

# <?xml ... encoding="NAME"?>
_XML_ENCODING_RE = re.compile(
    rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""",
    re.IGNORECASE,
)

# <meta charset="NAME"> (quotes optional)
_HTML5_CHARSET_RE = re.compile(
    rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""",
    re.IGNORECASE,
)

# <meta http-equiv="Content-Type" content="...; charset=NAME">
_HTML4_CONTENT_TYPE_RE = re.compile(
    rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""",
    re.IGNORECASE,
)

# PEP 263 source-encoding comment, e.g. ``# -*- coding: utf-8 -*-``.
# The caller restricts the search to the first two lines of the input.
# https://peps.python.org/pep-0263/
_PEP263_RE = re.compile(
    rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)",
    re.MULTILINE,
)

37 

38 

def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Look for a PEP 263 encoding comment in the first two lines of *data*.

    A declaration such as ``# -*- coding: utf-8 -*-`` only counts when it
    sits on line 1 or line 2 of a Python source file, so nothing beyond the
    second line is inspected.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` carrying
        ``DETERMINISTIC_CONFIDENCE`` and MIME type ``text/x-python`` when a
        declared encoding is recognised and the data validates under it,
        otherwise ``None``.
    """
    # Cheap pre-filter: a declaration needs a '#' near the start of the input.
    if b"#" not in data[:200]:
        return None

    # Keep at most the first two lines; the rest of the input is irrelevant.
    leading_lines = data.split(b"\n", 2)[:2]
    header = b"\n".join(leading_lines)

    found = _PEP263_RE.search(header)
    if found is None:
        return None

    try:
        declared = found.group(1).decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return None

    resolved = lookup_encoding(declared)
    if resolved is None or not _validate_bytes(data, resolved):
        return None

    return DetectionResult(
        encoding=resolved,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
        mime_type="text/x-python",
    )

68 

69 

def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Find an explicit charset declaration near the start of *data*.

    Declarations are tried in this order, and the first one that names a
    known encoding under which the data validates wins:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``
    4. PEP 263 ``# -*- coding: ... -*-`` (first two lines only)

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` carrying
        ``DETERMINISTIC_CONFIDENCE`` and the MIME type implied by the kind of
        declaration found, or ``None`` when no usable declaration exists.
    """
    if not data:
        return None

    # Markup declarations are only searched for within the leading window.
    window = data[:_SCAN_LIMIT]

    # Each markup pattern together with the MIME type a hit implies.
    candidates = (
        (_XML_ENCODING_RE, "text/xml"),
        (_HTML5_CHARSET_RE, "text/html"),
        (_HTML4_CONTENT_TYPE_RE, "text/html"),
    )

    for regex, mime in candidates:
        hit = regex.search(window)
        if hit is None:
            continue
        try:
            label = hit.group(1).decode("ascii").strip()
        except (UnicodeDecodeError, ValueError):
            # A non-ASCII label cannot be an encoding name; try the next pattern.
            continue
        resolved = lookup_encoding(label)
        if resolved is None or not _validate_bytes(data, resolved):
            continue
        return DetectionResult(
            encoding=resolved,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
            mime_type=mime,
        )

    # No markup declaration was usable; fall back to a PEP 263 comment.
    return _detect_pep263(data)

106 

107 

def promote_markup_superset(
    data: bytes,
    markup_result: DetectionResult,
    allowed: frozenset[str],
) -> DetectionResult:
    """Swap a markup-declared encoding for its superset when the data favours it.

    The promotion happens only when all of the following hold:

    * the declared encoding has an entry in
      :data:`_MARKUP_SUPERSET_PROMOTIONS` and that superset is in *allowed*;
    * *data* decodes strictly under the superset;
    * the superset's structural score is strictly greater than the declared
      encoding's score.

    :param data: The raw byte data being classified.
    :param markup_result: The result produced from the markup declaration.
    :param allowed: Encoding names that may be returned.
    :returns: A new :class:`DetectionResult` naming the superset (confidence,
        language and MIME type copied from *markup_result*), or
        *markup_result* itself when no promotion applies.
    """
    declared = markup_result.encoding
    if declared is None:
        return markup_result

    promoted = _MARKUP_SUPERSET_PROMOTIONS.get(declared)
    if promoted is None or promoted not in allowed:
        return markup_result

    promoted_info = REGISTRY[promoted]

    # The superset is only a candidate if it decodes the whole input cleanly.
    try:
        data.decode(promoted, errors="strict")
    except (UnicodeDecodeError, LookupError):
        return markup_result

    # Score both encodings against the same pipeline context.
    context = PipelineContext()
    declared_score = compute_structural_score(data, REGISTRY[declared], context)
    promoted_score = compute_structural_score(data, promoted_info, context)

    if promoted_score > declared_score:
        return DetectionResult(
            promoted,
            markup_result.confidence,
            markup_result.language,
            markup_result.mime_type,
        )
    return markup_result

144 

145 

def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that the start of *data* can be decoded under *encoding*.

    Only the first ``_SCAN_LIMIT`` bytes are examined, to avoid decoding a
    full 200 kB input just to verify a charset declaration found in the
    header.

    Cutting the input at a fixed byte offset can split a multibyte character
    in half. A plain strict decode would then reject perfectly valid data
    with an "unexpected end of data" error, so when the input really was
    truncated the prefix is re-checked with an incremental decoder in
    non-final mode, which accepts an incomplete trailing sequence but still
    rejects genuinely invalid bytes.

    :param data: The raw byte data to validate.
    :param encoding: Name of the codec to validate against.
    :returns: ``True`` when the examined prefix is valid under *encoding*,
        ``False`` when it is not or when *encoding* is not a usable text codec.
    """
    # Function-scope import keeps this fix self-contained in the block.
    import codecs

    prefix = data[:_SCAN_LIMIT]
    try:
        prefix.decode(encoding)
    except UnicodeDecodeError:
        # Must precede ValueError: UnicodeDecodeError is a ValueError subclass.
        if len(data) <= _SCAN_LIMIT:
            # Nothing was cut off, so the failure is a real decoding error.
            return False
    except (LookupError, ValueError):
        # Unknown codec, non-text codec, or malformed codec name.
        return False
    else:
        return True

    # The prefix was truncated and failed to decode: tolerate only a
    # multibyte sequence left incomplete by the cut (final=False buffers it).
    try:
        codecs.getincrementaldecoder(encoding)(errors="strict").decode(
            prefix, final=False
        )
    except (UnicodeDecodeError, LookupError, ValueError):
        return False
    return True