Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/markup.py: 25%

1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263)."""

3from __future__ import annotations

5import re

7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult, PipelineContext

8from chardet.pipeline.structural import compute_structural_score

9from chardet.registry import REGISTRY, lookup_encoding

# Base encoding -> Windows superset encoding.
#
# A charset declared in markup frequently names the strict standard encoding
# while the content really relies on a vendor superset: Japanese pages that
# declare "Shift_JIS" typically use CP932 extensions, and Korean pages that
# declare "EUC-KR" typically use CP949/UHC.  Whenever a declaration resolves
# to one of the keys below, the mapped superset is tested to see whether it is
# the better structural match.
# NOTE(review): the Japanese key is "shift_jis_2004" rather than "shift_jis";
# presumably that is the name declarations resolve to -- confirm against the
# registry.
_MARKUP_SUPERSET_PROMOTIONS: dict[str, str] = {
    "shift_jis_2004": "cp932",
    "euc_kr": "cp949",
}

# Number of leading bytes that are searched for (and validated against) a
# charset declaration.
_SCAN_LIMIT: int = 4096

# ``<?xml ... encoding="NAME"?>`` -- group 1 is the quoted encoding name.
_XML_ENCODING_RE: re.Pattern[bytes] = re.compile(
    rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""",
    flags=re.IGNORECASE,
)

# ``<meta charset="NAME">`` (quotes optional) -- group 1 is the charset name.
_HTML5_CHARSET_RE: re.Pattern[bytes] = re.compile(
    rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""",
    flags=re.IGNORECASE,
)

# ``<meta ... content="...; charset=NAME">`` -- group 1 is the charset name.
_HTML4_CONTENT_TYPE_RE: re.Pattern[bytes] = re.compile(
    rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""",
    flags=re.IGNORECASE,
)

# PEP 263: encoding declaration in the first two lines of a Python file.
# https://peps.python.org/pep-0263/
# MULTILINE lets ``^`` anchor at the start of either line of the searched text.
_PEP263_RE: re.Pattern[bytes] = re.compile(
    rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)",
    flags=re.MULTILINE,
)

def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Look for a PEP 263 encoding declaration at the top of *data*.

    A declaration such as ``# -*- coding: utf-8 -*-`` is only honoured when it
    appears on line 1 or line 2, so nothing beyond the second line is
    searched.  The declared name must resolve through ``lookup_encoding`` and
    the data must pass ``_validate_bytes`` under the resolved encoding.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` carrying ``DETERMINISTIC_CONFIDENCE``
        and MIME type ``text/x-python``, or ``None`` when no usable
        declaration is found.
    """
    # Cheap pre-filter: a declaration lives in a '#' comment, so give up
    # early when the first 200 bytes contain no '#' at all.
    if data[:200].find(b"#") == -1:
        return None

    # Re-join only the first two lines; the rest of the input is irrelevant.
    leading_lines = data.split(b"\n", 2)[:2]
    declaration = _PEP263_RE.search(b"\n".join(leading_lines))
    if declaration is None:
        return None

    try:
        declared_name = declaration.group(1).decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return None

    resolved = lookup_encoding(declared_name)
    if resolved is None or not _validate_bytes(data, resolved):
        return None

    return DetectionResult(
        encoding=resolved,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
        mime_type="text/x-python",
    )

def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Find an explicit charset declaration near the start of *data*.

    The following declaration forms are tried, in this order; the first one
    whose name resolves and whose encoding validates against the data wins:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``
    4. PEP 263 ``# -*- coding: ... -*-`` (first two lines only)

    Only the first ``_SCAN_LIMIT`` bytes are searched for the markup forms.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` carrying ``DETERMINISTIC_CONFIDENCE``
        (MIME type ``text/xml`` for the XML form, ``text/html`` for the
        ``<meta>`` forms), or ``None`` when nothing usable is declared.
    """
    if not data:
        return None

    window = data[:_SCAN_LIMIT]
    # Each declaration pattern is paired with the MIME type it implies.
    declaration_forms = (
        (_XML_ENCODING_RE, "text/xml"),
        (_HTML5_CHARSET_RE, "text/html"),
        (_HTML4_CONTENT_TYPE_RE, "text/html"),
    )

    for regex, implied_mime in declaration_forms:
        found = regex.search(window)
        if found is None:
            continue

        try:
            declared_name = found.group(1).decode("ascii").strip()
        except (UnicodeDecodeError, ValueError):
            # Non-ASCII charset name: ignore this form, try the next one.
            continue

        resolved = lookup_encoding(declared_name)
        if resolved is None or not _validate_bytes(data, resolved):
            continue

        return DetectionResult(
            encoding=resolved,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
            mime_type=implied_mime,
        )

    # No markup declaration was usable; fall back to a PEP 263 comment.
    return _detect_pep263(data)

106

107

def promote_markup_superset(
    data: bytes,
    markup_result: DetectionResult,
    allowed: frozenset[str],
) -> DetectionResult:
    """Swap a markup-declared encoding for its superset when the data favours it.

    The promotion happens only when all of the following hold:

    * the declared encoding has an entry in
      :data:`_MARKUP_SUPERSET_PROMOTIONS` and that superset is in *allowed*;
    * the whole of *data* decodes strictly under the superset;
    * the superset's structural score is strictly higher than the declared
      encoding's score.

    :param data: The raw byte data the declaration was found in.
    :param markup_result: The result produced from the markup declaration.
    :param allowed: Encoding names that may be returned.
    :returns: A new :class:`DetectionResult` naming the superset (confidence,
        language and MIME type copied from *markup_result*), or
        *markup_result* itself when no promotion applies.
    """
    declared = markup_result.encoding
    if declared is None:
        return markup_result

    promoted = _MARKUP_SUPERSET_PROMOTIONS.get(declared)
    if not (promoted is not None and promoted in allowed):
        return markup_result

    promoted_info = REGISTRY[promoted]

    # The superset is only a candidate if it can decode the data at all.
    try:
        data.decode(promoted, errors="strict")
    except (UnicodeDecodeError, LookupError):
        return markup_result

    # Score both encodings against the same pipeline context, declared
    # encoding first.
    context = PipelineContext()
    declared_score = compute_structural_score(data, REGISTRY[declared], context)
    promoted_score = compute_structural_score(data, promoted_info, context)
    if promoted_score <= declared_score:
        return markup_result

    return DetectionResult(
        promoted,
        markup_result.confidence,
        markup_result.language,
        markup_result.mime_type,
    )

144

145

def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that the head of *data* can be decoded under *encoding*.

    Only the first ``_SCAN_LIMIT`` bytes are validated, to avoid decoding a
    large input just to verify a charset declaration found in its header.

    When the input is longer than the window, the cut may fall in the middle
    of a multi-byte character.  A plain strict decode would then reject
    perfectly valid data, so in that case the window is re-checked with an
    incremental decoder that tolerates an incomplete *trailing* sequence.
    Invalid bytes anywhere else in the window are still rejected, and input
    that fits entirely in the window is validated strictly as a whole.

    :param data: The raw byte data to validate.
    :param encoding: Name of the codec to decode with.
    :returns: ``True`` if the window decodes cleanly, ``False`` if it does
        not or if the codec is unknown or unusable.
    """
    import codecs  # local import keeps this fix self-contained

    window = data[:_SCAN_LIMIT]
    was_truncated = len(data) > len(window)
    try:
        window.decode(encoding)
    except UnicodeDecodeError:
        if not was_truncated:
            return False
        # The failure may only be a character split by the window boundary.
        # With final=False the decoder buffers an incomplete trailing
        # sequence instead of raising, while real errors still raise.
        try:
            codecs.getincrementaldecoder(encoding)().decode(window, final=False)
        except (UnicodeDecodeError, LookupError, ValueError):
            return False
    except (LookupError, ValueError):
        return False
    return True