1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263)."""
2
3from __future__ import annotations
4
5import re
6
7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult, PipelineContext
8from chardet.pipeline.structural import compute_structural_score
9from chardet.registry import REGISTRY, lookup_encoding
10
11# Markup charset declarations that commonly refer to a Windows superset
12# encoding rather than the strict standard encoding. Japanese web content
13# almost universally declares "Shift_JIS" but actually uses CP932 extensions;
14# similarly, Korean web content declares "EUC-KR" but uses CP949/UHC.
15# When the declared encoding resolves to the base (left), we check whether
16# the superset (right) is a better structural match.
17_MARKUP_SUPERSET_PROMOTIONS: dict[str, str] = {
18 "shift_jis_2004": "cp932",
19 "euc_kr": "cp949",
20}
21
22_SCAN_LIMIT = 4096
23
24_XML_ENCODING_RE = re.compile(
25 rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""", re.IGNORECASE
26)
27_HTML5_CHARSET_RE = re.compile(
28 rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""", re.IGNORECASE
29)
30_HTML4_CONTENT_TYPE_RE = re.compile(
31 rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""", re.IGNORECASE
32)
33
34# PEP 263: encoding declaration in the first two lines of a Python file.
35# https://peps.python.org/pep-0263/
36_PEP263_RE = re.compile(rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.MULTILINE)
37
38
def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Look for a PEP 263 encoding declaration in the first two lines of *data*.

    A declaration such as ``# -*- coding: utf-8 -*-`` is only honoured when it
    sits on line 1 or line 2. The declared name must resolve through
    :func:`lookup_encoding` and *data* must pass :func:`_validate_bytes` under
    the resolved encoding; otherwise nothing is reported.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence
        ``DETERMINISTIC_CONFIDENCE`` and MIME type ``text/x-python``, or
        ``None`` when no usable declaration is found.
    """
    # Cheap rejection: a declaration needs a '#' and it is only looked for
    # within the first 200 bytes.
    if b"#" not in data[:200]:
        return None

    # Cut *data* just before its second newline so that only lines 1 and 2
    # are handed to the (MULTILINE) pattern.
    first_newline = data.find(b"\n")
    second_newline = -1
    if first_newline != -1:
        second_newline = data.find(b"\n", first_newline + 1)
    header = data if second_newline == -1 else data[:second_newline]

    found = _PEP263_RE.search(header)
    if found is None:
        return None

    try:
        declared = found.group(1).decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return None

    codec = lookup_encoding(declared)
    if codec is None or not _validate_bytes(data, codec):
        return None

    return DetectionResult(
        encoding=codec,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
        mime_type="text/x-python",
    )
68
69
def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Extract an explicit charset declaration from the start of *data*.

    The first ``_SCAN_LIMIT`` bytes are searched, in this order, for:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``

    If none of those yields a usable encoding, the first two lines are
    checked for a PEP 263 ``# -*- coding: ... -*-`` declaration.

    A declaration is usable only when its name is pure ASCII, resolves via
    :func:`lookup_encoding`, and *data* passes :func:`_validate_bytes` under
    the resolved encoding.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence
        ``DETERMINISTIC_CONFIDENCE``, or ``None`` when nothing usable is
        declared.
    """
    if not data:
        return None

    window = data[:_SCAN_LIMIT]

    # Each pattern is paired with the MIME type reported when it wins.
    candidates = (
        (_XML_ENCODING_RE, "text/xml"),
        (_HTML5_CHARSET_RE, "text/html"),
        (_HTML4_CONTENT_TYPE_RE, "text/html"),
    )
    for regex, mime in candidates:
        found = regex.search(window)
        if found is None:
            continue
        try:
            declared = found.group(1).decode("ascii").strip()
        except (UnicodeDecodeError, ValueError):
            # Non-ASCII junk in the declared name: try the next pattern.
            continue
        codec = lookup_encoding(declared)
        if codec is None or not _validate_bytes(data, codec):
            continue
        return DetectionResult(
            encoding=codec,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
            mime_type=mime,
        )

    # No markup declaration worked out; fall back to a Python source header.
    return _detect_pep263(data)
106
107
def promote_markup_superset(
    data: bytes,
    markup_result: DetectionResult,
    allowed: frozenset[str],
) -> DetectionResult:
    """Swap a markup-declared encoding for its superset when the data fits it better.

    The promotion is applied only when all of the following hold:

    * the declared encoding has an entry in
      :data:`_MARKUP_SUPERSET_PROMOTIONS`;
    * the superset's name is a member of *allowed*;
    * the whole of *data* decodes strictly under the superset;
    * the superset's structural score is strictly higher than the declared
      encoding's score.

    :param data: The raw byte data being classified.
    :param markup_result: The result obtained from the markup declaration.
    :param allowed: Encoding names the superset must belong to for the
        promotion to be considered.
    :returns: A new :class:`DetectionResult` naming the superset (confidence,
        language and MIME type carried over from *markup_result*), or
        *markup_result* itself when no promotion applies.
    """
    declared = markup_result.encoding
    if declared is None:
        return markup_result

    candidate = _MARKUP_SUPERSET_PROMOTIONS.get(declared)
    if candidate is None:
        return markup_result
    if candidate not in allowed:
        return markup_result
    candidate_info = REGISTRY[candidate]

    # The superset is only an option if it can decode the entire input.
    try:
        data.decode(candidate, errors="strict")
    except (UnicodeDecodeError, LookupError):
        return markup_result

    # Score both encodings against the same data, sharing one context.
    ctx = PipelineContext()
    declared_score = compute_structural_score(data, REGISTRY[declared], ctx)
    candidate_score = compute_structural_score(data, candidate_info, ctx)

    superset_wins = candidate_score > declared_score
    if not superset_wins:
        return markup_result
    return DetectionResult(
        candidate,
        markup_result.confidence,
        markup_result.language,
        markup_result.mime_type,
    )
144
145
def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that the start of *data* can be decoded under *encoding*.

    Only the first ``_SCAN_LIMIT`` bytes are examined, so a large input is
    not decoded in full just to verify a charset declaration found in its
    header.

    When *data* is longer than the scan window, the cut may fall in the
    middle of a multi-byte character. Such a trailing partial sequence is an
    artefact of the truncation, not a decoding error, so in that case the
    window is fed to an incremental decoder with ``final=False``: invalid
    bytes are still rejected, but an incomplete sequence at the very end is
    tolerated.

    :param data: The raw byte data to validate.
    :param encoding: Name of the codec to validate against.
    :returns: ``True`` if the examined bytes decode cleanly, ``False`` if
        they do not or if *encoding* is not a usable text codec.
    """
    import codecs  # local import: keeps this block self-contained

    window = data[:_SCAN_LIMIT]
    try:
        if len(data) <= _SCAN_LIMIT:
            # Nothing was cut off: the window is the complete input and must
            # decode as-is.
            window.decode(encoding)
        else:
            # bytes.decode() rejects unknown and non-text codecs with
            # LookupError; run it on empty input to keep that guarantee
            # before going through the codecs module directly.
            b"".decode(encoding)
            decoder = codecs.getincrementaldecoder(encoding)()
            # final=False buffers an incomplete trailing sequence instead of
            # raising on it.
            decoder.decode(window, final=False)
    except (UnicodeDecodeError, LookupError, ValueError):
        return False
    return True