1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263)."""
2
3from __future__ import annotations
4
5import re
6
7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult
8from chardet.registry import lookup_encoding
9
# Upper bound, in bytes, on the leading portion of the input that is searched
# for markup declarations and trial-decoded during validation.
_SCAN_LIMIT = 4096

# ``<?xml ... encoding="NAME"?>`` -- group 1 is the quoted encoding name.
_XML_ENCODING_RE = re.compile(
    rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""",
    flags=re.IGNORECASE,
)

# ``<meta charset=NAME>`` -- the value may be quoted or bare.
_HTML5_CHARSET_RE = re.compile(
    rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""",
    flags=re.IGNORECASE,
)

# ``<meta ... content="...; charset=NAME">`` -- charset embedded in a quoted
# ``content`` attribute value.
_HTML4_CONTENT_TYPE_RE = re.compile(
    rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""",
    flags=re.IGNORECASE,
)

# PEP 263: encoding declaration in the first two lines of a Python file.
# https://peps.python.org/pep-0263/
# MULTILINE lets ``^`` anchor at the start of either line handed to the regex.
_PEP263_RE = re.compile(
    rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)",
    flags=re.MULTILINE,
)
25
26
def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Check the first two lines of *data* for a PEP 263 encoding declaration.

    PEP 263 declarations (e.g. ``# -*- coding: utf-8 -*-``) are only valid
    on line 1 or line 2 of a Python source file, so nothing beyond the
    second line is examined.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with ``DETERMINISTIC_CONFIDENCE``
        and MIME type ``text/x-python`` when a declared encoding is
        recognised by ``lookup_encoding`` and passes ``_validate_bytes``;
        otherwise ``None``.
    """
    # Cheap pre-filter: a declaration needs a '#' comment marker, so skip the
    # regex entirely when none appears near the start of the input.
    if b"#" not in data[:200]:
        return None

    # Find where the second line ends without splitting the whole input:
    # ``data.split(b"\n", 2)`` would copy everything after line 2 as well.
    first_newline = data.find(b"\n")
    if first_newline == -1:
        second_newline = -1
    else:
        second_newline = data.find(b"\n", first_newline + 1)
    # Fewer than two newlines means the whole input is at most two lines.
    first_two_lines = data if second_newline == -1 else data[:second_newline]

    match = _PEP263_RE.search(first_two_lines)
    if match is None:
        return None

    try:
        raw_name = match.group(1).decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return None

    encoding = lookup_encoding(raw_name)
    # Reject unknown names and declarations the actual bytes contradict.
    if encoding is None or not _validate_bytes(data, encoding):
        return None

    return DetectionResult(
        encoding=encoding,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
        mime_type="text/x-python",
    )
56
57
def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Look for an explicit charset declaration near the start of *data*.

    The first ``_SCAN_LIMIT`` bytes are searched, in this order, for:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``

    If none of those yields a usable encoding, the input is handed to the
    PEP 263 check (``# -*- coding: ... -*-``, first two lines only).

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with ``DETERMINISTIC_CONFIDENCE``,
        or ``None`` when no valid declaration is found.
    """
    if not data:
        return None

    prefix = data[:_SCAN_LIMIT]

    # Each pattern is paired with the MIME type reported when it wins.
    candidates = (
        (_XML_ENCODING_RE, "text/xml"),
        (_HTML5_CHARSET_RE, "text/html"),
        (_HTML4_CONTENT_TYPE_RE, "text/html"),
    )

    for regex, mime_type in candidates:
        found = regex.search(prefix)
        if found is None:
            continue

        try:
            declared = found.group(1).decode("ascii").strip()
        except (UnicodeDecodeError, ValueError):
            continue

        encoding = lookup_encoding(declared)
        # An unknown name, or one the bytes do not decode under, is ignored
        # and the next kind of declaration gets its turn.
        if encoding is None or not _validate_bytes(data, encoding):
            continue

        return DetectionResult(
            encoding=encoding,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
            mime_type=mime_type,
        )

    return _detect_pep263(data)
94
95
def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that *data* can be decoded under *encoding* without errors.

    Only the first ``_SCAN_LIMIT`` bytes are decoded, so a potentially large
    input is not decoded in full just to verify a charset declaration found
    in the header.

    Cutting the input at a fixed byte offset can split a multi-byte
    character in half.  Such a split is not treated as a decoding error:
    when the window is shorter than the input, an incomplete sequence at the
    very end of the window is tolerated.

    :param data: The raw byte data to validate.
    :param encoding: Name of the codec to decode with.
    :returns: ``True`` if the examined bytes decode cleanly, ``False`` if
        they do not or if *encoding* is not a usable text codec.
    """
    window = data[:_SCAN_LIMIT]
    try:
        window.decode(encoding)
    except LookupError:
        # Unknown codec, or not a text encoding.
        return False
    except UnicodeDecodeError:
        if len(window) == len(data):
            # Nothing was cut off, so the failure is genuine.
            return False
        # The window was truncated: retry with an incremental decoder, which
        # buffers an incomplete trailing sequence (final=False) but still
        # raises on bytes that are invalid regardless of what follows.
        import codecs

        try:
            decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
            decoder.decode(window, final=False)
        except (UnicodeDecodeError, LookupError):
            return False
    return True