Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/markup.py: 28%

1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263)."""

3from __future__ import annotations

5import re

7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult

8from chardet.registry import lookup_encoding

# Upper bound, in bytes, on how much of the input is examined: the markup
# patterns are searched within this prefix and it is also the portion that
# gets trial-decoded when a declared charset is validated.
_SCAN_LIMIT = 4096

# ``<?xml ... encoding="...">`` -- group 1 is the quoted encoding name.
_XML_ENCODING_RE = re.compile(
    rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""",
    flags=re.IGNORECASE,
)

# ``<meta charset=...>`` -- the value may be quoted or bare; group 1 is the name.
_HTML5_CHARSET_RE = re.compile(
    rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""",
    flags=re.IGNORECASE,
)

# ``<meta ... content="...charset=...">`` -- group 1 is the charset token found
# inside the quoted ``content`` attribute.
_HTML4_CONTENT_TYPE_RE = re.compile(
    rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""",
    flags=re.IGNORECASE,
)

# PEP 263: encoding declaration in the first two lines of a Python file.
# https://peps.python.org/pep-0263/
# MULTILINE lets ``^`` anchor at the start of either line of a two-line input.
_PEP263_RE = re.compile(
    rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)",
    flags=re.MULTILINE,
)

def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Look for a PEP 263 encoding declaration at the top of *data*.

    Declarations such as ``# -*- coding: utf-8 -*-`` only count when they
    appear on line 1 or line 2 of a Python source file, so nothing past the
    second line is searched.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence
        ``DETERMINISTIC_CONFIDENCE`` and MIME type ``text/x-python`` when a
        declaration names a known encoding that *data* validates against,
        otherwise ``None``.
    """
    # Cheap rejection: a declaration is a comment, so a '#' must occur early.
    if b"#" not in data[:200]:
        return None

    # ``maxsplit=2`` yields at most three pieces; keep only the first two lines.
    leading_lines = data.split(b"\n", 2)[:2]
    declaration = _PEP263_RE.search(b"\n".join(leading_lines))
    if declaration is None:
        return None

    try:
        declared_name = declaration.group(1).decode("ascii")
    except ValueError:  # UnicodeDecodeError is a ValueError subclass
        return None

    codec_name = lookup_encoding(declared_name.strip())
    if codec_name is None or not _validate_bytes(data, codec_name):
        return None

    return DetectionResult(
        encoding=codec_name,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
        mime_type="text/x-python",
    )

def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Find an explicit charset declaration near the start of *data*.

    The following declarations are tried, in this order:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``
    4. PEP 263 ``# -*- coding: ... -*-`` (first two lines only)

    A declaration is accepted only if its name resolves through
    ``lookup_encoding`` and *data* passes ``_validate_bytes`` for that
    encoding; otherwise the next candidate is tried.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence
        ``DETERMINISTIC_CONFIDENCE``, or ``None`` if nothing usable is found.
    """
    if not data:
        return None

    # Markup patterns are only searched within the leading _SCAN_LIMIT bytes.
    prefix = data[:_SCAN_LIMIT]

    # Each pattern is paired with the MIME type reported when it wins.
    candidates = (
        (_XML_ENCODING_RE, "text/xml"),
        (_HTML5_CHARSET_RE, "text/html"),
        (_HTML4_CONTENT_TYPE_RE, "text/html"),
    )
    for regex, mime in candidates:
        found = regex.search(prefix)
        if found is None:
            continue

        try:
            declared_name = found.group(1).decode("ascii")
        except ValueError:  # UnicodeDecodeError is a ValueError subclass
            continue

        codec_name = lookup_encoding(declared_name.strip())
        if codec_name is None or not _validate_bytes(data, codec_name):
            continue

        return DetectionResult(
            encoding=codec_name,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
            mime_type=mime,
        )

    # No markup declaration was usable; fall back to a PEP 263 comment.
    return _detect_pep263(data)

def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that the scanned prefix of *data* decodes under *encoding*.

    Only the first ``_SCAN_LIMIT`` bytes are validated, to avoid decoding a
    large input just to verify a charset declaration found in its header.

    When that cut-off actually truncates *data* it can land in the middle of
    a multi-byte character.  Such an incomplete trailing sequence is not
    treated as an error (previously it made valid UTF-8/CJK input longer
    than the limit fail validation); bytes that are invalid in *encoding*
    are still rejected.

    :param data: The raw byte data whose prefix is validated.
    :param encoding: Name of the codec to decode with.
    :returns: ``True`` if the prefix is decodable, ``False`` if it is not
        or if *encoding* is not a usable text codec.
    """
    # Local import so this block does not depend on the module's import list.
    import codecs

    prefix = data[:_SCAN_LIMIT]
    try:
        prefix.decode(encoding)
    except LookupError:
        # Unknown codec, or not a text encoding.
        return False
    except UnicodeDecodeError:
        if len(prefix) == len(data):
            # Nothing was cut off, so the failure is genuine.
            return False
    else:
        return True

    # The strict decode failed on a truncated prefix.  Retry incrementally
    # with ``final=False``: the decoder then buffers an incomplete trailing
    # sequence instead of raising, but still raises on invalid bytes.
    try:
        codecs.getincrementaldecoder(encoding)().decode(prefix, final=False)
    except (UnicodeDecodeError, LookupError):
        return False
    return True