Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/markup.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

45 statements  

1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263).""" 

2 

3from __future__ import annotations 

4 

5import re 

6 

7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult 

8from chardet.registry import lookup_encoding 

9 

10_SCAN_LIMIT = 4096 

11 

12_XML_ENCODING_RE = re.compile( 

13 rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""", re.IGNORECASE 

14) 

15_HTML5_CHARSET_RE = re.compile( 

16 rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""", re.IGNORECASE 

17) 

18_HTML4_CONTENT_TYPE_RE = re.compile( 

19 rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""", re.IGNORECASE 

20) 

21 

22# PEP 263: encoding declaration in the first two lines of a Python file. 

23# https://peps.python.org/pep-0263/ 

24_PEP263_RE = re.compile(rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.MULTILINE) 

25 

26 

def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Look for a PEP 263 encoding declaration at the top of *data*.

    A declaration such as ``# -*- coding: utf-8 -*-`` only counts when it
    sits on line 1 or line 2 of a Python source file, so nothing past the
    second line is examined.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` carrying ``DETERMINISTIC_CONFIDENCE``
        when a recognised declaration is found and the data validates
        against it, otherwise ``None``.
    """
    # Cheap pre-filter: without a '#' near the start there is no comment
    # that could hold a declaration.
    if data.find(b"#", 0, 200) == -1:
        return None

    # Keep lines one and two; whatever follows the second newline is dropped.
    leading_lines = data.split(b"\n", 2)[:2]
    header = b"\n".join(leading_lines)

    found = _PEP263_RE.search(header)
    if found is None:
        return None

    try:
        declared = found.group(1).decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return None

    codec = lookup_encoding(declared)
    if codec is None or not _validate_bytes(data, codec):
        return None

    return DetectionResult(
        encoding=codec,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
    )

55 

56 

def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Find an explicit charset declaration near the start of *data*.

    The following forms are tried, in this order:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``
    4. PEP 263 ``# -*- coding: ... -*-`` (first two lines only)

    A declaration is only accepted when its label resolves through
    ``lookup_encoding`` and the data passes ``_validate_bytes`` for the
    resolved encoding; otherwise the next form is tried.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` carrying ``DETERMINISTIC_CONFIDENCE``,
        or ``None`` when no usable declaration is found.
    """
    if not data:
        return None

    # Markup patterns are matched against the leading window only.
    prefix = data[:_SCAN_LIMIT]
    markup_patterns = (_XML_ENCODING_RE, _HTML5_CHARSET_RE, _HTML4_CONTENT_TYPE_RE)

    for regex in markup_patterns:
        hit = regex.search(prefix)
        if hit is None:
            continue

        try:
            label = hit.group(1).decode("ascii").strip()
        except (UnicodeDecodeError, ValueError):
            # Non-ASCII label: ignore this declaration and try the next form.
            continue

        codec = lookup_encoding(label)
        if codec is None:
            continue
        if _validate_bytes(data, codec):
            return DetectionResult(
                encoding=codec,
                confidence=DETERMINISTIC_CONFIDENCE,
                language=None,
            )

    # No markup declaration was usable; fall back to a Python coding comment.
    return _detect_pep263(data)

91 

92 

def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that *data* can be decoded under *encoding* without errors.

    Only the first ``_SCAN_LIMIT`` bytes are validated, to avoid decoding a
    full 200 kB input just to verify a charset declaration found in the
    header.

    When *data* is longer than that window, the cut can fall in the middle
    of a multi-byte character. A plain ``bytes.decode`` would then report
    "unexpected end of data" for perfectly valid input, so a truncated
    window is fed to an incremental decoder with ``final=False``: an
    incomplete trailing sequence is buffered instead of raising, while
    genuinely invalid bytes inside the window still fail.

    :param data: The raw byte data to validate.
    :param encoding: Name of the codec to decode with.
    :returns: ``True`` if the inspected prefix decodes cleanly, ``False`` if
        decoding fails or the codec cannot be found.
    """
    # Local import keeps this block self-contained; ``codecs`` is stdlib.
    import codecs

    window = data[:_SCAN_LIMIT]
    try:
        if len(data) <= _SCAN_LIMIT:
            # The whole input is in view, so a dangling partial character
            # really is an error: decode strictly.
            window.decode(encoding)
        else:
            # Input continues past the window: tolerate a character that is
            # split by the cut, but still reject invalid sequences.
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.decode(window, final=False)
    except (UnicodeDecodeError, LookupError):
        return False
    return True