Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/markup.py: 28%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

46 statements  

1"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263).""" 

2 

3from __future__ import annotations 

4 

5import re 

6 

7from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult 

8from chardet.registry import lookup_encoding 

9 

10_SCAN_LIMIT = 4096 

11 

12_XML_ENCODING_RE = re.compile( 

13 rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""", re.IGNORECASE 

14) 

15_HTML5_CHARSET_RE = re.compile( 

16 rb"""<meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)""", re.IGNORECASE 

17) 

18_HTML4_CONTENT_TYPE_RE = re.compile( 

19 rb"""<meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""", re.IGNORECASE 

20) 

21 

22# PEP 263: encoding declaration in the first two lines of a Python file. 

23# https://peps.python.org/pep-0263/ 

24_PEP263_RE = re.compile(rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.MULTILINE) 

25 

26 

def _detect_pep263(data: bytes) -> DetectionResult | None:
    """Look for a PEP 263 coding comment in the first two lines of *data*.

    A declaration such as ``# -*- coding: utf-8 -*-`` only counts when it
    sits on line 1 or line 2 of a Python source file, so nothing after the
    second newline is examined.

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence
        ``DETERMINISTIC_CONFIDENCE`` and MIME type ``text/x-python`` when the
        declared name resolves through ``lookup_encoding`` and the data
        passes ``_validate_bytes``; otherwise ``None``.
    """
    # Cheap rejection: a declaration needs a '#' and it is only looked for
    # within the first 200 bytes.
    if b"#" not in data[:200]:
        return None

    # Keep at most two lines; the third split piece (the rest) is dropped.
    leading_lines = data.split(b"\n", 2)[:2]
    found = _PEP263_RE.search(b"\n".join(leading_lines))
    if found is None:
        return None

    try:
        declared = found.group(1).decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return None

    codec = lookup_encoding(declared)
    if codec is None or not _validate_bytes(data, codec):
        return None

    return DetectionResult(
        encoding=codec,
        confidence=DETERMINISTIC_CONFIDENCE,
        language=None,
        mime_type="text/x-python",
    )

56 

57 

def detect_markup_charset(data: bytes) -> DetectionResult | None:
    """Search the start of *data* for an explicit charset declaration.

    Declarations are tried in this order, and the first one whose name
    resolves and whose encoding validates against the data wins:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``
    4. PEP 263 ``# -*- coding: ... -*-`` (first two lines only)

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence
        ``DETERMINISTIC_CONFIDENCE`` and a MIME type of ``text/xml``,
        ``text/html`` or ``text/x-python``, or ``None`` when no usable
        declaration is found.
    """
    if not data:
        return None

    # Markup declarations are only searched for within the leading window.
    window = data[:_SCAN_LIMIT]

    # Each pattern is paired with the MIME type reported when it matches.
    candidates = (
        (_XML_ENCODING_RE, "text/xml"),
        (_HTML5_CHARSET_RE, "text/html"),
        (_HTML4_CONTENT_TYPE_RE, "text/html"),
    )
    for regex, mime in candidates:
        hit = regex.search(window)
        if hit is None:
            continue
        try:
            label = hit.group(1).decode("ascii").strip()
        except (UnicodeDecodeError, ValueError):
            continue
        codec = lookup_encoding(label)
        # An unknown or non-validating declaration falls through to the
        # next pattern instead of ending the search.
        if codec is None or not _validate_bytes(data, codec):
            continue
        return DetectionResult(
            encoding=codec,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
            mime_type=mime,
        )

    # No markup declaration was usable; try a Python coding comment.
    return _detect_pep263(data)

94 

95 

def _validate_bytes(data: bytes, encoding: str) -> bool:
    """Check that *data* can be decoded under *encoding* without errors.

    Only the first ``_SCAN_LIMIT`` bytes are validated, to avoid decoding a
    large input in full just to verify a charset declaration found in the
    header.

    Cutting the input at a fixed byte offset can split a multi-byte
    character in two. When *data* is longer than the window, an incomplete
    sequence at the very end of the window is therefore tolerated; invalid
    bytes anywhere else in the window are still rejected.

    :param data: The raw byte data to validate.
    :param encoding: Codec name to decode with.
    :returns: ``True`` if the inspected window decodes cleanly, ``False`` if
        it does not or if *encoding* is not a usable text codec.
    """
    import codecs  # local import: this block cannot edit the file's imports

    window = data[:_SCAN_LIMIT]
    try:
        window.decode(encoding)
    except LookupError:
        return False
    except UnicodeDecodeError:
        # A complete input has no excuse for a decode error.
        if len(data) <= _SCAN_LIMIT:
            return False
    else:
        return True

    # The window was truncated, so the failure may only be a character cut
    # in half at the boundary. With final=False an incremental decoder
    # buffers an incomplete trailing sequence but still raises on bytes
    # that are actually invalid.
    try:
        codecs.getincrementaldecoder(encoding)().decode(window, final=False)
    except (UnicodeDecodeError, LookupError):
        return False
    return True