Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/bom.py: 43%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

14 statements  

1"""Stage 1a: BOM (Byte Order Mark) detection.""" 

2 

3from __future__ import annotations 

4 

5from chardet.pipeline import DetectionResult 

6 

# BOM signatures paired with the codec name each one announces.
# The four-byte UTF-32 marks are listed ahead of the UTF-16 ones because the
# UTF-32-LE mark (FF FE 00 00) begins with the UTF-16-LE mark (FF FE); the
# longer pattern has to be tried first or it could never match.
_BOMS: tuple[tuple[bytes, str], ...] = (
    (b"\x00\x00\xfe\xff", "utf-32-be"),
    (b"\xff\xfe\x00\x00", "utf-32-le"),
    (b"\xef\xbb\xbf", "utf-8-sig"),
    (b"\xfe\xff", "utf-16-be"),
    (b"\xff\xfe", "utf-16-le"),
)

# The UTF-32 signatures, taken from the table above so the two stay in sync.
_UTF32_BOMS: frozenset[bytes] = frozenset(
    signature for signature, codec in _BOMS if codec.startswith("utf-32")
)

18 

19 

def detect_bom(data: bytes) -> DetectionResult | None:
    """Identify the encoding announced by a leading byte order mark.

    Candidates are tried in the order they appear in ``_BOMS``. A UTF-32
    mark is accepted only when the bytes following it form whole 4-byte
    code units; otherwise the scan moves on, so a shorter UTF-16 mark that
    shares the same leading bytes can still match.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` with confidence 1.0 for the first
        acceptable BOM, or ``None`` when no BOM applies.
    """
    for mark, codec in _BOMS:
        if not data.startswith(mark):
            continue
        # FF FE 00 00 (UTF-32-LE) also begins with FF FE (UTF-16-LE), so a
        # UTF-32 match is trusted only when the remainder divides evenly
        # into 4-byte units.
        if mark in _UTF32_BOMS and (len(data) - len(mark)) % 4 != 0:
            continue
        return DetectionResult(encoding=codec, confidence=1.0, language=None)
    return None