Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/bom.py: 43%

1"""Stage 1a: BOM (Byte Order Mark) detection."""

3from __future__ import annotations

5from chardet.pipeline import DetectionResult

# Known byte order marks paired with the codec name reported for each.
#
# The order matters: entries are listed longest-first so that the 4-byte
# UTF-32 marks are tried before the 2-byte UTF-16 marks. The UTF-32-LE mark
# (FF FE 00 00) begins with the UTF-16-LE mark (FF FE), so testing UTF-16
# first would shadow UTF-32-LE.
_BOMS: tuple[tuple[bytes, str], ...] = (
    (b"\x00\x00\xfe\xff", "utf-32-be"),
    (b"\xff\xfe\x00\x00", "utf-32-le"),
    (b"\xef\xbb\xbf", "utf-8-sig"),
    (b"\xfe\xff", "utf-16-be"),
    (b"\xff\xfe", "utf-16-le"),
)

# The subset of marks in ``_BOMS`` that announce UTF-32. Derived from the
# table above rather than repeated by hand so the two can never disagree.
_UTF32_BOMS: frozenset[bytes] = frozenset(
    bom for bom, encoding in _BOMS if encoding.startswith("utf-32")
)

def detect_bom(data: bytes) -> DetectionResult | None:
    """Identify the encoding announced by a leading byte order mark.

    The candidates in ``_BOMS`` are tried in table order and the first
    acceptable match wins.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` with confidence 1.0 and no
        language for the first accepted mark, or ``None`` when no mark
        is accepted.
    """
    for signature, codec in _BOMS:
        if not data.startswith(signature):
            continue

        # A UTF-32 mark is only trusted when the bytes following it form
        # whole 4-byte code units. FF FE 00 00 also begins with the
        # UTF-16-LE mark FF FE, so a rejected UTF-32 candidate is skipped
        # and the shorter UTF-16 entries further down still get a chance.
        trailing = len(data) - len(signature)
        if signature in _UTF32_BOMS and trailing % 4:
            continue

        return DetectionResult(encoding=codec, confidence=1.0, language=None)

    return None