Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/bom.py: 43%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Stage 1a: BOM (Byte Order Mark) detection."""
3from __future__ import annotations
5from chardet.pipeline import DetectionResult
# BOM byte signatures paired with the codec name reported for each.
# Ordered longest-first so UTF-32 is checked before UTF-16
# (the UTF-32-LE BOM starts with the same bytes as the UTF-16-LE BOM).
_BOMS: tuple[tuple[bytes, str], ...] = (
    (b"\x00\x00\xfe\xff", "utf-32-be"),
    (b"\xff\xfe\x00\x00", "utf-32-le"),
    (b"\xef\xbb\xbf", "utf-8-sig"),
    (b"\xfe\xff", "utf-16-be"),
    (b"\xff\xfe", "utf-16-le"),
)

# The two 4-byte UTF-32 signatures from the table above, kept as a set so
# they can be recognised with a simple membership test.
_UTF32_BOMS: frozenset[bytes] = frozenset({b"\x00\x00\xfe\xff", b"\xff\xfe\x00\x00"})
def detect_bom(data: bytes) -> DetectionResult | None:
    """Check for a byte order mark at the start of *data*.

    Signatures are tried in the order they appear in ``_BOMS``; the first
    one that is accepted determines the result.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` with confidence 1.0 and no
        language for the first accepted BOM, or ``None`` when no BOM is
        accepted.
    """
    for bom_bytes, encoding in _BOMS:
        if not data.startswith(bom_bytes):
            continue

        # UTF-32 BOMs overlap with UTF-16 BOMs (e.g. FF FE 00 00 starts
        # with the UTF-16-LE BOM FF FE). Only accept a UTF-32 BOM when the
        # payload after it is a whole number of UTF-32 code units (a
        # multiple of 4 bytes); otherwise keep scanning so that a shorter
        # UTF-16 BOM later in the table can match instead.
        if bom_bytes in _UTF32_BOMS and (len(data) - len(bom_bytes)) % 4 != 0:
            continue

        return DetectionResult(encoding=encoding, confidence=1.0, language=None)

    return None