1"""Stage 0: Binary content detection."""
2
3from __future__ import annotations
4
5from chardet._utils import DEFAULT_MAX_BYTES
6
7# Threshold: if more than this fraction of bytes are binary indicators, it's binary
8_BINARY_THRESHOLD = 0.01
9
10# Translation table that maps binary-indicator control bytes (0x00-0x08,
11# 0x0E-0x1F — excludes \t \n \v \f \r) to None (deleting them) and keeps
12# everything else. len(data) - len(translated) gives the count in one
13# C-level pass.
14_BINARY_DELETE = bytes(range(0x09)) + bytes(range(0x0E, 0x20))
15
16
def is_binary(data: bytes, max_bytes: int = DEFAULT_MAX_BYTES) -> bool:
    """Decide whether *data* looks like binary rather than textual content.

    Only the leading ``data[:max_bytes]`` slice is inspected. The bytes
    listed in ``_BINARY_DELETE`` are counted, and the sample is reported as
    binary when their share of the sample is strictly greater than
    ``_BINARY_THRESHOLD``. An empty sample is never considered binary.

    :param data: The raw byte data to examine.
    :param max_bytes: Maximum number of bytes to scan.
    :returns: ``True`` if the data is classified as binary.
    """
    sample = data[:max_bytes]
    total = len(sample)
    if total == 0:
        # Nothing to inspect; this also avoids dividing by zero below.
        return False

    # Stripping the indicator bytes and comparing lengths counts them
    # without a Python-level loop.
    indicator_count = total - len(sample.translate(None, _BINARY_DELETE))
    return indicator_count / total > _BINARY_THRESHOLD