1"""Magic number detection for binary file types.
2
3Note: ``from __future__ import annotations`` is intentionally omitted because
4this module is compiled with mypyc, which does not support PEP 563 string
5annotations.
6"""
7
8from chardet.pipeline import DetectionResult
9
10# (prefix_bytes, mime_type) — longest prefix first to avoid shorter prefixes
11# shadowing longer ones. All entries match at offset 0.
12# Formats with sub-type logic (ftyp, RIFF, FORM, ZIP) are handled separately.
13_MAGIC_NUMBERS: tuple[tuple[bytes, str], ...] = (
14 # Images
15 (b"\x89PNG\r\n\x1a\n", "image/png"),
16 (b"GIF87a", "image/gif"),
17 (b"GIF89a", "image/gif"),
18 (b"MM\x00\x2a", "image/tiff"),
19 (b"II\x2a\x00", "image/tiff"),
20 (b"8BPS", "image/vnd.adobe.photoshop"),
21 (b"qoif", "image/qoi"),
22 (b"BM", "image/bmp"),
23 (b"\xff\xd8\xff", "image/jpeg"),
24 # JPEG XL: 12-byte container signature (must precede the 2-byte codestream)
25 (
26 b"\x00\x00\x00\x0c\x4a\x58\x4c\x20\x0d\x0a\x87\x0a",
27 "image/jxl",
28 ),
29 # JPEG XL: 2-byte codestream signature
30 (b"\xff\x0a", "image/jxl"),
31 (
32 b"\x00\x00\x01\x00",
33 "image/vnd.microsoft.icon",
34 ), # ICO (not TTF — TTF is \x00\x01\x00\x00)
35 # Audio/Video
36 (b"ID3", "audio/mpeg"),
37 (b"MThd", "audio/midi"),
38 (b"OggS", "audio/ogg"),
39 (b"fLaC", "audio/flac"),
40 (b"\x1a\x45\xdf\xa3", "video/webm"),
41 # Archives (ZIP handled separately below for subtype detection)
42 (b"\x1f\x8b", "application/gzip"),
43 (b"BZh", "application/x-bzip2"),
44 (b"\xfd7zXZ\x00", "application/x-xz"),
45 (b"7z\xbc\xaf\x27\x1c", "application/x-7z-compressed"),
46 (b"Rar!\x1a\x07\x01\x00", "application/vnd.rar"),
47 (b"Rar!\x1a\x07\x00", "application/vnd.rar"),
48 (b"\x28\xb5\x2f\xfd", "application/zstd"),
49 # Documents / Data
50 (b"%PDF-", "application/pdf"),
51 (b"SQLite format 3\x00", "application/x-sqlite3"),
52 (b"ARROW1", "application/vnd.apache.arrow.file"),
53 (b"PAR1", "application/vnd.apache.parquet"),
54 (b"\x00asm", "application/wasm"),
55 # Executables / Bytecode (cafebabe handled separately — shared by Java
56 # class files and Mach-O fat binaries, disambiguated by bytes 4-7)
57 (b"dex\n", "application/vnd.android.dex"),
58 (b"\x7fELF", "application/x-elf"),
59 (b"\xfe\xed\xfa\xce", "application/x-mach-binary"),
60 (b"\xfe\xed\xfa\xcf", "application/x-mach-binary"),
61 (b"\xce\xfa\xed\xfe", "application/x-mach-binary"),
62 (b"\xcf\xfa\xed\xfe", "application/x-mach-binary"),
63 (b"MZ", "application/vnd.microsoft.portable-executable"),
64 # Fonts
65 (b"wOFF", "font/woff"),
66 (b"wOF2", "font/woff2"),
67 (b"OTTO", "font/otf"),
68 (b"\x00\x01\x00\x00", "font/ttf"),
69)
70
71# TAR archives have "ustar" at offset 257
72_TAR_OFFSET = 257
73_TAR_SIGNATURES: tuple[bytes, ...] = (b"ustar\x00", b"ustar ")
74
75# RIFF container subtypes — determined by bytes 8-11
76_RIFF_SUBTYPES: dict[bytes, str] = {
77 b"WEBP": "image/webp",
78 b"WAVE": "audio/wav",
79 b"AVI ": "video/x-msvideo",
80}
81
82# FORM container subtypes (same layout as RIFF: 4-byte tag, 4-byte size, 4-byte type)
83_FORM_SUBTYPES: dict[bytes, str] = {
84 b"AIFF": "audio/aiff",
85 b"AIFC": "audio/aiff",
86}
87
88# ZIP-based format detection — scan the first 4 KB for local file headers
89# and classify based on entry filenames or content. Many ZIP generators
90# set the data-descriptor flag on every entry, making sequential header
91# walking impossible without decompression. Instead we search for
92# PK\x03\x04 signatures and inspect the filename/content fields.
93_ZIP_SIGNATURE = b"PK\x03\x04"
94_ZIP_SCAN_LIMIT = 4096
95
96# Filename prefix → MIME type (checked against each entry's filename)
97_ZIP_FILENAME_PREFIXES: tuple[tuple[bytes, str], ...] = (
98 # Office Open XML
99 (b"xl/", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
100 (
101 b"word/",
102 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
103 ),
104 (
105 b"ppt/",
106 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
107 ),
108 # Java
109 (b"META-INF/MANIFEST.MF", "application/java-archive"),
110 # Android
111 (b"AndroidManifest.xml", "application/vnd.android.package-archive"),
112 # EPUB
113 (b"META-INF/container.xml", "application/epub+zip"),
114)
115
116# Filename suffix → MIME type (checked against each entry's filename)
117_ZIP_FILENAME_SUFFIXES: tuple[tuple[bytes, str], ...] = (
118 # Python wheels: entries like "package-1.0.dist-info/WHEEL"
119 (b".dist-info/", "application/x-wheel+zip"),
120)
121
122# OpenDocument MIME types recognized in the "mimetype" entry content.
123_OPENDOCUMENT_MIMES: frozenset[bytes] = frozenset(
124 {
125 b"application/vnd.oasis.opendocument.text",
126 b"application/vnd.oasis.opendocument.spreadsheet",
127 b"application/vnd.oasis.opendocument.presentation",
128 b"application/vnd.oasis.opendocument.graphics",
129 }
130)
131
132# MP4/MOV/HEIC ftyp box — "ftyp" at offset 4
133_FTYP_MARKER = b"ftyp"
134_FTYP_OFFSET = 4
135# Brand → MIME type for image ftyp brands
136_FTYP_AVIF_BRANDS: frozenset[bytes] = frozenset({b"avif", b"avis"})
137_FTYP_HEIC_BRANDS: frozenset[bytes] = frozenset({b"heic", b"heix"})
138_FTYP_HEIF_BRANDS: frozenset[bytes] = frozenset({b"mif1", b"msf1"})
139_FTYP_AUDIO_BRANDS: frozenset[bytes] = frozenset({b"M4A ", b"M4B ", b"F4A "})
140_FTYP_QUICKTIME_BRANDS: frozenset[bytes] = frozenset({b"qt "})
141
142# Java class file vs Mach-O fat binary — both start with \xca\xfe\xba\xbe.
143# Bytes 4-7 disambiguate: Mach-O fat stores nfat_arch (big-endian uint32,
144# typically 2-5), while Java class stores minor_version (uint16) +
145# major_version (uint16, 45+ for Java 1.1 through modern Java).
146_CAFEBABE = b"\xca\xfe\xba\xbe"
147_CAFEBABE_MAX_FAT_ARCHES = 20 # no real fat binary exceeds this
148
149
def _classify_zip(data: bytes) -> str:
    """Return the MIME type of a ZIP-based file, or plain ZIP if unknown.

    Only the first ``_ZIP_SCAN_LIMIT`` bytes are examined. Every local
    file header signature found there is treated as an entry: its
    filename is tested against the known prefix and fragment tables, and
    a stored (uncompressed) ``mimetype`` entry is read to recognise
    OpenDocument formats.

    **Limitation:** an entry with the data-descriptor flag (bit 3) set
    has a compressed size of 0 in its header, so its content cannot be
    skipped. The search may then hit a spurious PK local file header
    signature inside compressed data. Deflate output rarely forms a
    valid header with a recognisable filename, so false positives are
    unlikely in practice.

    :param data: Bytes of a file that starts with the ZIP signature.
    :returns: The specific MIME type when an entry identifies one,
        otherwise ``"application/zip"``.
    """
    window = data[:_ZIP_SCAN_LIMIT]
    window_len = len(window)
    cursor = 0
    while True:
        header = window.find(_ZIP_SIGNATURE, cursor)
        # The fixed part of a local file header is 30 bytes; the filename
        # follows immediately after it.
        name_start = header + 30
        if header == -1 or name_start > window_len:
            break

        name_end = name_start + int.from_bytes(
            window[header + 26 : header + 28], "little"
        )
        if name_end > window_len:
            # Filename is cut off by the scan window; nothing more to learn.
            break
        # Entry content begins after the filename and the extra field.
        payload_start = name_end + int.from_bytes(
            window[header + 28 : header + 30], "little"
        )
        name = window[name_start:name_end]

        for prefix, mime in _ZIP_FILENAME_PREFIXES:
            if name.startswith(prefix):
                return mime
        # Fragments are matched anywhere in the name, not only at its end.
        for fragment, mime in _ZIP_FILENAME_SUFFIXES:
            if fragment in name:
                return mime

        # OpenDocument: a stored "mimetype" entry holds the MIME type itself.
        method = int.from_bytes(window[header + 8 : header + 10], "little")
        if name == b"mimetype" and method == 0:
            payload_end = payload_start + int.from_bytes(
                window[header + 22 : header + 26], "little"
            )
            if payload_end <= window_len:
                payload = window[payload_start:payload_end]
                if payload in _OPENDOCUMENT_MIMES:
                    return payload.decode("ascii")

        # Resume the search after this entry so that a PK\x03\x04 sequence
        # inside its data is not mistaken for a header. With the
        # data-descriptor flag (bit 3) the size is not in the header, so only
        # the header, name and extra field are skipped.
        flags = int.from_bytes(window[header + 6 : header + 8], "little")
        if flags & 0x0008:
            cursor = payload_start
        else:
            cursor = payload_start + int.from_bytes(
                window[header + 18 : header + 22], "little"
            )
    return "application/zip"
206
207
def _make_result(mime: str) -> DetectionResult:
    """Build the :class:`DetectionResult` reported for a magic-number match.

    :param mime: The MIME type that was identified.
    :returns: A result with ``encoding=None``, ``language=None``,
        ``confidence=1.0`` and ``mime_type`` set to *mime*.
    """
    return DetectionResult(
        encoding=None,
        confidence=1.0,
        language=None,
        mime_type=mime,
    )
210
211
def detect_magic(data: bytes) -> DetectionResult | None:
    """Check *data* for known binary file magic numbers.

    Checks run in a fixed order and the first match wins: ``ftyp`` box,
    RIFF, FORM, ZIP, the shared Java-class/Mach-O-fat magic, TAR, and
    finally the offset-0 prefix table.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` with ``encoding=None`` and the
        identified MIME type, or ``None`` if no magic number matches.
    """
    if not data:
        return None

    size = len(data)
    head = data[:4]

    # ftyp box (MP4/MOV/HEIC/AVIF): "ftyp" at offset 4.
    # Bytes 0-3 are the box size (big-endian uint32). Valid ftyp boxes
    # have size >= 8 and <= file length. The upper bound prevents false
    # positives on text, where ASCII in bytes 0-3 yields a huge box size
    # (e.g. 0x54686520 for "The ").
    if size >= 12 and data[_FTYP_OFFSET : _FTYP_OFFSET + 4] == _FTYP_MARKER:
        box_size = int.from_bytes(head, "big")
        if 8 <= box_size <= size:
            brand = data[8:12]
            if brand in _FTYP_AVIF_BRANDS:
                return _make_result("image/avif")
            if brand in _FTYP_HEIC_BRANDS:
                return _make_result("image/heic")
            if brand in _FTYP_HEIF_BRANDS:
                return _make_result("image/heif")
            if brand in _FTYP_AUDIO_BRANDS:
                return _make_result("audio/mp4")
            if brand in _FTYP_QUICKTIME_BRANDS:
                return _make_result("video/quicktime")
            # Any other brand is reported as generic MP4 video.
            return _make_result("video/mp4")

    # RIFF container: subtype at bytes 8-11. Unknown subtypes fall through.
    if head == b"RIFF" and size >= 12:
        subtype = _RIFF_SUBTYPES.get(data[8:12])
        if subtype is not None:
            return _make_result(subtype)

    # FORM container (AIFF): same layout as RIFF.
    if head == b"FORM" and size >= 12:
        subtype = _FORM_SUBTYPES.get(data[8:12])
        if subtype is not None:
            return _make_result(subtype)

    # ZIP-based formats: always yields a result, plain ZIP at worst.
    if data.startswith(_ZIP_SIGNATURE):
        return _make_result(_classify_zip(data))

    # Java class file vs Mach-O fat binary (both \xca\xfe\xba\xbe): a small
    # big-endian value in bytes 4-7 is an architecture count, i.e. Mach-O.
    if head == _CAFEBABE and size >= 8:
        nfat_arch = int.from_bytes(data[4:8], "big")
        if nfat_arch <= _CAFEBABE_MAX_FAT_ARCHES:
            return _make_result("application/x-mach-binary")
        return _make_result("application/java-vm")

    # TAR archive: "ustar" at offset 257. This must run BEFORE the offset-0
    # prefix table. A tar header begins with the first member's file name,
    # so an archive whose first entry is named e.g. "BMP/..." or "MZ..."
    # would otherwise be claimed by a short prefix such as BM or MZ.
    if size >= _TAR_OFFSET + 6:
        tar_sig = data[_TAR_OFFSET : _TAR_OFFSET + 6]
        if tar_sig in _TAR_SIGNATURES:
            return _make_result("application/x-tar")

    # Fixed-offset magic numbers (all at offset 0); first match wins.
    for prefix, mime in _MAGIC_NUMBERS:
        if data.startswith(prefix):
            return _make_result(mime)

    return None