Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/magic.py: 25%

"""Magic number detection for binary file types.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

8from chardet.pipeline import DetectionResult

10# (prefix_bytes, mime_type) — longest prefix first to avoid shorter prefixes

11# shadowing longer ones. All entries match at offset 0.

12# Formats with sub-type logic (ftyp, RIFF, FORM, ZIP) are handled separately.

13_MAGIC_NUMBERS: tuple[tuple[bytes, str], ...] = (

14 # Images

15 (b"\x89PNG\r\n\x1a\n", "image/png"),

16 (b"GIF87a", "image/gif"),

17 (b"GIF89a", "image/gif"),

18 (b"MM\x00\x2a", "image/tiff"),

19 (b"II\x2a\x00", "image/tiff"),

20 (b"8BPS", "image/vnd.adobe.photoshop"),

21 (b"qoif", "image/qoi"),

22 (b"BM", "image/bmp"),

23 (b"\xff\xd8\xff", "image/jpeg"),

24 # JPEG XL: 12-byte container signature (must precede the 2-byte codestream)

25 (

26 b"\x00\x00\x00\x0c\x4a\x58\x4c\x20\x0d\x0a\x87\x0a",

27 "image/jxl",

28 ),

29 # JPEG XL: 2-byte codestream signature

30 (b"\xff\x0a", "image/jxl"),

31 (

32 b"\x00\x00\x01\x00",

33 "image/vnd.microsoft.icon",

34 ), # ICO (not TTF — TTF is \x00\x01\x00\x00)

35 # Audio/Video

36 (b"ID3", "audio/mpeg"),

37 (b"MThd", "audio/midi"),

38 (b"OggS", "audio/ogg"),

39 (b"fLaC", "audio/flac"),

40 (b"\x1a\x45\xdf\xa3", "video/webm"),

41 # Archives (ZIP handled separately below for subtype detection)

42 (b"\x1f\x8b", "application/gzip"),

43 (b"BZh", "application/x-bzip2"),

44 (b"\xfd7zXZ\x00", "application/x-xz"),

45 (b"7z\xbc\xaf\x27\x1c", "application/x-7z-compressed"),

46 (b"Rar!\x1a\x07\x01\x00", "application/vnd.rar"),

47 (b"Rar!\x1a\x07\x00", "application/vnd.rar"),

48 (b"\x28\xb5\x2f\xfd", "application/zstd"),

49 # Documents / Data

50 (b"%PDF-", "application/pdf"),

51 (b"SQLite format 3\x00", "application/x-sqlite3"),

52 (b"ARROW1", "application/vnd.apache.arrow.file"),

53 (b"PAR1", "application/vnd.apache.parquet"),

54 (b"\x00asm", "application/wasm"),

55 # Executables / Bytecode (cafebabe handled separately — shared by Java

56 # class files and Mach-O fat binaries, disambiguated by bytes 4-7)

57 (b"dex\n", "application/vnd.android.dex"),

58 (b"\x7fELF", "application/x-elf"),

59 (b"\xfe\xed\xfa\xce", "application/x-mach-binary"),

60 (b"\xfe\xed\xfa\xcf", "application/x-mach-binary"),

61 (b"\xce\xfa\xed\xfe", "application/x-mach-binary"),

62 (b"\xcf\xfa\xed\xfe", "application/x-mach-binary"),

63 (b"MZ", "application/vnd.microsoft.portable-executable"),

64 # Fonts

65 (b"wOFF", "font/woff"),

66 (b"wOF2", "font/woff2"),

67 (b"OTTO", "font/otf"),

68 (b"\x00\x01\x00\x00", "font/ttf"),

69)

71# TAR archives have "ustar" at offset 257

72_TAR_OFFSET = 257

73_TAR_SIGNATURES: tuple[bytes, ...] = (b"ustar\x00", b"ustar ")

75# RIFF container subtypes — determined by bytes 8-11

76_RIFF_SUBTYPES: dict[bytes, str] = {

77 b"WEBP": "image/webp",

78 b"WAVE": "audio/wav",

79 b"AVI ": "video/x-msvideo",

80}

82# FORM container subtypes (same layout as RIFF: 4-byte tag, 4-byte size, 4-byte type)

83_FORM_SUBTYPES: dict[bytes, str] = {

84 b"AIFF": "audio/aiff",

85 b"AIFC": "audio/aiff",

86}

88# ZIP-based format detection — scan the first 4 KB for local file headers

89# and classify based on entry filenames or content. Many ZIP generators

90# set the data-descriptor flag on every entry, making sequential header

91# walking impossible without decompression. Instead we search for

92# PK\x03\x04 signatures and inspect the filename/content fields.

93_ZIP_SIGNATURE = b"PK\x03\x04"

94_ZIP_SCAN_LIMIT = 4096

96# Filename prefix → MIME type (checked against each entry's filename)

97_ZIP_FILENAME_PREFIXES: tuple[tuple[bytes, str], ...] = (

98 # Office Open XML

99 (b"xl/", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),

100 (

101 b"word/",

102 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

103 ),

104 (

105 b"ppt/",

106 "application/vnd.openxmlformats-officedocument.presentationml.presentation",

107 ),

108 # Java

109 (b"META-INF/MANIFEST.MF", "application/java-archive"),

110 # Android

111 (b"AndroidManifest.xml", "application/vnd.android.package-archive"),

112 # EPUB

113 (b"META-INF/container.xml", "application/epub+zip"),

114)

115

116# Filename suffix → MIME type (checked against each entry's filename)

117_ZIP_FILENAME_SUFFIXES: tuple[tuple[bytes, str], ...] = (

118 # Python wheels: entries like "package-1.0.dist-info/WHEEL"

119 (b".dist-info/", "application/x-wheel+zip"),

120)

121

122# OpenDocument MIME types recognized in the "mimetype" entry content.

123_OPENDOCUMENT_MIMES: frozenset[bytes] = frozenset(

124 {

125 b"application/vnd.oasis.opendocument.text",

126 b"application/vnd.oasis.opendocument.spreadsheet",

127 b"application/vnd.oasis.opendocument.presentation",

128 b"application/vnd.oasis.opendocument.graphics",

129 }

130)

131

132# MP4/MOV/HEIC ftyp box — "ftyp" at offset 4

133_FTYP_MARKER = b"ftyp"

134_FTYP_OFFSET = 4

135# Brand → MIME type for image ftyp brands

136_FTYP_AVIF_BRANDS: frozenset[bytes] = frozenset({b"avif", b"avis"})

137_FTYP_HEIC_BRANDS: frozenset[bytes] = frozenset({b"heic", b"heix"})

138_FTYP_HEIF_BRANDS: frozenset[bytes] = frozenset({b"mif1", b"msf1"})

139_FTYP_AUDIO_BRANDS: frozenset[bytes] = frozenset({b"M4A ", b"M4B ", b"F4A "})

140_FTYP_QUICKTIME_BRANDS: frozenset[bytes] = frozenset({b"qt "})

141

142# Java class file vs Mach-O fat binary — both start with \xca\xfe\xba\xbe.

143# Bytes 4-7 disambiguate: Mach-O fat stores nfat_arch (big-endian uint32,

144# typically 2-5), while Java class stores minor_version (uint16) +

145# major_version (uint16, 45+ for Java 1.1 through modern Java).

146_CAFEBABE = b"\xca\xfe\xba\xbe"

147_CAFEBABE_MAX_FAT_ARCHES = 20 # no real fat binary exceeds this

148

149

def _classify_zip(data: bytes) -> str:
    """Return the MIME type of a ZIP-based file, judged by its entries.

    Only the first ``_ZIP_SCAN_LIMIT`` bytes are examined. Each local file
    header found there has its filename tested against the known prefix and
    fragment tables; an entry named ``mimetype`` that is stored uncompressed
    additionally has its content compared with the OpenDocument MIME types.

    **Limitation:** an entry with the data-descriptor flag (bit 3) set
    records a compressed size of 0 in its header, so its content cannot be
    skipped. The search may then hit a spurious PK local file header
    signature inside compressed data. Deflate output rarely forms a valid
    header with a recognizable filename, so false positives are unlikely.

    :param data: The raw bytes of the file, starting at the ZIP signature.
    :returns: A specific MIME type when an entry identifies the format,
        otherwise ``"application/zip"``.
    """
    window = data[:_ZIP_SCAN_LIMIT]
    window_len = len(window)
    cursor = 0
    while True:
        header = window.find(_ZIP_SIGNATURE, cursor)
        # Stop when no further header exists or its fixed 30-byte part is
        # cut off by the scan window.
        if header < 0 or header + 30 > window_len:
            return "application/zip"

        name_begin = header + 30
        name_end = name_begin + int.from_bytes(
            window[header + 26 : header + 28], "little"
        )
        extra_len = int.from_bytes(window[header + 28 : header + 30], "little")
        if name_end > window_len:
            # The filename itself runs past the scan window.
            return "application/zip"
        entry_name = window[name_begin:name_end]

        # Filename starts with a known directory/file marker?
        for known_prefix, prefix_mime in _ZIP_FILENAME_PREFIXES:
            if entry_name.startswith(known_prefix):
                return prefix_mime

        # Filename contains a known fragment?
        for fragment, fragment_mime in _ZIP_FILENAME_SUFFIXES:
            if fragment in entry_name:
                return fragment_mime

        payload_begin = name_end + extra_len

        # OpenDocument: a "mimetype" entry whose content is stored verbatim
        # (compression method 0) spells out the document's MIME type.
        if entry_name == b"mimetype":
            method = int.from_bytes(window[header + 8 : header + 10], "little")
            if method == 0:
                payload_len = int.from_bytes(
                    window[header + 22 : header + 26], "little"
                )
                payload_end = payload_begin + payload_len
                if payload_end <= window_len:
                    payload = window[payload_begin:payload_end]
                    if payload in _OPENDOCUMENT_MIMES:
                        return payload.decode("ascii")

        # Resume the search after this entry's extra field and content so a
        # PK\x03\x04 inside file data is not mistaken for a header. With the
        # data-descriptor flag (bit 3) set the header's compressed size is 0;
        # only the extra field is skipped and the search finds the next real
        # header on a best-effort basis.
        gp_flags = int.from_bytes(window[header + 6 : header + 8], "little")
        if gp_flags & 0x0008:
            compressed_len = 0
        else:
            compressed_len = int.from_bytes(
                window[header + 18 : header + 22], "little"
            )
        cursor = payload_begin + compressed_len

206

207

def _make_result(mime: str) -> DetectionResult:
    """Wrap *mime* in a full-confidence result carrying no encoding or language."""
    return DetectionResult(
        encoding=None,
        confidence=1.0,
        language=None,
        mime_type=mime,
    )

210

211

def detect_magic(data: bytes) -> DetectionResult | None:
    """Identify a binary file format from the magic number in *data*.

    Formats whose leading signature is ambiguous on its own (ftyp boxes,
    RIFF/FORM containers, ZIP, and the shared Java/Mach-O signature) are
    examined first, then the flat ``_MAGIC_NUMBERS`` table, and finally the
    TAR marker at ``_TAR_OFFSET``.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` with ``encoding=None`` and the
        identified MIME type, or ``None`` if no magic number matches.
    """
    if not data:
        return None

    size = len(data)
    head = data[:4]

    # ftyp box (MP4/MOV/HEIC/AVIF): "ftyp" at offset 4, preceded by the box
    # size as a big-endian uint32. A real ftyp box has 8 <= size <= file
    # length; the upper bound rejects text, whose ASCII bytes 0-3 decode to
    # huge sizes (e.g. 0x54686520 for "The ").
    if (
        size >= 12
        and data[_FTYP_OFFSET : _FTYP_OFFSET + 4] == _FTYP_MARKER
        and 8 <= int.from_bytes(head, "big") <= size
    ):
        major_brand = data[8:12]
        for brands, brand_mime in (
            (_FTYP_AVIF_BRANDS, "image/avif"),
            (_FTYP_HEIC_BRANDS, "image/heic"),
            (_FTYP_HEIF_BRANDS, "image/heif"),
            (_FTYP_AUDIO_BRANDS, "audio/mp4"),
            (_FTYP_QUICKTIME_BRANDS, "video/quicktime"),
        ):
            if major_brand in brands:
                return _make_result(brand_mime)
        # Any other brand is treated as generic MP4 video.
        return _make_result("video/mp4")

    # RIFF and FORM (AIFF) containers share one layout; the subtype lives at
    # bytes 8-11. An unrecognised subtype falls through to the later checks.
    if size >= 12:
        form_type = data[8:12]
        container_mime: str | None = None
        if head == b"RIFF":
            container_mime = _RIFF_SUBTYPES.get(form_type)
        elif head == b"FORM":
            container_mime = _FORM_SUBTYPES.get(form_type)
        if container_mime is not None:
            return _make_result(container_mime)

    # ZIP and ZIP-based formats
    if data.startswith(_ZIP_SIGNATURE):
        return _make_result(_classify_zip(data))

    # Java class file vs Mach-O fat binary (both \xca\xfe\xba\xbe): a small
    # big-endian value in bytes 4-7 is taken as a fat-binary arch count.
    if size >= 8 and head == _CAFEBABE:
        arch_count = int.from_bytes(data[4:8], "big")
        if arch_count > _CAFEBABE_MAX_FAT_ARCHES:
            return _make_result("application/java-vm")
        return _make_result("application/x-mach-binary")

    # Flat table of signatures anchored at offset 0; first match wins.
    for signature, table_mime in _MAGIC_NUMBERS:
        if data.startswith(signature):
            return _make_result(table_mime)

    # TAR archive — "ustar" at offset 257
    if (
        size >= _TAR_OFFSET + 6
        and data[_TAR_OFFSET : _TAR_OFFSET + 6] in _TAR_SIGNATURES
    ):
        return _make_result("application/x-tar")

    return None