Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/magic.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

96 statements  

1"""Magic number detection for binary file types. 

2 

3Note: ``from __future__ import annotations`` is intentionally omitted because 

4this module is compiled with mypyc, which does not support PEP 563 string 

5annotations. 

6""" 

7 

8from chardet.pipeline import DetectionResult 

9 

# Signature table: (leading bytes, MIME type). Every entry is compared
# against the very start of the data, in the order listed, and the first
# match wins -- so wherever one signature is a prefix of another, the longer
# one must be listed first. Container formats that need sub-type logic
# (ftyp, RIFF, FORM, ZIP) are handled separately and do not appear here.
_MAGIC_NUMBERS: tuple[tuple[bytes, str], ...] = (
    # --- Images ---
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"MM\x00\x2a", "image/tiff"),
    (b"II\x2a\x00", "image/tiff"),
    (b"8BPS", "image/vnd.adobe.photoshop"),
    (b"qoif", "image/qoi"),
    (b"BM", "image/bmp"),
    (b"\xff\xd8\xff", "image/jpeg"),
    # JPEG XL, 12-byte container signature (listed ahead of the 2-byte
    # codestream signature below).
    (b"\x00\x00\x00\x0c\x4a\x58\x4c\x20\x0d\x0a\x87\x0a", "image/jxl"),
    # JPEG XL, 2-byte bare codestream signature.
    (b"\xff\x0a", "image/jxl"),
    # ICO -- not to be confused with TTF, which is \x00\x01\x00\x00.
    (b"\x00\x00\x01\x00", "image/vnd.microsoft.icon"),
    # --- Audio / Video ---
    (b"ID3", "audio/mpeg"),
    (b"MThd", "audio/midi"),
    (b"OggS", "audio/ogg"),
    (b"fLaC", "audio/flac"),
    (b"\x1a\x45\xdf\xa3", "video/webm"),
    # --- Archives (ZIP is classified separately to find its sub-type) ---
    (b"\x1f\x8b", "application/gzip"),
    (b"BZh", "application/x-bzip2"),
    (b"\xfd7zXZ\x00", "application/x-xz"),
    (b"7z\xbc\xaf\x27\x1c", "application/x-7z-compressed"),
    (b"Rar!\x1a\x07\x01\x00", "application/vnd.rar"),
    (b"Rar!\x1a\x07\x00", "application/vnd.rar"),
    (b"\x28\xb5\x2f\xfd", "application/zstd"),
    # --- Documents / Data ---
    (b"%PDF-", "application/pdf"),
    (b"SQLite format 3\x00", "application/x-sqlite3"),
    (b"ARROW1", "application/vnd.apache.arrow.file"),
    (b"PAR1", "application/vnd.apache.parquet"),
    (b"\x00asm", "application/wasm"),
    # --- Executables / Bytecode ---
    # \xca\xfe\xba\xbe is absent on purpose: Java class files and Mach-O fat
    # binaries share it, so it is disambiguated separately via bytes 4-7.
    (b"dex\n", "application/vnd.android.dex"),
    (b"\x7fELF", "application/x-elf"),
    (b"\xfe\xed\xfa\xce", "application/x-mach-binary"),
    (b"\xfe\xed\xfa\xcf", "application/x-mach-binary"),
    (b"\xce\xfa\xed\xfe", "application/x-mach-binary"),
    (b"\xcf\xfa\xed\xfe", "application/x-mach-binary"),
    (b"MZ", "application/vnd.microsoft.portable-executable"),
    # --- Fonts ---
    (b"wOFF", "font/woff"),
    (b"wOF2", "font/woff2"),
    (b"OTTO", "font/otf"),
    (b"\x00\x01\x00\x00", "font/ttf"),
)

70 

# TAR archives are recognised by a "ustar" marker 257 bytes into the data.
# Six bytes are compared: "ustar" followed by either a NUL or a space.
_TAR_OFFSET = 257
_TAR_SIGNATURES: tuple[bytes, ...] = (
    b"ustar\x00",
    b"ustar ",
)

74 

# RIFF containers: the four-byte type code at bytes 8-11 selects the MIME type.
_RIFF_SUBTYPES: dict[bytes, str] = {
    b"WEBP": "image/webp",
    b"WAVE": "audio/wav",
    b"AVI ": "video/x-msvideo",
}

# FORM containers use the RIFF layout (4-byte tag, 4-byte size, 4-byte type),
# so the type code is likewise read from bytes 8-11.
_FORM_SUBTYPES: dict[bytes, str] = {
    b"AIFF": "audio/aiff",
    b"AIFC": "audio/aiff",
}

87 

# ZIP-based formats are told apart by the local file headers found in the
# first 4 KB: each entry's filename (or content) hints at the real format.
# Walking the headers sequentially is not an option, because many ZIP
# generators set the data-descriptor flag on every entry and the entry sizes
# are then unknown without decompressing. The scan therefore searches for
# PK\x03\x04 signatures and inspects the filename/content fields it finds.
_ZIP_SIGNATURE = b"PK\x03\x04"
_ZIP_SCAN_LIMIT = 4 * 1024

95 

# Entry-name prefix -> MIME type. An entry whose filename starts with one of
# these prefixes identifies the archive; prefixes are tried in listed order.
_ZIP_FILENAME_PREFIXES: tuple[tuple[bytes, str], ...] = (
    # Office Open XML
    (
        b"xl/",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ),
    (
        b"word/",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ),
    (
        b"ppt/",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ),
    # Java
    (b"META-INF/MANIFEST.MF", "application/java-archive"),
    # Android
    (b"AndroidManifest.xml", "application/vnd.android.package-archive"),
    # EPUB
    (b"META-INF/container.xml", "application/epub+zip"),
)

# Entry-name marker -> MIME type. Despite the "suffix" name, ``_classify_zip``
# accepts a marker anywhere inside the filename (substring test), which is
# what lets ".dist-info/" match wheel entries such as
# "package-1.0.dist-info/WHEEL".
_ZIP_FILENAME_SUFFIXES: tuple[tuple[bytes, str], ...] = (
    (b".dist-info/", "application/x-wheel+zip"),
)

121 

# OpenDocument MIME types accepted when found verbatim as the content of a
# ZIP entry named "mimetype".
_OPENDOCUMENT_MIMES: frozenset[bytes] = frozenset(
    (
        b"application/vnd.oasis.opendocument.text",
        b"application/vnd.oasis.opendocument.spreadsheet",
        b"application/vnd.oasis.opendocument.presentation",
        b"application/vnd.oasis.opendocument.graphics",
    )
)

131 

# MP4/MOV/HEIC/AVIF "ftyp" box: the marker sits at offset 4 (right after the
# 4-byte box size) and the brand is read from the four bytes at offset 8.
_FTYP_MARKER = b"ftyp"
_FTYP_OFFSET = 4
# Brand -> MIME family. Each brand must be exactly four bytes long, because
# it is compared against the four-byte slice ``data[8:12]``; brand names
# shorter than four characters are padded with trailing spaces.
_FTYP_AVIF_BRANDS: frozenset[bytes] = frozenset({b"avif", b"avis"})
_FTYP_HEIC_BRANDS: frozenset[bytes] = frozenset({b"heic", b"heix"})
_FTYP_HEIF_BRANDS: frozenset[bytes] = frozenset({b"mif1", b"msf1"})
_FTYP_AUDIO_BRANDS: frozenset[bytes] = frozenset({b"M4A ", b"M4B ", b"F4A "})
# "qt" plus TWO spaces: a three-byte b"qt " could never equal a four-byte
# brand slice, which would make QuickTime files fall through to video/mp4.
_FTYP_QUICKTIME_BRANDS: frozenset[bytes] = frozenset({b"qt  "})

141 

# \xca\xfe\xba\xbe opens both Java class files and Mach-O fat binaries.
# Bytes 4-7 tell them apart: a Mach-O fat header stores nfat_arch there
# (big-endian uint32, typically 2-5), whereas a Java class stores
# minor_version (uint16) followed by major_version (uint16, 45+ for Java 1.1
# through modern Java), which reads as a much larger 32-bit number.
_CAFEBABE = b"\xca\xfe\xba\xbe"
# Upper bound on a plausible nfat_arch; no real fat binary exceeds this.
_CAFEBABE_MAX_FAT_ARCHES = 20

148 

149 

def _classify_zip(data: bytes) -> str:
    """Return the most specific MIME type for ZIP data.

    Only the first ``_ZIP_SCAN_LIMIT`` bytes are examined. Every local file
    header signature found in that window is treated as an entry: its
    filename is tested against ``_ZIP_FILENAME_PREFIXES`` (``startswith``)
    and ``_ZIP_FILENAME_SUFFIXES`` (substring), and a stored (uncompressed)
    entry named ``mimetype`` is read to recognise OpenDocument files.

    **Limitation:** an entry with the data-descriptor flag (bit 3) has no
    usable size in its header, so its content cannot be skipped and the
    search may land on a spurious signature inside compressed data. Deflate
    output rarely forms a valid header with a recognizable filename, so
    false positives are unlikely in practice.

    :param data: Bytes of the archive, starting at the first local header.
    :returns: The first specific MIME type matched, or ``"application/zip"``
        when the window holds no recognizable entry.
    """
    window = data[:_ZIP_SCAN_LIMIT]
    window_len = len(window)

    header = window.find(_ZIP_SIGNATURE)
    while header != -1:
        # The fixed part of a local file header is 30 bytes; the filename
        # follows immediately. Stop if the header itself is cut off.
        name_start = header + 30
        if name_start > window_len:
            break
        name_len = int.from_bytes(window[header + 26 : header + 28], "little")
        extra_len = int.from_bytes(window[header + 28 : header + 30], "little")
        name_end = name_start + name_len
        if name_end > window_len:
            break
        entry_name = window[name_start:name_end]
        payload_start = name_end + extra_len

        for known_prefix, mime in _ZIP_FILENAME_PREFIXES:
            if entry_name.startswith(known_prefix):
                return mime
        for marker, mime in _ZIP_FILENAME_SUFFIXES:
            if marker in entry_name:
                return mime

        # OpenDocument: a "mimetype" entry stored with method 0 holds the
        # MIME type as plain bytes.
        if (
            entry_name == b"mimetype"
            and int.from_bytes(window[header + 8 : header + 10], "little") == 0
        ):
            declared_len = int.from_bytes(window[header + 22 : header + 26], "little")
            payload_end = payload_start + declared_len
            if payload_end <= window_len:
                declared = window[payload_start:payload_end]
                if declared in _OPENDOCUMENT_MIMES:
                    return declared.decode("ascii")

        # Resume the search after this entry so that a PK\x03\x04 inside its
        # data is not mistaken for a header. With the data-descriptor flag
        # set the size is unknown: skip only the extra field and rely on the
        # signature search to find the next real header.
        general_flags = int.from_bytes(window[header + 6 : header + 8], "little")
        resume_at = payload_start
        if not general_flags & 0x0008:
            resume_at += int.from_bytes(window[header + 18 : header + 22], "little")
        header = window.find(_ZIP_SIGNATURE, resume_at)

    return "application/zip"

206 

207 

def _make_result(mime: str) -> DetectionResult:
    """Wrap *mime* in a full-confidence result with no encoding or language."""
    return DetectionResult(
        mime_type=mime,
        encoding=None,
        language=None,
        confidence=1.0,
    )

210 

211 

def detect_magic(data: bytes) -> DetectionResult | None:
    """Identify known binary formats from the magic numbers in *data*.

    Checks run in a fixed order and the first match wins: ftyp box, RIFF and
    FORM containers, ZIP, the shared ``\\xca\\xfe\\xba\\xbe`` signature, the
    ``_MAGIC_NUMBERS`` table, and finally the TAR marker at offset 257.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` with ``encoding=None`` and the
        identified MIME type, or ``None`` if *data* is empty or nothing
        matches.
    """
    if not data:
        return None

    size = len(data)
    head = data[:4]

    # ftyp box (MP4/MOV/HEIC/AVIF): bytes 0-3 are the big-endian box size and
    # "ftyp" follows at offset 4. The size must be at least 8 and must not
    # exceed the data length; the upper bound rejects text, whose leading
    # ASCII bytes decode to huge sizes (e.g. 0x54686520 for "The ").
    if (
        size >= 12
        and data[_FTYP_OFFSET : _FTYP_OFFSET + 4] == _FTYP_MARKER
        and 8 <= int.from_bytes(head, "big") <= size
    ):
        brand = data[8:12]
        if brand in _FTYP_AVIF_BRANDS:
            mime = "image/avif"
        elif brand in _FTYP_HEIC_BRANDS:
            mime = "image/heic"
        elif brand in _FTYP_HEIF_BRANDS:
            mime = "image/heif"
        elif brand in _FTYP_AUDIO_BRANDS:
            mime = "audio/mp4"
        elif brand in _FTYP_QUICKTIME_BRANDS:
            mime = "video/quicktime"
        else:
            mime = "video/mp4"
        return _make_result(mime)

    # RIFF, then FORM (AIFF): both keep their sub-type at bytes 8-11. An
    # unknown sub-type falls through to the remaining checks.
    if size >= 12:
        for tag, subtypes in ((b"RIFF", _RIFF_SUBTYPES), (b"FORM", _FORM_SUBTYPES)):
            if head == tag:
                found = subtypes.get(data[8:12])
                if found is not None:
                    return _make_result(found)

    # ZIP and ZIP-based formats.
    if data.startswith(_ZIP_SIGNATURE):
        return _make_result(_classify_zip(data))

    # Java class file vs Mach-O fat binary: a small big-endian value in
    # bytes 4-7 is taken as an architecture count, anything larger as a
    # Java version pair.
    if size >= 8 and head == _CAFEBABE:
        arch_count = int.from_bytes(data[4:8], "big")
        if arch_count > _CAFEBABE_MAX_FAT_ARCHES:
            return _make_result("application/java-vm")
        return _make_result("application/x-mach-binary")

    # Plain offset-0 signatures, in table order.
    for signature, mime in _MAGIC_NUMBERS:
        if data.startswith(signature):
            return _make_result(mime)

    # TAR archive: six-byte "ustar" marker at offset 257.
    tar_end = _TAR_OFFSET + 6
    if size >= tar_end and data[_TAR_OFFSET:tar_end] in _TAR_SIGNATURES:
        return _make_result("application/x-tar")

    return None