Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/unblob/handlers/filesystem/btrfs_stream.py: 32%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

238 statements  

1import enum 

2import io 

3import zlib 

4from pathlib import Path 

5 

6import pyzstd 

7from lzallright import LZOCompressor 

8 

9from unblob.file_utils import ( 

10 Endian, 

11 File, 

12 FileSystem, 

13 InvalidInputFormat, 

14 StructParser, 

15 iterate_file, 

16) 

17from unblob.models import ( 

18 Extractor, 

19 ExtractResult, 

20 HandlerDoc, 

21 HandlerType, 

22 HexString, 

23 Reference, 

24 StructHandler, 

25 ValidChunk, 

26) 

27from unblob.report import ExtractionProblem 

28 

29C_DEFINITIONS = """ 

30typedef struct stream_header { 

31 char magic[13]; // "btrfs-stream\0" 

32 uint32 version; 

33} stream_header_t; 

34 

35typedef struct cmd_header { 

36 uint32 data_len; 

37 uint16 cmd_type; 

38 uint32 crc32; // use Castagnoli polynomial and the seed 0x0 

39} cmd_header_t; 

40 

41typedef struct tlv_header_no_value { 

42 uint16 tlv_type; 

43 uint16 tlv_len; 

44} tlv_header_no_value_t; 

45 

46typedef struct tlv_header_type_only { 

47 uint16 tlv_type; 

48} tlv_header_type_only_t; 

49 

50typedef struct tlv_header { 

51 uint16 tlv_type; 

52 uint16 tlv_len; 

53 uint64 value; 

54} tlv_header_t; 

55 

56typedef struct tlv_header_u32 { 

57 uint16 tlv_type; 

58 uint16 tlv_len; 

59 uint32 value; 

60} tlv_header_u32_t; 

61 

62typedef struct tlv_header_path { 

63 uint16 tlv_type; 

64 uint16 tlv_len; 

65 char value[tlv_len]; 

66} tlv_header_path_t; 

67 

68typedef struct timespec { 

69 uint64 sec; 

70 uint32 nsec; 

71} timespec_t; 

72 

73typedef struct tlv_header_timespec { 

74 uint16 tlv_type; 

75 uint16 tlv_len; 

76 timespec_t value; 

77} tlv_header_timespec_t; 

78 

79typedef struct tlv_header_uuid { 

80 uint16 tlv_type; 

81 uint16 tlv_len; 

82 char value[16]; 

83} tlv_header_uuid_t; 

84 

85typedef struct mk_cmd { 

86 tlv_header_path_t path; 

87 tlv_header_t ino; 

88} mk_cmd_t; 

89 

90typedef struct symlink_cmd { 

91 tlv_header_path_t path; 

92 tlv_header_t ino; 

93 tlv_header_path_t path_link; 

94} symlink_cmd_t; 

95 

96typedef struct rename_cmd { 

97 tlv_header_path_t path; 

98 tlv_header_path_t path_to; 

99} rename_cmd_t; 

100 

101typedef struct link_cmd { 

102 tlv_header_path_t path; 

103 tlv_header_path_t link_to; 

104} link_cmd_t; 

105 

106typedef struct truncate_cmd { 

107 tlv_header_path_t path; 

108 tlv_header_t size; 

109} truncate_cmd_t; 

110 

111typedef struct mk_special_cmd { // MKNOD, MKSOCK, MKFIFO, in the documentation they don't have this structure 

112 tlv_header_path_t path; 

113 tlv_header_t ino; 

114 tlv_header_t rdev; 

115 tlv_header_t mode; 

116} mk_special_cmd_t; 

117 

118typedef struct rmdir_cmd { 

119 tlv_header_path_t path; 

120} rmdir_cmd_t; 

121 

122typedef struct unlink_cmd { 

123 tlv_header_path_t path; 

124} unlink_cmd_t; 

125 

126typedef struct write_cmd { 

127 tlv_header_path_t path; 

128 tlv_header_t offset; 

129} write_cmd_t; 

130 

131typedef struct clone_cmd { 

132 tlv_header_path_t path; 

133 tlv_header_t offset; 

134 tlv_header_t clone_len; 

135 tlv_header_t clone_uuid; 

136 tlv_header_t clone_ctransid; 

137 tlv_header_path_t clone_path; 

138 tlv_header_t clone_offset; 

139} clone_cmd_t; 

140 

141typedef struct subvol_cmd { 

142 tlv_header_path_t path; 

143 tlv_header_uuid_t uuid; 

144 tlv_header_t ctransid; 

145} subvol_cmd_t; 

146 

147typedef struct snapshot_cmd { 

148 tlv_header_path_t path; 

149 tlv_header_uuid_t uuid; 

150 tlv_header_t ctransid; 

151 tlv_header_uuid_t clone_uuid; 

152 tlv_header_t clone_ctransid; 

153} snapshot_cmd_t; 

154 

155typedef struct chmod_cmd { 

156 tlv_header_path_t path; 

157 tlv_header_t mode; 

158} chmod_cmd_t; 

159 

160typedef struct chown_cmd { 

161 tlv_header_path_t path; 

162 tlv_header_t uid; 

163 tlv_header_t gid; 

164} chown_cmd_t; 

165 

166typedef struct utimes_cmd { 

167 tlv_header_path_t path; 

168 tlv_header_timespec_t atime; 

169 tlv_header_timespec_t mtime; 

170 tlv_header_timespec_t ctime; 

171} utimes_cmd_t; 

172 

173typedef struct utimesV2_cmd { 

174 tlv_header_path_t path; 

175 tlv_header_timespec_t atime; 

176 tlv_header_timespec_t mtime; 

177 tlv_header_timespec_t ctime; 

178 tlv_header_timespec_t otime; 

179} utimesV2_cmd_t; 

180 

181typedef struct set_xattr_cmd { 

182 tlv_header_path_t path; 

183 tlv_header_path_t xattr_name; 

184 tlv_header_path_t xattr_data; 

185} set_xattr_cmd_t; 

186 

187typedef struct remove_xattr_cmd { 

188 tlv_header_path_t path; 

189 tlv_header_path_t xattr_name; 

190} remove_xattr_cmd_t; 

191 

192typedef struct update_extent_cmd { 

193 tlv_header_path_t path; 

194 tlv_header_t file_offset; 

195 tlv_header_t size; 

196} update_extent_cmd_t; 

197 

198 

199typedef struct fallocate_cmd { 

200 tlv_header_path_t path; 

201 tlv_header_t fallocate_mode; 

202 tlv_header_t file_offset; 

203 tlv_header_t size; 

204} fallocate_cmd_t; 

205 

206typedef struct fileattr_cmd { 

207 tlv_header_path_t path; 

208 tlv_header_t fileattr; 

209} fileattr_cmd_t; 

210 

211typedef struct btrfs_lzo_header { 

212 uint32 total_len; // total_len (includes itself) 

213} btrfs_lzo_header_t; 

214 

215typedef struct btrfs_lzo_segment { 

216 uint32 seg_len; 

217} btrfs_lzo_segment_t; 

218 

219typedef struct encoded_write_cmd { 

220 tlv_header_path_t path; 

221 tlv_header_t file_offset; 

222 tlv_header_t unencoded_file_len; 

223 tlv_header_t unencoded_len; 

224 tlv_header_t unencoded_offset; 

225 tlv_header_u32_t compression; 

226 tlv_header_u32_t encryption; // not implemented yet by btrfs send stream 

227 // DATA has no tlv_len in v2 — length is implicit (cmd.data_len - bytes consumed so far) 

228} encoded_write_cmd_t; 

229 

230""" 

231 

# Size in bytes of stream_header_t: a 13-byte magic followed by a uint32 version.
STREAM_HEADER_LEN = 13 + 4
# btrfs lays LZO-compressed data out in 4 KiB pages (see decompress_btrfs_lzo).
BTRFS_LZO_PAGE_SIZE = 4 * 1024

234 

235 

def makecrc32ctable():
    """Build the 256-entry lookup table for a byte-wise, reflected CRC-32C.

    Entry ``i`` is the remainder obtained by pushing the single byte ``i``
    through eight shift/xor rounds with the reversed Castagnoli polynomial.
    """
    reversed_poly = 0x82F63B78  # Castagnoli reversed polynomial (little-endian)
    entries = []
    for index in range(256):
        remainder = index
        for _bit in range(8):
            lsb_set = remainder & 1
            remainder >>= 1
            if lsb_set:
                remainder ^= reversed_poly
        entries.append(remainder)
    return entries


# Computed once at import time and shared by every checksum computation.
CRC32CTABLE = makecrc32ctable()


def calculate_crc32(data: bytes, crc: int = 0) -> int:
    """Fold ``data`` into a running table-driven CRC-32C value.

    ``crc`` is the running value to continue from (0 for a fresh checksum), so
    large inputs can be fed chunk by chunk. No initial or final inversion is
    applied here.
    """
    table = CRC32CTABLE
    for value in data:
        index = (crc ^ value) & 0xFF
        crc = table[index] ^ (crc >> 8)
    return crc

258 

259 

def valid_crc32(file: File, cmd_header) -> bool:
    """Verify the checksum of the command whose payload starts at the current position.

    The checksum is computed over the serialised command header with its
    ``crc32`` field set to zero, followed by ``cmd_header.data_len`` bytes of
    payload read from ``file``.

    Returns ``True`` when the computed value equals the one stored in the
    header; in that case ``file`` is rewound to the start of the payload so the
    caller can parse it. Returns ``False`` otherwise, without seeking back.

    ``cmd_header.crc32`` is zeroed only while the header is serialised and is
    restored afterwards, so the caller's header object is left unchanged.
    """
    expected_crc32 = cmd_header.crc32
    payload_start = file.tell()

    # The stored checksum was computed with the crc32 field itself set to 0.
    cmd_header.crc32 = 0x0
    try:
        computed_crc32 = calculate_crc32(cmd_header.dumps())
    finally:
        cmd_header.crc32 = expected_crc32

    # Feed the payload chunk by chunk to avoid loading it in memory at once.
    for chunk in iterate_file(file, payload_start, cmd_header.data_len):
        computed_crc32 = calculate_crc32(chunk, computed_crc32)

    if computed_crc32 != expected_crc32:
        return False

    file.seek(payload_start, io.SEEK_SET)
    return True

272 

273 

@enum.unique
class CmdType(enum.IntEnum):
    """Numeric command identifiers stored in ``cmd_header_t.cmd_type``."""

    UNSPEC = 0

    # Subvolume-level commands
    SUBVOL = 1
    SNAPSHOT = 2

    # Inode creation
    MKFILE = 3
    MKDIR = 4
    MKNOD = 5
    MKFIFO = 6
    MKSOCK = 7
    SYMLINK = 8

    # Namespace manipulation
    RENAME = 9
    LINK = 10
    UNLINK = 11
    RMDIR = 12

    # Extended attributes
    SET_XATTR = 13
    REMOVE_XATTR = 14

    # Data
    WRITE = 15
    CLONE = 16
    TRUNCATE = 17

    # Metadata
    CHMOD = 18
    CHOWN = 19
    UTIMES = 20

    # Terminates the command list
    END = 21

    UPDATE_EXTENT = 22

    # V2 commands
    FALLOCATE = 23
    FILEATTR = 24
    ENCODED_WRITE = 25

302 

303 

@enum.unique
class CompressionType(enum.IntEnum):
    """Compression identifiers carried by the ``compression`` TLV of an encoded write."""

    NONE = 0
    ZLIB = 1
    ZSTD = 2
    LZO = 3

309 

310 

311class BTRFSParser: 

312 def __init__(self, file: File, start_offset: int): 

313 self._struct_parser = StructParser(C_DEFINITIONS) 

314 self.file = file 

315 self.start_offset = start_offset 

316 self.file.seek(self.start_offset, io.SEEK_SET) 

317 self.stream_header = self._struct_parser.parse( 

318 "stream_header_t", self.file, Endian.LITTLE 

319 ) 

320 

321 def decompress_btrfs_lzo(self, data: bytes) -> bytes: 

322 # btrfs splits each lzo compressed extent into independent 4 KiB pages so the 

323 # kernel can read any page without decompressing the whole extent. 

324 stream = io.BytesIO(data) 

325 total_len = self._struct_parser.parse( 

326 "btrfs_lzo_header_t", 

327 stream, # pyright: ignore[reportArgumentType] 

328 Endian.LITTLE, 

329 ).total_len 

330 decompressed = [] 

331 while stream.tell() < total_len: 

332 page_remaining = BTRFS_LZO_PAGE_SIZE - (stream.tell() % BTRFS_LZO_PAGE_SIZE) 

333 # if <4 bytes remain in the page, the rest is padding. 

334 if page_remaining < 4: 

335 stream.seek(page_remaining, io.SEEK_CUR) 

336 continue 

337 seg_len = self._struct_parser.parse( 

338 "btrfs_lzo_segment_t", 

339 stream, # pyright: ignore[reportArgumentType] 

340 Endian.LITTLE, 

341 ).seg_len 

342 decompressed.append( 

343 LZOCompressor.decompress( 

344 stream.read(seg_len), output_size_hint=BTRFS_LZO_PAGE_SIZE 

345 ) 

346 ) 

347 return b"".join(decompressed) 

348 

349 def replay(self, fs: FileSystem) -> None: 

350 self.file.seek(self.start_offset + len(self.stream_header), io.SEEK_SET) 

351 cmd_header = self._struct_parser.parse("cmd_header_t", self.file, Endian.LITTLE) 

352 

353 if CmdType(cmd_header.cmd_type) == CmdType.SNAPSHOT: 

354 raise InvalidInputFormat("Incremental BTRFS streams are not supported") 

355 

356 while cmd_header.cmd_type != CmdType.END: 

357 if not valid_crc32(self.file, cmd_header): 

358 fs.record_problem( 

359 ExtractionProblem( 

360 problem=f"Command type : {cmd_header.cmd_type} has an invalid checksum", 

361 resolution="Skipped", 

362 ) 

363 ) 

364 else: 

365 self.replay_command(cmd_header, fs) 

366 cmd_header = self._struct_parser.parse( 

367 "cmd_header_t", self.file, Endian.LITTLE 

368 ) 

369 

370 def replay_command(self, cmd_header, fs: FileSystem) -> None: # noqa: C901 

371 match CmdType(cmd_header.cmd_type): 

372 case CmdType.MKFILE: 

373 command = self._struct_parser.parse( 

374 "mk_cmd_t", self.file, Endian.LITTLE 

375 ) 

376 path = Path(command.path.value.decode()) 

377 fs.write_bytes(path, b"") 

378 case CmdType.MKDIR: 

379 command = self._struct_parser.parse( 

380 "mk_cmd_t", self.file, Endian.LITTLE 

381 ) 

382 path = Path(command.path.value.decode()) 

383 fs.mkdir(path, parents=True, exist_ok=True) 

384 case CmdType.MKNOD | CmdType.MKSOCK: 

385 command = self._struct_parser.parse( 

386 "mk_special_cmd_t", self.file, Endian.LITTLE 

387 ) 

388 path = Path(command.path.value.decode()) 

389 fs.mknod(path, mode=command.mode.value, device=command.rdev.value) 

390 case CmdType.MKFIFO: 

391 command = self._struct_parser.parse( 

392 "mk_special_cmd_t", self.file, Endian.LITTLE 

393 ) 

394 fs.mkfifo(Path(command.path.value.decode())) 

395 case CmdType.SYMLINK: 

396 command = self._struct_parser.parse( 

397 "symlink_cmd_t", self.file, Endian.LITTLE 

398 ) 

399 fs.create_symlink( 

400 src=Path(command.path_link.value.decode()), 

401 dst=Path(command.path.value.decode()), 

402 ) 

403 case CmdType.RENAME: 

404 command = self._struct_parser.parse( 

405 "rename_cmd_t", self.file, Endian.LITTLE 

406 ) 

407 fs.rename( 

408 src=Path(command.path.value.decode()), 

409 dst=Path(command.path_to.value.decode()), 

410 ) 

411 case CmdType.LINK: 

412 command = self._struct_parser.parse( 

413 "link_cmd_t", self.file, Endian.LITTLE 

414 ) 

415 fs.create_hardlink( 

416 src=Path(command.link_to.value.decode()), 

417 dst=Path(command.path.value.decode()), 

418 ) 

419 case CmdType.RMDIR: 

420 command = self._struct_parser.parse( 

421 "rmdir_cmd_t", self.file, Endian.LITTLE 

422 ) 

423 fs.rmdir(Path(command.path.value.decode())) 

424 case CmdType.UNLINK: 

425 command = self._struct_parser.parse( 

426 "unlink_cmd_t", self.file, Endian.LITTLE 

427 ) 

428 fs.unlink(Path(command.path.value.decode())) 

429 case CmdType.SET_XATTR: 

430 command = self._struct_parser.parse( 

431 "set_xattr_cmd_t", self.file, Endian.LITTLE 

432 ) 

433 path = Path(command.path.value.decode()) 

434 name = command.xattr_name.value.decode() 

435 data = command.xattr_data.value 

436 fs.set_xattr(path, name, data) 

437 case CmdType.REMOVE_XATTR: 

438 command = self._struct_parser.parse( 

439 "remove_xattr_cmd_t", self.file, Endian.LITTLE 

440 ) 

441 path = Path(command.path.value.decode()) 

442 name = command.xattr_name.value.decode() 

443 fs.remove_xattr(path, name) 

444 case CmdType.WRITE: 

445 self.replay_write(fs, cmd_header) 

446 case CmdType.CLONE: 

447 self.replay_clone(fs) 

448 case CmdType.TRUNCATE: 

449 command = self._struct_parser.parse( 

450 "truncate_cmd_t", self.file, Endian.LITTLE 

451 ) 

452 path = Path(command.path.value.decode()) 

453 size = command.size.value 

454 fs.truncate(path, size) 

455 case CmdType.ENCODED_WRITE: 

456 self.replay_encoded_write(fs, cmd_header) 

457 case CmdType.UTIMES: 

458 if self.stream_header.version == 2: 

459 command = self._struct_parser.parse( 

460 "utimesV2_cmd_t", self.file, Endian.LITTLE 

461 ) 

462 else: 

463 command = self._struct_parser.parse( 

464 "utimes_cmd_t", self.file, Endian.LITTLE 

465 ) 

466 path = Path(command.path.value.decode()) 

467 times = (command.atime.value.sec, command.mtime.value.sec) 

468 fs.utime(path, times) 

469 case CmdType.CHMOD: 

470 command = self._struct_parser.parse( 

471 "chmod_cmd_t", self.file, Endian.LITTLE 

472 ) 

473 path = Path(command.path.value.decode()) 

474 mode = command.mode.value 

475 fs.chmod(path, mode) 

476 case ( 

477 CmdType.CHOWN 

478 | CmdType.UPDATE_EXTENT 

479 | CmdType.FALLOCATE 

480 | CmdType.FILEATTR 

481 ): 

482 fs.record_problem( 

483 ExtractionProblem( 

484 problem=f"Unsupported BTRFS stream command: {CmdType(cmd_header.cmd_type).name}", 

485 resolution="Skipped", 

486 ) 

487 ) 

488 self.file.seek(cmd_header.data_len, io.SEEK_CUR) 

489 case CmdType.UNSPEC | CmdType.END | CmdType.SUBVOL | CmdType.SNAPSHOT: 

490 self.file.seek(cmd_header.data_len, io.SEEK_CUR) 

491 

492 def replay_write(self, fs: FileSystem, cmd_header): 

493 command = self._struct_parser.parse("write_cmd_t", self.file, Endian.LITTLE) 

494 path = Path(command.path.value.decode()) 

495 if self.stream_header.version == 2: 

496 # v2: DATA TLV has no length field, skip the 2-byte type field 

497 tlv_type = self._struct_parser.parse( 

498 "tlv_header_type_only_t", self.file, Endian.LITTLE 

499 ) 

500 data_len = cmd_header.data_len - len(command) - len(tlv_type) 

501 else: 

502 # ok to handle this TLV this way, otherwise cstruct reads the whole data in ram 

503 tlv = self._struct_parser.parse( 

504 "tlv_header_no_value_t", self.file, Endian.LITTLE 

505 ) 

506 data_len = tlv.tlv_len 

507 

508 with fs.open(path, "rb+") as f: 

509 f.seek(command.offset.value, io.SEEK_SET) 

510 for chunk in iterate_file(self.file, self.file.tell(), data_len): 

511 f.write(chunk) 

512 

513 def replay_encoded_write(self, fs: FileSystem, cmd_header) -> None: 

514 command = self._struct_parser.parse( 

515 "encoded_write_cmd_t", self.file, Endian.LITTLE 

516 ) 

517 path = Path(command.path.value.decode()) 

518 # V2 only: DATA TLV has no tlv_len, skip the 2-byte type field 

519 tlv_type = self._struct_parser.parse( 

520 "tlv_header_type_only_t", self.file, Endian.LITTLE 

521 ) 

522 data_len = cmd_header.data_len - len(command) - len(tlv_type) 

523 

524 # data_len is bounded by BTRFS_MAX_COMPRESSED (128 KiB); large files are 

525 # split into multiple encoded_write commands, so reading at once is safe 

526 data = self.file.read(data_len) 

527 match CompressionType(command.compression.value): 

528 case CompressionType.NONE: 

529 decoded = data 

530 case CompressionType.LZO: 

531 decoded = self.decompress_btrfs_lzo(data) 

532 case CompressionType.ZLIB: 

533 decoded = zlib.decompressobj().decompress(data) 

534 case CompressionType.ZSTD: 

535 decoded = pyzstd.ZstdDecompressor().decompress(data) 

536 case _: 

537 raise InvalidInputFormat 

538 with fs.open(path, "rb+") as f: 

539 f.seek(command.file_offset.value, io.SEEK_SET) 

540 f.write(decoded) 

541 

542 def replay_clone(self, fs: FileSystem): 

543 command = self._struct_parser.parse("clone_cmd_t", self.file, Endian.LITTLE) 

544 dst_path = Path(command.path.value.decode()) 

545 dst_offset = command.offset.value 

546 src_len = command.clone_len.value 

547 src_path = Path(command.clone_path.value.decode()) 

548 src_offset = command.clone_offset.value 

549 with fs.open(src_path, "rb+") as src, fs.open(dst_path, "rb+") as dst: 

550 dst.seek(dst_offset, io.SEEK_SET) 

551 for chunk in iterate_file(src, src_offset, src_len): # pyright: ignore[reportArgumentType] 

552 dst.write(chunk) 

553 

554 

555class BTRFSStreamExtractor(Extractor): 

556 def extract(self, inpath: Path, outdir: Path) -> ExtractResult: 

557 fs = FileSystem(outdir) 

558 with File.from_path(inpath) as file: 

559 btrfs_parser = BTRFSParser(file, 0) 

560 btrfs_parser.replay(fs) 

561 return ExtractResult(reports=fs.problems) 

562 

563 

class BTRFSStreamHandler(StructHandler):
    """Handler that recognises BTRFS send streams (version 1 or 2) by their magic."""

    NAME = "btrfs_stream"
    # "btrfs-stream\0" followed by a little-endian uint32 version of 1 or 2
    PATTERNS = [HexString("62 74 72 66 73 2d 73 74 72 65 61 6d 00 (01 | 02) 00 00 00")]
    C_DEFINITIONS = C_DEFINITIONS
    HEADER_STRUCT = "stream_header_t"
    EXTRACTOR = BTRFSStreamExtractor()
    DOC = HandlerDoc(
        name="BTRFS Stream",
        description="A BTRFS send stream is a binary format used to transfer btrfs subvolume snapshots between filesystems. It encodes filesystem operations (file creation, directory structure, data writes, metadata) as a sequence of commands that can be replayed by btrfs receive to reconstruct the original snapshot. It supports both full sends (complete snapshot) and incremental sends (diff between two snapshots).",
        handler_type=HandlerType.FILESYSTEM,
        vendor=None,
        references=[
            Reference(
                title="BTRFS Stream Official Documentation",
                url="https://btrfs.readthedocs.io/en/latest/dev/dev-send-stream.html",
            ),
        ],
        limitations=["Does not support incremental streams."],
    )

    def calculate_chunk(self, file: File, start_offset: int) -> ValidChunk | None:
        """Walk the command headers to find where the stream ends.

        Starting right after the stream header, each command header is read and
        its payload skipped until the END command; the chunk ends right after
        the END command header.

        Raises:
            InvalidInputFormat: if a command header carries an unknown type.
        """
        file.seek(start_offset + STREAM_HEADER_LEN, io.SEEK_SET)
        while True:
            header = self.cparser_le.cmd_header_t(file)
            if header.cmd_type == CmdType.END:
                break
            try:
                CmdType(header.cmd_type)
            except ValueError as err:
                raise InvalidInputFormat(
                    f"Invalid BTRFS stream command type: {header.cmd_type}"
                ) from err
            # Payload content is irrelevant for sizing: jump over it.
            file.seek(header.data_len, io.SEEK_CUR)
        return ValidChunk(start_offset=start_offset, end_offset=file.tell())