Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/dulwich/pack.py: 27%

1357 statements  

1# pack.py -- For dealing with packed git objects. 

2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net> 

3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk> 

4# 

5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

6# General Public License as published by the Free Software Foundation; version 2.0 

7# or (at your option) any later version. You can redistribute it and/or 

8# modify it under the terms of either of these two licenses. 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15# 

16# You should have received a copy of the licenses; if not, see 

17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

19# License, Version 2.0. 

20# 

21 

22"""Classes for dealing with packed git objects. 

23 

24A pack is a compact representation of a bunch of objects, stored 

25using deltas where possible. 

26 

27A pack has two parts: the pack file, which stores the data, and an index 

28that tells you where the data is. 

29 

30To find an object you look in all of the index files until you find a 

31match for the object name. You then use the offset from the index as a 

32pointer into the corresponding packfile. 

33""" 

34 
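# A minimal lookup sketch of the flow described above, using load_pack_index,
# PackData and their methods defined later in this module. The file paths and
# the SHA value are hypothetical, and _example_lookup_object is an
# illustrative helper rather than part of the dulwich API.
def _example_lookup_object():
    index = load_pack_index("objects/pack/pack-1234.idx")  # hypothetical path
    data = PackData("objects/pack/pack-1234.pack")
    sha = bytes.fromhex("aa" * 20)  # hypothetical 20-byte object name
    offset = index.object_offset(sha)  # raises KeyError if not present
    return data.get_object_at(offset)  # -> (type_num, chunks or delta tuple)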

35import binascii 

36from collections import defaultdict, deque 

37from contextlib import suppress 

38from io import BytesIO, UnsupportedOperation 

39 

40try: 

41 from cdifflib import CSequenceMatcher as SequenceMatcher 

42except ModuleNotFoundError: 

43 from difflib import SequenceMatcher 

44 

45import os 

46import struct 

47import sys 

48import warnings 

49import zlib 

50from hashlib import sha1 

51from itertools import chain 

52from os import SEEK_CUR, SEEK_END 

53from struct import unpack_from 

54from typing import ( 

55 BinaryIO, 

56 Callable, 

57 Deque, 

58 Dict, 

59 Generic, 

60 Iterable, 

61 Iterator, 

62 List, 

63 Optional, 

64 Protocol, 

65 Sequence, 

66 Set, 

67 Tuple, 

68 TypeVar, 

69 Union, 

70) 

71 

72try: 

73 import mmap 

74except ImportError: 

75 has_mmap = False 

76else: 

77 has_mmap = True 

78 

79# For some reason the above try, except fails to set has_mmap = False for plan9 

80if sys.platform == "Plan9": 

81 has_mmap = False 

82 

83from .errors import ApplyDeltaError, ChecksumMismatch 

84from .file import GitFile 

85from .lru_cache import LRUSizeCache 

86from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex 

87 

88OFS_DELTA = 6 

89REF_DELTA = 7 

90 

91DELTA_TYPES = (OFS_DELTA, REF_DELTA) 

92 

93 

94DEFAULT_PACK_DELTA_WINDOW_SIZE = 10 

95 

96# Keep pack files under 16 MB in memory, otherwise write them out to disk 

97PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024 

98 

99 

100OldUnpackedObject = Union[Tuple[Union[bytes, int], List[bytes]], List[bytes]] 

101ResolveExtRefFn = Callable[[bytes], Tuple[int, OldUnpackedObject]] 

102ProgressFn = Callable[[int, str], None] 

103PackHint = Tuple[int, Optional[bytes]] 

104 

105 

106class UnresolvedDeltas(Exception): 

107 """Delta objects could not be resolved.""" 

108 

109 def __init__(self, shas): 

110 self.shas = shas 

111 

112 

113class ObjectContainer(Protocol): 

114 def add_object(self, obj: ShaFile) -> None: 

115 """Add a single object to this object store.""" 

116 

117 def add_objects( 

118 self, 

119 objects: Sequence[Tuple[ShaFile, Optional[str]]], 

120 progress: Optional[Callable[[str], None]] = None, 

121 ) -> None: 

122 """Add a set of objects to this object store. 

123 

124 Args: 

125 objects: Iterable over a list of (object, path) tuples 

126 """ 

127 

128 def __contains__(self, sha1: bytes) -> bool: 

129 """Check if a hex sha is present.""" 

130 

131 def __getitem__(self, sha1: bytes) -> ShaFile: 

132 """Retrieve an object.""" 

133 

134 

135class PackedObjectContainer(ObjectContainer): 

136 def get_unpacked_object( 

137 self, sha1: bytes, *, include_comp: bool = False 

138 ) -> "UnpackedObject": 

139 """Get a raw unresolved object.""" 

140 raise NotImplementedError(self.get_unpacked_object) 

141 

142 def iterobjects_subset( 

143 self, shas: Iterable[bytes], *, allow_missing: bool = False 

144 ) -> Iterator[ShaFile]: 

145 raise NotImplementedError(self.iterobjects_subset) 

146 

147 def iter_unpacked_subset( 

148 self, 

149 shas: Set[bytes], 

150 include_comp: bool = False, 

151 allow_missing: bool = False, 

152 convert_ofs_delta: bool = True, 

153 ) -> Iterator["UnpackedObject"]: 

154 raise NotImplementedError(self.iter_unpacked_subset) 

155 

156 

157class UnpackedObjectStream: 

158 def __iter__(self) -> Iterator["UnpackedObject"]: 

159 raise NotImplementedError(self.__iter__) 

160 

161 def __len__(self) -> int: 

162 raise NotImplementedError(self.__len__) 

163 

164 

165def take_msb_bytes( 

166 read: Callable[[int], bytes], crc32: Optional[int] = None 

167) -> Tuple[List[int], Optional[int]]: 

168 """Read bytes marked with most significant bit. 

169 

170 Args: 

171 read: Read function 

172 """ 

173 ret: List[int] = [] 

174 while len(ret) == 0 or ret[-1] & 0x80: 

175 b = read(1) 

176 if crc32 is not None: 

177 crc32 = binascii.crc32(b, crc32) 

178 ret.append(ord(b[:1])) 

179 return ret, crc32 

180 
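# Sketch of how the MSB-continued bytes read above encode a pack object
# header: the low bits of the first byte carry the type and the lowest size
# bits, and each later byte adds seven more size bits. The byte values below
# are made up for illustration; this helper is not dulwich API.
def _example_take_msb_bytes():
    raw, _ = take_msb_bytes(BytesIO(b"\xb4\x06").read)  # MSB set on first byte only
    type_num = (raw[0] >> 4) & 0x07
    size = raw[0] & 0x0F
    for i, byte in enumerate(raw[1:]):
        size += (byte & 0x7F) << ((i * 7) + 4)
    return type_num, size  # -> (3, 100)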

181 

182class PackFileDisappeared(Exception): 

183 def __init__(self, obj) -> None: 

184 self.obj = obj 

185 

186 

187class UnpackedObject: 

188 """Class encapsulating an object unpacked from a pack file. 

189 

190 These objects should only be created from within unpack_object. Most 

191 members start out as empty and are filled in at various points by 

192 read_zlib_chunks, unpack_object, DeltaChainIterator, etc. 

193 

194 End users of this object should take care that the function they're getting 

195 this object from is guaranteed to set the members they need. 

196 """ 

197 

198 __slots__ = [ 

199 "offset", # Offset in its pack. 

200 "_sha", # Cached binary SHA. 

201 "obj_type_num", # Type of this object. 

202 "obj_chunks", # Decompressed and delta-resolved chunks. 

203 "pack_type_num", # Type of this object in the pack (may be a delta). 

204 "delta_base", # Delta base offset or SHA. 

205 "comp_chunks", # Compressed object chunks. 

206 "decomp_chunks", # Decompressed object chunks. 

207 "decomp_len", # Decompressed length of this object. 

208 "crc32", # CRC32. 

209 ] 

210 

211 obj_type_num: Optional[int] 

212 obj_chunks: Optional[List[bytes]] 

213 delta_base: Union[None, bytes, int] 

214 decomp_chunks: List[bytes] 

215 comp_chunks: Optional[List[bytes]] 

216 

217 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be 

218 # methods of this object. 

219 def __init__( 

220 self, 

221 pack_type_num, 

222 *, 

223 delta_base=None, 

224 decomp_len=None, 

225 crc32=None, 

226 sha=None, 

227 decomp_chunks=None, 

228 offset=None, 

229 ) -> None: 

230 self.offset = offset 

231 self._sha = sha 

232 self.pack_type_num = pack_type_num 

233 self.delta_base = delta_base 

234 self.comp_chunks = None 

235 self.decomp_chunks: List[bytes] = decomp_chunks or [] 

236 if decomp_chunks is not None and decomp_len is None: 

237 self.decomp_len = sum(map(len, decomp_chunks)) 

238 else: 

239 self.decomp_len = decomp_len 

240 self.crc32 = crc32 

241 

242 if pack_type_num in DELTA_TYPES: 

243 self.obj_type_num = None 

244 self.obj_chunks = None 

245 else: 

246 self.obj_type_num = pack_type_num 

247 self.obj_chunks = self.decomp_chunks 

248 self.delta_base = delta_base 

249 

250 def sha(self): 

251 """Return the binary SHA of this object.""" 

252 if self._sha is None: 

253 self._sha = obj_sha(self.obj_type_num, self.obj_chunks) 

254 return self._sha 

255 

256 def sha_file(self): 

257 """Return a ShaFile from this object.""" 

258 assert self.obj_type_num is not None and self.obj_chunks is not None 

259 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks) 

260 

261 # Only provided for backwards compatibility with code that expects either 

262 # chunks or a delta tuple. 

263 def _obj(self) -> OldUnpackedObject: 

264 """Return the decompressed chunks, or (delta base, delta chunks).""" 

265 if self.pack_type_num in DELTA_TYPES: 

266 assert isinstance(self.delta_base, (bytes, int)) 

267 return (self.delta_base, self.decomp_chunks) 

268 else: 

269 return self.decomp_chunks 

270 

271 def __eq__(self, other): 

272 if not isinstance(other, UnpackedObject): 

273 return False 

274 for slot in self.__slots__: 

275 if getattr(self, slot) != getattr(other, slot): 

276 return False 

277 return True 

278 

279 def __ne__(self, other): 

280 return not (self == other) 

281 

282 def __repr__(self) -> str: 

283 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__] 

284 return "{}({})".format(self.__class__.__name__, ", ".join(data)) 

285 

286 

287_ZLIB_BUFSIZE = 4096 

288 

289 

290def read_zlib_chunks( 

291 read_some: Callable[[int], bytes], 

292 unpacked: UnpackedObject, 

293 include_comp: bool = False, 

294 buffer_size: int = _ZLIB_BUFSIZE, 

295) -> bytes: 

296 """Read zlib data from a buffer. 

297 

298 This function requires that the buffer have additional data following the 

299 compressed data, which is guaranteed to be the case for git pack files. 

300 

301 Args: 

302 read_some: Read function that returns at least one byte, but may 

303 return less than the requested size. 

304 unpacked: An UnpackedObject to write result data to. If its crc32 

305 attr is not None, the CRC32 of the compressed bytes will be computed 

306 using this starting CRC32. 

307 After this function, will have the following attrs set: 

308 * comp_chunks (if include_comp is True) 

309 * decomp_chunks 

310 * decomp_len 

311 * crc32 

312 include_comp: If True, include compressed data in the result. 

313 buffer_size: Size of the read buffer. 

314 Returns: Leftover unused data from the decompression. 

315 

316 Raises: 

317 zlib.error: if a decompression error occurred. 

318 """ 

319 if unpacked.decomp_len <= -1: 

320 raise ValueError("non-negative zlib data stream size expected") 

321 decomp_obj = zlib.decompressobj() 

322 

323 comp_chunks = [] 

324 decomp_chunks = unpacked.decomp_chunks 

325 decomp_len = 0 

326 crc32 = unpacked.crc32 

327 

328 while True: 

329 add = read_some(buffer_size) 

330 if not add: 

331 raise zlib.error("EOF before end of zlib stream") 

332 comp_chunks.append(add) 

333 decomp = decomp_obj.decompress(add) 

334 decomp_len += len(decomp) 

335 decomp_chunks.append(decomp) 

336 unused = decomp_obj.unused_data 

337 if unused: 

338 left = len(unused) 

339 if crc32 is not None: 

340 crc32 = binascii.crc32(add[:-left], crc32) 

341 if include_comp: 

342 comp_chunks[-1] = add[:-left] 

343 break 

344 elif crc32 is not None: 

345 crc32 = binascii.crc32(add, crc32) 

346 if crc32 is not None: 

347 crc32 &= 0xFFFFFFFF 

348 

349 if decomp_len != unpacked.decomp_len: 

350 raise zlib.error("decompressed data does not match expected size") 

351 

352 unpacked.crc32 = crc32 

353 if include_comp: 

354 unpacked.comp_chunks = comp_chunks 

355 return unused 

356 

357 

358def iter_sha1(iter): 

359 """Return the hexdigest of the SHA1 over a set of names. 

360 

361 Args: 

362 iter: Iterator over string objects 

363 Returns: 40-byte hex sha1 digest 

364 """ 

365 sha = sha1() 

366 for name in iter: 

367 sha.update(name) 

368 return sha.hexdigest().encode("ascii") 

369 
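# Tiny usage sketch: iter_sha1 hashes the concatenation of the names it is
# given and returns the hex digest as ASCII bytes. Illustrative only.
def _example_iter_sha1():
    assert iter_sha1(iter([b"a", b"b"])) == sha1(b"ab").hexdigest().encode("ascii")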

370 

371def load_pack_index(path): 

372 """Load an index file by path. 

373 

374 Args: 

375 path: Path to the index file 

376 Returns: A PackIndex loaded from the given path 

377 """ 

378 with GitFile(path, "rb") as f: 

379 return load_pack_index_file(path, f) 

380 

381 

382def _load_file_contents(f, size=None): 

383 try: 

384 fd = f.fileno() 

385 except (UnsupportedOperation, AttributeError): 

386 fd = None 

387 # Attempt to use mmap if possible 

388 if fd is not None: 

389 if size is None: 

390 size = os.fstat(fd).st_size 

391 if has_mmap: 

392 try: 

393 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ) 

394 except OSError: 

395 # Perhaps a socket? 

396 pass 

397 else: 

398 return contents, size 

399 contents = f.read() 

400 size = len(contents) 

401 return contents, size 

402 

403 

404def load_pack_index_file(path, f): 

405 """Load an index file from a file-like object. 

406 

407 Args: 

408 path: Path for the index file 

409 f: File-like object 

410 Returns: A PackIndex loaded from the given file 

411 """ 

412 contents, size = _load_file_contents(f) 

413 if contents[:4] == b"\377tOc": 

414 version = struct.unpack(b">L", contents[4:8])[0] 

415 if version == 2: 

416 return PackIndex2(path, file=f, contents=contents, size=size) 

417 else: 

418 raise KeyError("Unknown pack index format %d" % version) 

419 else: 

420 return PackIndex1(path, file=f, contents=contents, size=size) 

421 

422 

423def bisect_find_sha(start, end, sha, unpack_name): 

424 """Find a SHA in a data blob with sorted SHAs. 

425 

426 Args: 

427 start: Start index of range to search 

428 end: End index of range to search 

429 sha: Sha to find 

430 unpack_name: Callback to retrieve SHA by index 

431 Returns: Index of the SHA, or None if it wasn't found 

432 """ 

433 assert start <= end 

434 while start <= end: 

435 i = (start + end) // 2 

436 file_sha = unpack_name(i) 

437 if file_sha < sha: 

438 start = i + 1 

439 elif file_sha > sha: 

440 end = i - 1 

441 else: 

442 return i 

443 return None 

444 
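# Sketch of bisect_find_sha over an in-memory sorted list, with start and end
# given as inclusive indexes; the fake 20-byte names are illustrative only.
def _example_bisect_find_sha():
    names = [b"\x01" * 20, b"\x02" * 20, b"\x03" * 20]
    found = bisect_find_sha(0, len(names) - 1, b"\x02" * 20, lambda i: names[i])
    return found  # -> 1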

445 

446PackIndexEntry = Tuple[bytes, int, Optional[int]] 

447 

448 

449class PackIndex: 

450 """An index in to a packfile. 

451 

452 Given a sha id of an object a pack index can tell you the location in the 

453 packfile of that object if it has it. 

454 """ 

455 

456 def __eq__(self, other): 

457 if not isinstance(other, PackIndex): 

458 return False 

459 

460 for (name1, _, _), (name2, _, _) in zip( 

461 self.iterentries(), other.iterentries() 

462 ): 

463 if name1 != name2: 

464 return False 

465 return True 

466 

467 def __ne__(self, other): 

468 return not self.__eq__(other) 

469 

470 def __len__(self) -> int: 

471 """Return the number of entries in this pack index.""" 

472 raise NotImplementedError(self.__len__) 

473 

474 def __iter__(self) -> Iterator[bytes]: 

475 """Iterate over the SHAs in this pack.""" 

476 return map(sha_to_hex, self._itersha()) 

477 

478 def iterentries(self) -> Iterator[PackIndexEntry]: 

479 """Iterate over the entries in this pack index. 

480 

481 Returns: iterator over tuples with object name, offset in packfile and 

482 crc32 checksum. 

483 """ 

484 raise NotImplementedError(self.iterentries) 

485 

486 def get_pack_checksum(self) -> bytes: 

487 """Return the SHA1 checksum stored for the corresponding packfile. 

488 

489 Returns: 20-byte binary digest 

490 """ 

491 raise NotImplementedError(self.get_pack_checksum) 

492 

493 def object_index(self, sha: bytes) -> int: 

494 warnings.warn( 

495 "Please use object_offset instead", DeprecationWarning, stacklevel=2 

496 ) 

497 return self.object_offset(sha) 

498 

499 def object_offset(self, sha: bytes) -> int: 

500 """Return the offset in to the corresponding packfile for the object. 

501 

502 Given the name of an object it will return the offset that object 

503 lives at within the corresponding pack file. If the pack file doesn't 

504 have the object then None will be returned. 

505 """ 

506 raise NotImplementedError(self.object_offset) 

507 

508 def object_sha1(self, index: int) -> bytes: 

509 """Return the SHA1 corresponding to the index in the pack file.""" 

510 for name, offset, crc32 in self.iterentries(): 

511 if offset == index: 

512 return name 

513 else: 

514 raise KeyError(index) 

515 

516 def _object_offset(self, sha: bytes) -> int: 

517 """See object_offset. 

518 

519 Args: 

520 sha: A *binary* SHA string (20 bytes long). 

521 """ 

522 raise NotImplementedError(self._object_offset) 

523 

524 def objects_sha1(self) -> bytes: 

525 """Return the hex SHA1 over all the shas of all objects in this pack. 

526 

527 Note: This is used for the filename of the pack. 

528 """ 

529 return iter_sha1(self._itersha()) 

530 

531 def _itersha(self) -> Iterator[bytes]: 

532 """Yield all the SHA1's of the objects in the index, sorted.""" 

533 raise NotImplementedError(self._itersha) 

534 

535 def close(self): 

536 pass 

537 

538 def check(self) -> None: 

539 pass 

540 

541 

542class MemoryPackIndex(PackIndex): 

543 """Pack index that is stored entirely in memory.""" 

544 

545 def __init__(self, entries, pack_checksum=None) -> None: 

546 """Create a new MemoryPackIndex. 

547 

548 Args: 

549 entries: Sequence of (name, offset, crc32) tuples, sorted by name 

550 pack_checksum: Optional pack checksum 

551 """ 

552 self._by_sha = {} 

553 self._by_offset = {} 

554 for name, offset, crc32 in entries: 

555 self._by_sha[name] = offset 

556 self._by_offset[offset] = name 

557 self._entries = entries 

558 self._pack_checksum = pack_checksum 

559 

560 def get_pack_checksum(self): 

561 return self._pack_checksum 

562 

563 def __len__(self) -> int: 

564 return len(self._entries) 

565 

566 def object_offset(self, sha): 

567 if len(sha) == 40: 

568 sha = hex_to_sha(sha) 

569 return self._by_sha[sha] 

570 

571 def object_sha1(self, offset): 

572 return self._by_offset[offset] 

573 

574 def _itersha(self): 

575 return iter(self._by_sha) 

576 

577 def iterentries(self): 

578 return iter(self._entries) 

579 

580 @classmethod 

581 def for_pack(cls, pack): 

582 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum()) 

583 

584 @classmethod 

585 def clone(cls, other_index): 

586 return cls(other_index.iterentries(), other_index.get_pack_checksum()) 

587 
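# Usage sketch for MemoryPackIndex: entries are (name, offset, crc32) tuples
# sorted by name. The single fake entry below is purely illustrative.
def _example_memory_pack_index():
    idx = MemoryPackIndex([(b"\xaa" * 20, 12, None)])
    assert len(idx) == 1
    assert idx.object_offset(b"\xaa" * 20) == 12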

588 

589class FilePackIndex(PackIndex): 

590 """Pack index that is based on a file. 

591 

592 To do the lookup it opens the file and reads the fan-out table: 256 

593 four-byte groups indexed by the first byte of the sha id. The value in the 

594 group at that index is the end of the run of entries that share the same 

595 starting byte; the value one index earlier gives the start of the run. 

596 Entries within the run are sorted by sha id, so the start and end 

597 offsets bound a bisection that determines whether the value is 

598 present. 

599 """ 

600 

601 _fan_out_table: List[int] 

602 

603 def __init__(self, filename, file=None, contents=None, size=None) -> None: 

604 """Create a pack index object. 

605 

606 Provide it with the name of the index file to consider, and it will map 

607 it whenever required. 

608 """ 

609 self._filename = filename 

610 # Take the size now, so it can be checked each time we map the file to 

611 # ensure that it hasn't changed. 

612 if file is None: 

613 self._file = GitFile(filename, "rb") 

614 else: 

615 self._file = file 

616 if contents is None: 

617 self._contents, self._size = _load_file_contents(self._file, size) 

618 else: 

619 self._contents, self._size = (contents, size) 

620 

621 @property 

622 def path(self) -> str: 

623 return self._filename 

624 

625 def __eq__(self, other): 

626 # Quick optimization: 

627 if ( 

628 isinstance(other, FilePackIndex) 

629 and self._fan_out_table != other._fan_out_table 

630 ): 

631 return False 

632 

633 return super().__eq__(other) 

634 

635 def close(self) -> None: 

636 self._file.close() 

637 if getattr(self._contents, "close", None) is not None: 

638 self._contents.close() 

639 

640 def __len__(self) -> int: 

641 """Return the number of entries in this pack index.""" 

642 return self._fan_out_table[-1] 

643 

644 def _unpack_entry(self, i: int) -> PackIndexEntry: 

645 """Unpack the i-th entry in the index file. 

646 

647 Returns: Tuple with object name (SHA), offset in pack file and CRC32 

648 checksum (if known). 

649 """ 

650 raise NotImplementedError(self._unpack_entry) 

651 

652 def _unpack_name(self, i): 

653 """Unpack the i-th name from the index file.""" 

654 raise NotImplementedError(self._unpack_name) 

655 

656 def _unpack_offset(self, i): 

657 """Unpack the i-th object offset from the index file.""" 

658 raise NotImplementedError(self._unpack_offset) 

659 

660 def _unpack_crc32_checksum(self, i): 

661 """Unpack the crc32 checksum for the ith object from the index file.""" 

662 raise NotImplementedError(self._unpack_crc32_checksum) 

663 

664 def _itersha(self) -> Iterator[bytes]: 

665 for i in range(len(self)): 

666 yield self._unpack_name(i) 

667 

668 def iterentries(self) -> Iterator[PackIndexEntry]: 

669 """Iterate over the entries in this pack index. 

670 

671 Returns: iterator over tuples with object name, offset in packfile and 

672 crc32 checksum. 

673 """ 

674 for i in range(len(self)): 

675 yield self._unpack_entry(i) 

676 

677 def _read_fan_out_table(self, start_offset: int): 

678 ret = [] 

679 for i in range(0x100): 

680 fanout_entry = self._contents[ 

681 start_offset + i * 4 : start_offset + (i + 1) * 4 

682 ] 

683 ret.append(struct.unpack(">L", fanout_entry)[0]) 

684 return ret 

685 

686 def check(self) -> None: 

687 """Check that the stored checksum matches the actual checksum.""" 

688 actual = self.calculate_checksum() 

689 stored = self.get_stored_checksum() 

690 if actual != stored: 

691 raise ChecksumMismatch(stored, actual) 

692 

693 def calculate_checksum(self) -> bytes: 

694 """Calculate the SHA1 checksum over this pack index. 

695 

696 Returns: This is a 20-byte binary digest 

697 """ 

698 return sha1(self._contents[:-20]).digest() 

699 

700 def get_pack_checksum(self) -> bytes: 

701 """Return the SHA1 checksum stored for the corresponding packfile. 

702 

703 Returns: 20-byte binary digest 

704 """ 

705 return bytes(self._contents[-40:-20]) 

706 

707 def get_stored_checksum(self) -> bytes: 

708 """Return the SHA1 checksum stored for this index. 

709 

710 Returns: 20-byte binary digest 

711 """ 

712 return bytes(self._contents[-20:]) 

713 

714 def object_offset(self, sha: bytes) -> int: 

715 """Return the offset in to the corresponding packfile for the object. 

716 

717 Given the name of an object it will return the offset that object 

718 lives at within the corresponding pack file. If the pack file doesn't 

719 have the object then None will be returned. 

720 """ 

721 if len(sha) == 40: 

722 sha = hex_to_sha(sha) 

723 try: 

724 return self._object_offset(sha) 

725 except ValueError as exc: 

726 closed = getattr(self._contents, "closed", None) 

727 if closed in (None, True): 

728 raise PackFileDisappeared(self) from exc 

729 raise 

730 

731 def _object_offset(self, sha: bytes) -> int: 

732 """See object_offset. 

733 

734 Args: 

735 sha: A *binary* SHA string (20 bytes long). 

736 """ 

737 assert len(sha) == 20 

738 idx = ord(sha[:1]) 

739 if idx == 0: 

740 start = 0 

741 else: 

742 start = self._fan_out_table[idx - 1] 

743 end = self._fan_out_table[idx] 

744 i = bisect_find_sha(start, end, sha, self._unpack_name) 

745 if i is None: 

746 raise KeyError(sha) 

747 return self._unpack_offset(i) 

748 

749 

750class PackIndex1(FilePackIndex): 

751 """Version 1 Pack Index file.""" 

752 

753 def __init__(self, filename: str, file=None, contents=None, size=None) -> None: 

754 super().__init__(filename, file, contents, size) 

755 self.version = 1 

756 self._fan_out_table = self._read_fan_out_table(0) 

757 

758 def _unpack_entry(self, i): 

759 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24)) 

760 return (name, offset, None) 

761 

762 def _unpack_name(self, i): 

763 offset = (0x100 * 4) + (i * 24) + 4 

764 return self._contents[offset : offset + 20] 

765 

766 def _unpack_offset(self, i): 

767 offset = (0x100 * 4) + (i * 24) 

768 return unpack_from(">L", self._contents, offset)[0] 

769 

770 def _unpack_crc32_checksum(self, i): 

771 # Not stored in v1 index files 

772 return None 

773 

774 

775class PackIndex2(FilePackIndex): 

776 """Version 2 Pack Index file.""" 

777 

778 def __init__(self, filename: str, file=None, contents=None, size=None) -> None: 

779 super().__init__(filename, file, contents, size) 

780 if self._contents[:4] != b"\377tOc": 

781 raise AssertionError("Not a v2 pack index file") 

782 (self.version,) = unpack_from(b">L", self._contents, 4) 

783 if self.version != 2: 

784 raise AssertionError("Version was %d" % self.version) 

785 self._fan_out_table = self._read_fan_out_table(8) 

786 self._name_table_offset = 8 + 0x100 * 4 

787 self._crc32_table_offset = self._name_table_offset + 20 * len(self) 

788 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self) 

789 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len( 

790 self 

791 ) 

792 

793 def _unpack_entry(self, i): 

794 return ( 

795 self._unpack_name(i), 

796 self._unpack_offset(i), 

797 self._unpack_crc32_checksum(i), 

798 ) 

799 

800 def _unpack_name(self, i): 

801 offset = self._name_table_offset + i * 20 

802 return self._contents[offset : offset + 20] 

803 

804 def _unpack_offset(self, i): 

805 offset = self._pack_offset_table_offset + i * 4 

806 offset = unpack_from(">L", self._contents, offset)[0] 

807 if offset & (2**31): 

808 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8 

809 offset = unpack_from(">Q", self._contents, offset)[0] 

810 return offset 

811 

812 def _unpack_crc32_checksum(self, i): 

813 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0] 

814 

815 

816def read_pack_header(read) -> Tuple[int, int]: 

817 """Read the header of a pack file. 

818 

819 Args: 

820 read: Read function 

821 Returns: Tuple of (pack version, number of objects). Raises 

822 AssertionError if the header is missing or malformed. 

823 """ 

824 header = read(12) 

825 if not header: 

826 raise AssertionError("file too short to contain pack") 

827 if header[:4] != b"PACK": 

828 raise AssertionError(f"Invalid pack header {header!r}") 

829 (version,) = unpack_from(b">L", header, 4) 

830 if version not in (2, 3): 

831 raise AssertionError("Version was %d" % version) 

832 (num_objects,) = unpack_from(b">L", header, 8) 

833 return (version, num_objects) 

834 
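# Sketch: the 12-byte pack header is b"PACK", a big-endian version and a
# big-endian object count, so a hand-built header round-trips through
# read_pack_header. The helper below is illustrative only.
def _example_read_pack_header():
    header = b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 3)
    assert read_pack_header(BytesIO(header).read) == (2, 3)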

835 

836def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int: 

837 if isinstance(chunks, bytes): 

838 return len(chunks) 

839 else: 

840 return sum(map(len, chunks)) 

841 

842 

843def unpack_object( 

844 read_all: Callable[[int], bytes], 

845 read_some: Optional[Callable[[int], bytes]] = None, 

846 compute_crc32=False, 

847 include_comp=False, 

848 zlib_bufsize=_ZLIB_BUFSIZE, 

849) -> Tuple[UnpackedObject, bytes]: 

850 """Unpack a Git object. 

851 

852 Args: 

853 read_all: Read function that blocks until the number of requested 

854 bytes are read. 

855 read_some: Read function that returns at least one byte, but may not 

856 return the number of bytes requested. 

857 compute_crc32: If True, compute the CRC32 of the compressed data. If 

858 False, the returned CRC32 will be None. 

859 include_comp: If True, include compressed data in the result. 

860 zlib_bufsize: An optional buffer size for zlib operations. 

861 Returns: A tuple of (unpacked, unused), where unused is the unused data 

862 leftover from decompression, and unpacked is an UnpackedObject with 

863 the following attrs set: 

864 

865 * obj_chunks (for non-delta types) 

866 * pack_type_num 

867 * delta_base (for delta types) 

868 * comp_chunks (if include_comp is True) 

869 * decomp_chunks 

870 * decomp_len 

871 * crc32 (if compute_crc32 is True) 

872 """ 

873 if read_some is None: 

874 read_some = read_all 

875 if compute_crc32: 

876 crc32 = 0 

877 else: 

878 crc32 = None 

879 

880 raw, crc32 = take_msb_bytes(read_all, crc32=crc32) 

881 type_num = (raw[0] >> 4) & 0x07 

882 size = raw[0] & 0x0F 

883 for i, byte in enumerate(raw[1:]): 

884 size += (byte & 0x7F) << ((i * 7) + 4) 

885 

886 delta_base: Union[int, bytes, None] 

887 raw_base = len(raw) 

888 if type_num == OFS_DELTA: 

889 raw, crc32 = take_msb_bytes(read_all, crc32=crc32) 

890 raw_base += len(raw) 

891 if raw[-1] & 0x80: 

892 raise AssertionError 

893 delta_base_offset = raw[0] & 0x7F 

894 for byte in raw[1:]: 

895 delta_base_offset += 1 

896 delta_base_offset <<= 7 

897 delta_base_offset += byte & 0x7F 

898 delta_base = delta_base_offset 

899 elif type_num == REF_DELTA: 

900 delta_base_obj = read_all(20) 

901 if crc32 is not None: 

902 crc32 = binascii.crc32(delta_base_obj, crc32) 

903 delta_base = delta_base_obj 

904 raw_base += 20 

905 else: 

906 delta_base = None 

907 

908 unpacked = UnpackedObject( 

909 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32 

910 ) 

911 unused = read_zlib_chunks( 

912 read_some, 

913 unpacked, 

914 buffer_size=zlib_bufsize, 

915 include_comp=include_comp, 

916 ) 

917 return unpacked, unused 

918 
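# Round-trip sketch: encode a blob (type 3) with pack_object_chunks (defined
# further down in this module) and decode it with unpack_object. The trailing
# padding satisfies read_zlib_chunks' requirement that data follow the
# compressed stream; this helper is an illustration, not dulwich API.
def _example_unpack_object_roundtrip():
    encoded = b"".join(pack_object_chunks(3, b"hello world")) + b"\x00" * 20
    unpacked, _unused = unpack_object(BytesIO(encoded).read)
    assert unpacked.pack_type_num == 3
    assert b"".join(unpacked.decomp_chunks) == b"hello world"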

919 

920def _compute_object_size(value): 

921 """Compute the size of a unresolved object for use with LRUSizeCache.""" 

922 (num, obj) = value 

923 if num in DELTA_TYPES: 

924 return chunks_length(obj[1]) 

925 return chunks_length(obj) 

926 

927 

928class PackStreamReader: 

929 """Class to read a pack stream. 

930 

931 The pack is read from a ReceivableProtocol using read() or recv() as 

932 appropriate. 

933 """ 

934 

935 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None: 

936 self.read_all = read_all 

937 if read_some is None: 

938 self.read_some = read_all 

939 else: 

940 self.read_some = read_some 

941 self.sha = sha1() 

942 self._offset = 0 

943 self._rbuf = BytesIO() 

944 # trailer is a deque to avoid memory allocation on small reads 

945 self._trailer: Deque[bytes] = deque() 

946 self._zlib_bufsize = zlib_bufsize 

947 

948 def _read(self, read, size): 

949 """Read up to size bytes using the given callback. 

950 

951 As a side effect, update the verifier's hash (excluding the last 20 

952 bytes read). 

953 

954 Args: 

955 read: The read callback to read from. 

956 size: The maximum number of bytes to read; the particular 

957 behavior is callback-specific. 

958 """ 

959 data = read(size) 

960 

961 # maintain a trailer of the last 20 bytes we've read 

962 n = len(data) 

963 self._offset += n 

964 tn = len(self._trailer) 

965 if n >= 20: 

966 to_pop = tn 

967 to_add = 20 

968 else: 

969 to_pop = max(n + tn - 20, 0) 

970 to_add = n 

971 self.sha.update( 

972 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])) 

973 ) 

974 self._trailer.extend(data[-to_add:]) 

975 

976 # hash everything but the trailer 

977 self.sha.update(data[:-to_add]) 

978 return data 

979 

980 def _buf_len(self): 

981 buf = self._rbuf 

982 start = buf.tell() 

983 buf.seek(0, SEEK_END) 

984 end = buf.tell() 

985 buf.seek(start) 

986 return end - start 

987 

988 @property 

989 def offset(self): 

990 return self._offset - self._buf_len() 

991 

992 def read(self, size): 

993 """Read, blocking until size bytes are read.""" 

994 buf_len = self._buf_len() 

995 if buf_len >= size: 

996 return self._rbuf.read(size) 

997 buf_data = self._rbuf.read() 

998 self._rbuf = BytesIO() 

999 return buf_data + self._read(self.read_all, size - buf_len) 

1000 

1001 def recv(self, size): 

1002 """Read up to size bytes, blocking until one byte is read.""" 

1003 buf_len = self._buf_len() 

1004 if buf_len: 

1005 data = self._rbuf.read(size) 

1006 if size >= buf_len: 

1007 self._rbuf = BytesIO() 

1008 return data 

1009 return self._read(self.read_some, size) 

1010 

1011 def __len__(self) -> int: 

1012 return self._num_objects 

1013 

1014 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]: 

1015 """Read the objects in this pack file. 

1016 

1017 Args: 

1018 compute_crc32: If True, compute the CRC32 of the compressed 

1019 data. If False, the returned CRC32 will be None. 

1020 Returns: Iterator over UnpackedObjects with the following members set: 

1021 offset 

1022 obj_type_num 

1023 obj_chunks (for non-delta types) 

1024 delta_base (for delta types) 

1025 decomp_chunks 

1026 decomp_len 

1027 crc32 (if compute_crc32 is True) 

1028 

1029 Raises: 

1030 ChecksumMismatch: if the checksum of the pack contents does not 

1031 match the checksum in the pack trailer. 

1032 zlib.error: if an error occurred during zlib decompression. 

1033 IOError: if an error occurred writing to the output file. 

1034 """ 

1035 pack_version, self._num_objects = read_pack_header(self.read) 

1036 

1037 for i in range(self._num_objects): 

1038 offset = self.offset 

1039 unpacked, unused = unpack_object( 

1040 self.read, 

1041 read_some=self.recv, 

1042 compute_crc32=compute_crc32, 

1043 zlib_bufsize=self._zlib_bufsize, 

1044 ) 

1045 unpacked.offset = offset 

1046 

1047 # prepend any unused data to current read buffer 

1048 buf = BytesIO() 

1049 buf.write(unused) 

1050 buf.write(self._rbuf.read()) 

1051 buf.seek(0) 

1052 self._rbuf = buf 

1053 

1054 yield unpacked 

1055 

1056 if self._buf_len() < 20: 

1057 # If the read buffer is full, then the last read() got the whole 

1058 # trailer off the wire. If not, it means there is still some of the 

1059 # trailer to read. We need to read() all 20 bytes; N come from the 

1060 # read buffer and (20 - N) come from the wire. 

1061 self.read(20) 

1062 

1063 pack_sha = bytearray(self._trailer) # type: ignore 

1064 if pack_sha != self.sha.digest(): 

1065 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest()) 

1066 

1067 

1068class PackStreamCopier(PackStreamReader): 

1069 """Class to verify a pack stream as it is being read. 

1070 

1071 The pack is read from a ReceivableProtocol using read() or recv() as 

1072 appropriate and written out to the given file-like object. 

1073 """ 

1074 

1075 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None: 

1076 """Initialize the copier. 

1077 

1078 Args: 

1079 read_all: Read function that blocks until the number of 

1080 requested bytes are read. 

1081 read_some: Read function that returns at least one byte, but may 

1082 not return the number of bytes requested. 

1083 outfile: File-like object to write output through. 

1084 delta_iter: Optional DeltaChainIterator to record deltas as we 

1085 read them. 

1086 """ 

1087 super().__init__(read_all, read_some=read_some) 

1088 self.outfile = outfile 

1089 self._delta_iter = delta_iter 

1090 

1091 def _read(self, read, size): 

1092 """Read data from the read callback and write it to the file.""" 

1093 data = super()._read(read, size) 

1094 self.outfile.write(data) 

1095 return data 

1096 

1097 def verify(self, progress=None): 

1098 """Verify a pack stream and write it to the output file. 

1099 

1100 See PackStreamReader.iterobjects for a list of exceptions this may 

1101 throw. 

1102 """ 

1103 i = 0 # default count of entries if read_objects() is empty 

1104 for i, unpacked in enumerate(self.read_objects()): 

1105 if self._delta_iter: 

1106 self._delta_iter.record(unpacked) 

1107 if progress is not None: 

1108 progress( 

1109 ("copying pack entries: %d/%d\r" % (i, len(self))).encode("ascii") 

1110 ) 

1111 if progress is not None: 

1112 progress(("copied %d pack entries\n" % i).encode("ascii")) 

1113 

1114 

1115def obj_sha(type, chunks): 

1116 """Compute the SHA for a numeric type and object chunks.""" 

1117 sha = sha1() 

1118 sha.update(object_header(type, chunks_length(chunks))) 

1119 if isinstance(chunks, bytes): 

1120 sha.update(chunks) 

1121 else: 

1122 for chunk in chunks: 

1123 sha.update(chunk) 

1124 return sha.digest() 

1125 
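# Sketch: obj_sha hashes the standard git header followed by the content, so
# for a blob it should agree with hashing b"blob <len>\0" + data directly
# (assuming object_header renders the header that way for type 3).
def _example_obj_sha():
    assert obj_sha(3, [b"hello ", b"world"]) == sha1(b"blob 11\x00hello world").digest()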

1126 

1127def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16): 

1128 """Hash a portion of a file into a new SHA. 

1129 

1130 Args: 

1131 f: A file-like object to read from that supports seek(). 

1132 start_ofs: The offset in the file to start reading at. 

1133 end_ofs: The offset in the file to end reading at, relative to the 

1134 end of the file. 

1135 buffer_size: A buffer size for reading. 

1136 Returns: A new SHA object updated with data read from the file. 

1137 """ 

1138 sha = sha1() 

1139 f.seek(0, SEEK_END) 

1140 length = f.tell() 

1141 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length: 

1142 raise AssertionError( 

1143 "Attempt to read beyond file length. " 

1144 "start_ofs: %d, end_ofs: %d, file length: %d" % (start_ofs, end_ofs, length) 

1145 ) 

1146 todo = length + end_ofs - start_ofs 

1147 f.seek(start_ofs) 

1148 while todo: 

1149 data = f.read(min(todo, buffer_size)) 

1150 sha.update(data) 

1151 todo -= len(data) 

1152 return sha 

1153 
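# Sketch: with end_ofs=-20, compute_file_sha hashes everything except a
# 20-byte trailer, which is how PackData.calculate_checksum uses it below.
# The in-memory file contents are made up for illustration.
def _example_compute_file_sha():
    f = BytesIO(b"some pack data" + b"\x00" * 20)
    assert compute_file_sha(f, end_ofs=-20).digest() == sha1(b"some pack data").digest()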

1154 

1155class PackData: 

1156 """The data contained in a packfile. 

1157 

1158 Pack files can be accessed both sequentially for exploding a pack, and 

1159 directly with the help of an index to retrieve a specific object. 

1160 

1161 The objects within are either complete or a delta against another. 

1162 

1163 Each object starts with a variable-length header. If the MSB of a byte is 

1164 set, then the subsequent byte is still part of the header. 

1165 In the first byte the next three MS bits are the type, which tells you the type 

1166 of object and whether it is a delta, and the LS 4 bits are the lowest bits of the 

1167 size. In each subsequent byte the LS 7 bits are the next MS bits of the 

1168 size, i.e. the last byte of the header contains the MS bits of the size. 

1169 

1170 For the complete objects the data is stored as zlib deflated data. 

1171 The size in the header is the uncompressed object size, so to uncompress 

1172 you need to just keep feeding data to zlib until you get an object back, 

1173 or it errors on bad data. This is done here by just giving the complete 

1174 buffer from the start of the deflated object on. This is bad, but until I 

1175 get mmap sorted out it will have to do. 

1176 

1177 Currently there are no integrity checks done. Also no attempt is made to 

1178 try and detect the delta case, or a request for an object at the wrong 

1179 position. It will all just throw a zlib or KeyError. 

1180 """ 

1181 

1182 def __init__(self, filename, file=None, size=None) -> None: 

1183 """Create a PackData object representing the pack in the given filename. 

1184 

1185 The file must exist and stay readable until the object is disposed of. 

1186 It must also stay the same size. It will be mapped whenever needed. 

1187 

1188 Currently there is a restriction on the size of the pack as the python 

1189 mmap implementation is flawed. 

1190 """ 

1191 self._filename = filename 

1192 self._size = size 

1193 self._header_size = 12 

1194 if file is None: 

1195 self._file = GitFile(self._filename, "rb") 

1196 else: 

1197 self._file = file 

1198 (version, self._num_objects) = read_pack_header(self._file.read) 

1199 self._offset_cache = LRUSizeCache[int, Tuple[int, OldUnpackedObject]]( 

1200 1024 * 1024 * 20, compute_size=_compute_object_size 

1201 ) 

1202 

1203 @property 

1204 def filename(self): 

1205 return os.path.basename(self._filename) 

1206 

1207 @property 

1208 def path(self): 

1209 return self._filename 

1210 

1211 @classmethod 

1212 def from_file(cls, file, size=None): 

1213 return cls(str(file), file=file, size=size) 

1214 

1215 @classmethod 

1216 def from_path(cls, path): 

1217 return cls(filename=path) 

1218 

1219 def close(self): 

1220 self._file.close() 

1221 

1222 def __enter__(self): 

1223 return self 

1224 

1225 def __exit__(self, exc_type, exc_val, exc_tb): 

1226 self.close() 

1227 

1228 def __eq__(self, other): 

1229 if isinstance(other, PackData): 

1230 return self.get_stored_checksum() == other.get_stored_checksum() 

1231 return False 

1232 

1233 def _get_size(self): 

1234 if self._size is not None: 

1235 return self._size 

1236 self._size = os.path.getsize(self._filename) 

1237 if self._size < self._header_size: 

1238 errmsg = "%s is too small for a packfile (%d < %d)" % ( 

1239 self._filename, 

1240 self._size, 

1241 self._header_size, 

1242 ) 

1243 raise AssertionError(errmsg) 

1244 return self._size 

1245 

1246 def __len__(self) -> int: 

1247 """Returns the number of objects in this pack.""" 

1248 return self._num_objects 

1249 

1250 def calculate_checksum(self): 

1251 """Calculate the checksum for this pack. 

1252 

1253 Returns: 20-byte binary SHA1 digest 

1254 """ 

1255 return compute_file_sha(self._file, end_ofs=-20).digest() 

1256 

1257 def iter_unpacked(self, *, include_comp: bool = False): 

1258 self._file.seek(self._header_size) 

1259 

1260 if self._num_objects is None: 

1261 return 

1262 

1263 for _ in range(self._num_objects): 

1264 offset = self._file.tell() 

1265 unpacked, unused = unpack_object( 

1266 self._file.read, compute_crc32=False, include_comp=include_comp 

1267 ) 

1268 unpacked.offset = offset 

1269 yield unpacked 

1270 # Back up over unused data. 

1271 self._file.seek(-len(unused), SEEK_CUR) 

1272 

1273 def iterentries( 

1274 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None 

1275 ): 

1276 """Yield entries summarizing the contents of this pack. 

1277 

1278 Args: 

1279 progress: Progress function, called with current and total 

1280 object count. 

1281 Returns: iterator of tuples with (sha, offset, crc32) 

1282 """ 

1283 num_objects = self._num_objects 

1284 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref) 

1285 for i, result in enumerate(indexer): 

1286 if progress is not None: 

1287 progress(i, num_objects) 

1288 yield result 

1289 

1290 def sorted_entries( 

1291 self, 

1292 progress: Optional[ProgressFn] = None, 

1293 resolve_ext_ref: Optional[ResolveExtRefFn] = None, 

1294 ): 

1295 """Return entries in this pack, sorted by SHA. 

1296 

1297 Args: 

1298 progress: Progress function, called with current and total 

1299 object count 

1300 Returns: Iterator of tuples with (sha, offset, crc32) 

1301 """ 

1302 return sorted( 

1303 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref) 

1304 ) 

1305 

1306 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None): 

1307 """Create a version 1 file for this data file. 

1308 

1309 Args: 

1310 filename: Index filename. 

1311 progress: Progress report function 

1312 Returns: Checksum of index file 

1313 """ 

1314 entries = self.sorted_entries( 

1315 progress=progress, resolve_ext_ref=resolve_ext_ref 

1316 ) 

1317 with GitFile(filename, "wb") as f: 

1318 return write_pack_index_v1(f, entries, self.calculate_checksum()) 

1319 

1320 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None): 

1321 """Create a version 2 index file for this data file. 

1322 

1323 Args: 

1324 filename: Index filename. 

1325 progress: Progress report function 

1326 Returns: Checksum of index file 

1327 """ 

1328 entries = self.sorted_entries( 

1329 progress=progress, resolve_ext_ref=resolve_ext_ref 

1330 ) 

1331 with GitFile(filename, "wb") as f: 

1332 return write_pack_index_v2(f, entries, self.calculate_checksum()) 

1333 

1334 def create_index(self, filename, progress=None, version=2, resolve_ext_ref=None): 

1335 """Create an index file for this data file. 

1336 

1337 Args: 

1338 filename: Index filename. 

1339 progress: Progress report function 

1340 Returns: Checksum of index file 

1341 """ 

1342 if version == 1: 

1343 return self.create_index_v1( 

1344 filename, progress, resolve_ext_ref=resolve_ext_ref 

1345 ) 

1346 elif version == 2: 

1347 return self.create_index_v2( 

1348 filename, progress, resolve_ext_ref=resolve_ext_ref 

1349 ) 

1350 else: 

1351 raise ValueError("unknown index format %d" % version) 

1352 

1353 def get_stored_checksum(self): 

1354 """Return the expected checksum stored in this pack.""" 

1355 self._file.seek(-20, SEEK_END) 

1356 return self._file.read(20) 

1357 

1358 def check(self): 

1359 """Check the consistency of this pack.""" 

1360 actual = self.calculate_checksum() 

1361 stored = self.get_stored_checksum() 

1362 if actual != stored: 

1363 raise ChecksumMismatch(stored, actual) 

1364 

1365 def get_unpacked_object_at( 

1366 self, offset: int, *, include_comp: bool = False 

1367 ) -> UnpackedObject: 

1368 """Given offset in the packfile return a UnpackedObject.""" 

1369 assert offset >= self._header_size 

1370 self._file.seek(offset) 

1371 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp) 

1372 unpacked.offset = offset 

1373 return unpacked 

1374 

1375 def get_object_at(self, offset: int) -> Tuple[int, OldUnpackedObject]: 

1376 """Given an offset in to the packfile return the object that is there. 

1377 

1378 Using the associated index the location of an object can be looked up, 

1379 and then the packfile can be asked directly for that object using this 

1380 function. 

1381 """ 

1382 try: 

1383 return self._offset_cache[offset] 

1384 except KeyError: 

1385 pass 

1386 unpacked = self.get_unpacked_object_at(offset, include_comp=False) 

1387 return (unpacked.pack_type_num, unpacked._obj()) 

1388 

1389 

1390T = TypeVar("T") 

1391 

1392 

1393class DeltaChainIterator(Generic[T]): 

1394 """Abstract iterator over pack data based on delta chains. 

1395 

1396 Each object in the pack is guaranteed to be inflated exactly once, 

1397 regardless of how many objects reference it as a delta base. As a result, 

1398 memory usage is proportional to the length of the longest delta chain. 

1399 

1400 Subclasses can override _result to define the result type of the iterator. 

1401 By default, results are UnpackedObjects with the following members set: 

1402 

1403 * offset 

1404 * obj_type_num 

1405 * obj_chunks 

1406 * pack_type_num 

1407 * delta_base (for delta types) 

1408 * comp_chunks (if _include_comp is True) 

1409 * decomp_chunks 

1410 * decomp_len 

1411 * crc32 (if _compute_crc32 is True) 

1412 """ 

1413 

1414 _compute_crc32 = False 

1415 _include_comp = False 

1416 

1417 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None: 

1418 self._file = file_obj 

1419 self._resolve_ext_ref = resolve_ext_ref 

1420 self._pending_ofs: Dict[int, List[int]] = defaultdict(list) 

1421 self._pending_ref: Dict[bytes, List[int]] = defaultdict(list) 

1422 self._full_ofs: List[Tuple[int, int]] = [] 

1423 self._ext_refs: List[bytes] = [] 

1424 

1425 @classmethod 

1426 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None): 

1427 walker = cls(None, resolve_ext_ref=resolve_ext_ref) 

1428 walker.set_pack_data(pack_data) 

1429 for unpacked in pack_data.iter_unpacked(include_comp=False): 

1430 walker.record(unpacked) 

1431 return walker 

1432 

1433 @classmethod 

1434 def for_pack_subset( 

1435 cls, 

1436 pack: "Pack", 

1437 shas: Iterable[bytes], 

1438 *, 

1439 allow_missing: bool = False, 

1440 resolve_ext_ref=None, 

1441 ): 

1442 walker = cls(None, resolve_ext_ref=resolve_ext_ref) 

1443 walker.set_pack_data(pack.data) 

1444 todo = set() 

1445 for sha in shas: 

1446 assert isinstance(sha, bytes) 

1447 try: 

1448 off = pack.index.object_offset(sha) 

1449 except KeyError: 

1450 if not allow_missing: 

1451 raise 

1452 else: 

1453 todo.add(off) 

1454 done = set() 

1455 while todo: 

1456 off = todo.pop() 

1457 unpacked = pack.data.get_unpacked_object_at(off) 

1458 walker.record(unpacked) 

1459 done.add(off) 

1460 base_ofs = None 

1461 if unpacked.pack_type_num == OFS_DELTA: 

1462 base_ofs = unpacked.offset - unpacked.delta_base 

1463 elif unpacked.pack_type_num == REF_DELTA: 

1464 with suppress(KeyError): 

1465 assert isinstance(unpacked.delta_base, bytes) 

1466 base_ofs = pack.index.object_index(unpacked.delta_base) 

1467 if base_ofs is not None and base_ofs not in done: 

1468 todo.add(base_ofs) 

1469 return walker 

1470 

1471 def record(self, unpacked: UnpackedObject) -> None: 

1472 type_num = unpacked.pack_type_num 

1473 offset = unpacked.offset 

1474 if type_num == OFS_DELTA: 

1475 base_offset = offset - unpacked.delta_base 

1476 self._pending_ofs[base_offset].append(offset) 

1477 elif type_num == REF_DELTA: 

1478 assert isinstance(unpacked.delta_base, bytes) 

1479 self._pending_ref[unpacked.delta_base].append(offset) 

1480 else: 

1481 self._full_ofs.append((offset, type_num)) 

1482 

1483 def set_pack_data(self, pack_data: PackData) -> None: 

1484 self._file = pack_data._file 

1485 

1486 def _walk_all_chains(self): 

1487 for offset, type_num in self._full_ofs: 

1488 yield from self._follow_chain(offset, type_num, None) 

1489 yield from self._walk_ref_chains() 

1490 assert not self._pending_ofs, repr(self._pending_ofs) 

1491 

1492 def _ensure_no_pending(self) -> None: 

1493 if self._pending_ref: 

1494 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref]) 

1495 

1496 def _walk_ref_chains(self): 

1497 if not self._resolve_ext_ref: 

1498 self._ensure_no_pending() 

1499 return 

1500 

1501 for base_sha, pending in sorted(self._pending_ref.items()): 

1502 if base_sha not in self._pending_ref: 

1503 continue 

1504 try: 

1505 type_num, chunks = self._resolve_ext_ref(base_sha) 

1506 except KeyError: 

1507 # Not an external ref, but may depend on one. Either it will 

1508 # get popped via a _follow_chain call, or we will raise an 

1509 # error below. 

1510 continue 

1511 self._ext_refs.append(base_sha) 

1512 self._pending_ref.pop(base_sha) 

1513 for new_offset in pending: 

1514 yield from self._follow_chain(new_offset, type_num, chunks) 

1515 

1516 self._ensure_no_pending() 

1517 

1518 def _result(self, unpacked: UnpackedObject) -> T: 

1519 raise NotImplementedError 

1520 

1521 def _resolve_object( 

1522 self, offset: int, obj_type_num: int, base_chunks: List[bytes] 

1523 ) -> UnpackedObject: 

1524 self._file.seek(offset) 

1525 unpacked, _ = unpack_object( 

1526 self._file.read, 

1527 include_comp=self._include_comp, 

1528 compute_crc32=self._compute_crc32, 

1529 ) 

1530 unpacked.offset = offset 

1531 if base_chunks is None: 

1532 assert unpacked.pack_type_num == obj_type_num 

1533 else: 

1534 assert unpacked.pack_type_num in DELTA_TYPES 

1535 unpacked.obj_type_num = obj_type_num 

1536 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks) 

1537 return unpacked 

1538 

1539 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: List[bytes]): 

1540 # Unlike PackData.get_object_at, there is no need to cache offsets as 

1541 # this approach by design inflates each object exactly once. 

1542 todo = [(offset, obj_type_num, base_chunks)] 

1543 while todo: 

1544 (offset, obj_type_num, base_chunks) = todo.pop() 

1545 unpacked = self._resolve_object(offset, obj_type_num, base_chunks) 

1546 yield self._result(unpacked) 

1547 

1548 unblocked = chain( 

1549 self._pending_ofs.pop(unpacked.offset, []), 

1550 self._pending_ref.pop(unpacked.sha(), []), 

1551 ) 

1552 todo.extend( 

1553 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore 

1554 for new_offset in unblocked 

1555 ) 

1556 

1557 def __iter__(self) -> Iterator[T]: 

1558 return self._walk_all_chains() 

1559 

1560 def ext_refs(self): 

1561 return self._ext_refs 

1562 

1563 

1564class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]): 

1565 """Delta chain iterator that yield unpacked objects.""" 

1566 

1567 def _result(self, unpacked): 

1568 return unpacked 

1569 

1570 

1571class PackIndexer(DeltaChainIterator[PackIndexEntry]): 

1572 """Delta chain iterator that yields index entries.""" 

1573 

1574 _compute_crc32 = True 

1575 

1576 def _result(self, unpacked): 

1577 return unpacked.sha(), unpacked.offset, unpacked.crc32 

1578 

1579 

1580class PackInflater(DeltaChainIterator[ShaFile]): 

1581 """Delta chain iterator that yields ShaFile objects.""" 

1582 

1583 def _result(self, unpacked): 

1584 return unpacked.sha_file() 

1585 

1586 

1587class SHA1Reader: 

1588 """Wrapper for file-like object that remembers the SHA1 of its data.""" 

1589 

1590 def __init__(self, f) -> None: 

1591 self.f = f 

1592 self.sha1 = sha1(b"") 

1593 

1594 def read(self, num=None): 

1595 data = self.f.read(num) 

1596 self.sha1.update(data) 

1597 return data 

1598 

1599 def check_sha(self): 

1600 stored = self.f.read(20) 

1601 if stored != self.sha1.digest(): 

1602 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored)) 

1603 

1604 def close(self): 

1605 return self.f.close() 

1606 

1607 def tell(self): 

1608 return self.f.tell() 

1609 

1610 

1611class SHA1Writer: 

1612 """Wrapper for file-like object that remembers the SHA1 of its data.""" 

1613 

1614 def __init__(self, f) -> None: 

1615 self.f = f 

1616 self.length = 0 

1617 self.sha1 = sha1(b"") 

1618 

1619 def write(self, data): 

1620 self.sha1.update(data) 

1621 self.f.write(data) 

1622 self.length += len(data) 

1623 

1624 def write_sha(self): 

1625 sha = self.sha1.digest() 

1626 assert len(sha) == 20 

1627 self.f.write(sha) 

1628 self.length += len(sha) 

1629 return sha 

1630 

1631 def close(self): 

1632 sha = self.write_sha() 

1633 self.f.close() 

1634 return sha 

1635 

1636 def offset(self): 

1637 return self.length 

1638 

1639 def tell(self): 

1640 return self.f.tell() 

1641 
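# Usage sketch: SHA1Writer tracks the SHA-1 of everything written and
# write_sha() appends that digest, which is how pack trailers are produced.
# The helper name is hypothetical.
def _example_sha1_writer():
    out = BytesIO()
    writer = SHA1Writer(out)
    writer.write(b"PACK")
    digest = writer.write_sha()
    assert out.getvalue() == b"PACK" + digest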

1642 

1643def pack_object_header(type_num, delta_base, size): 

1644 """Create a pack object header for the given object info. 

1645 

1646 Args: 

1647 type_num: Numeric type of the object. 

1648 delta_base: Delta base offset or ref, or None for whole objects. 

1649 size: Uncompressed object size. 

1650 Returns: A header for a packed object. 

1651 """ 

1652 header = [] 

1653 c = (type_num << 4) | (size & 15) 

1654 size >>= 4 

1655 while size: 

1656 header.append(c | 0x80) 

1657 c = size & 0x7F 

1658 size >>= 7 

1659 header.append(c) 

1660 if type_num == OFS_DELTA: 

1661 ret = [delta_base & 0x7F] 

1662 delta_base >>= 7 

1663 while delta_base: 

1664 delta_base -= 1 

1665 ret.insert(0, 0x80 | (delta_base & 0x7F)) 

1666 delta_base >>= 7 

1667 header.extend(ret) 

1668 elif type_num == REF_DELTA: 

1669 assert len(delta_base) == 20 

1670 header += delta_base 

1671 return bytearray(header) 

1672 
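# Sketch: for a non-delta blob (type 3) of size 100 the header is the same
# two MSB-continued bytes decoded in _example_take_msb_bytes above.
def _example_pack_object_header():
    assert bytes(pack_object_header(3, None, 100)) == b"\xb4\x06"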

1673 

1674def pack_object_chunks(type, object, compression_level=-1): 

1675 """Generate chunks for a pack object. 

1676 

1677 Args: 

1678 type: Numeric type of the object 

1679 object: Object to write 

1680 compression_level: the zlib compression level 

1681 Returns: Chunks 

1682 """ 

1683 if type in DELTA_TYPES: 

1684 delta_base, object = object 

1685 else: 

1686 delta_base = None 

1687 if isinstance(object, bytes): 

1688 object = [object] 

1689 yield bytes(pack_object_header(type, delta_base, sum(map(len, object)))) 

1690 compressor = zlib.compressobj(level=compression_level) 

1691 for data in object: 

1692 yield compressor.compress(data) 

1693 yield compressor.flush() 

1694 

1695 

1696def write_pack_object(write, type, object, sha=None, compression_level=-1): 

1697 """Write pack object to a file. 

1698 

1699 Args: 

1700 write: Write function to use 

1701 type: Numeric type of the object 

1702 object: Object to write 

1703 compression_level: the zlib compression level 

1704 Returns: Tuple with offset at which the object was written, and crc32 

1705 """ 

1706 crc32 = 0 

1707 for chunk in pack_object_chunks(type, object, compression_level=compression_level): 

1708 write(chunk) 

1709 if sha is not None: 

1710 sha.update(chunk) 

1711 crc32 = binascii.crc32(chunk, crc32) 

1712 return crc32 & 0xFFFFFFFF 

1713 

1714 

1715def write_pack( 

1716 filename, 

1717 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]], 

1718 *, 

1719 deltify: Optional[bool] = None, 

1720 delta_window_size: Optional[int] = None, 

1721 compression_level: int = -1, 

1722): 

1723 """Write a new pack data file. 

1724 

1725 Args: 

1726 filename: Path to the new pack file (without .pack extension) 

1727 objects: Sequence of objects to write, either ShaFile instances or 

1728 (object, path) tuples 

1729 delta_window_size: Delta window size 

1730 deltify: Whether to deltify pack objects 

1731 compression_level: the zlib compression level 

1732 Returns: Tuple with checksum of pack file and index file 

1733 """ 

1734 with GitFile(filename + ".pack", "wb") as f: 

1735 entries, data_sum = write_pack_objects( 

1736 f.write, 

1737 objects, 

1738 delta_window_size=delta_window_size, 

1739 deltify=deltify, 

1740 compression_level=compression_level, 

1741 ) 

1742 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()]) 

1743 with GitFile(filename + ".idx", "wb") as f: 

1744 return data_sum, write_pack_index_v2(f, entries, data_sum) 

1745 

1746 
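# A minimal usage sketch for write_pack, assuming `basename` is a writable path
# prefix (e.g. "/tmp/demo"); it produces basename.pack and basename.idx. The
# helper and its argument are illustrative, not part of the API.
def _example_write_pack(basename):
    from dulwich.objects import Blob
    blob = Blob.from_string(b"example data")
    data_sum, idx_sum = write_pack(basename, [blob])
    return data_sum, idx_sum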

1747def pack_header_chunks(num_objects): 

1748 """Yield chunks for a pack header.""" 

1749 yield b"PACK" # Pack header 

1750 yield struct.pack(b">L", 2) # Pack version 

1751 yield struct.pack(b">L", num_objects) # Number of objects in pack 

1752 

1753 

1754def write_pack_header(write, num_objects): 

1755 """Write a pack header for the given number of objects.""" 

1756 if hasattr(write, "write"): 

1757 write = write.write 

1758 warnings.warn( 

1759 "write_pack_header() now takes a write rather than file argument", 

1760 DeprecationWarning, 

1761 stacklevel=2, 

1762 ) 

1763 for chunk in pack_header_chunks(num_objects): 

1764 write(chunk) 

1765 

1766 
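# Illustrative sketch (hypothetical helper): a version-2 pack starts with the
# 12-byte header b"PACK", the version number, and the object count.
def _example_pack_header():
    hdr = b"".join(pack_header_chunks(3))
    assert hdr == b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 3)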

1767def find_reusable_deltas( 

1768 container: PackedObjectContainer, 

1769 object_ids: Set[bytes], 

1770 *, 

1771 other_haves: Optional[Set[bytes]] = None, 

1772 progress=None, 

1773) -> Iterator[UnpackedObject]: 

1774 if other_haves is None: 

1775 other_haves = set() 

1776 reused = 0 

1777 for i, unpacked in enumerate( 

1778 container.iter_unpacked_subset( 

1779 object_ids, allow_missing=True, convert_ofs_delta=True 

1780 ) 

1781 ): 

1782 if progress is not None and i % 1000 == 0: 

1783 progress( 

1784 ("checking for reusable deltas: %d/%d\r" % (i, len(object_ids))).encode( 

1785 "utf-8" 

1786 ) 

1787 ) 

1788 if unpacked.pack_type_num == REF_DELTA: 

1789 hexsha = sha_to_hex(unpacked.delta_base) 

1790 if hexsha in object_ids or hexsha in other_haves: 

1791 yield unpacked 

1792 reused += 1 

1793 if progress is not None: 

1794 progress(("found %d deltas to reuse\n" % (reused,)).encode("utf-8")) 

1795 

1796 

1797def deltify_pack_objects( 

1798 objects: Union[Iterator[ShaFile], Iterator[Tuple[ShaFile, Optional[bytes]]]], 

1799 *, 

1800 window_size: Optional[int] = None, 

1801 progress=None, 

1802) -> Iterator[UnpackedObject]: 

1803 """Generate deltas for pack objects. 

1804 

1805 Args: 

1806 objects: An iterable of ShaFile objects, or (object, path) tuples, to deltify. 

1807 window_size: Window size; None for default 

1808 Returns: Iterator over UnpackedObject entries; delta_base is None 

1809 for full text entries 

1810 """ 

1811 

1812 def objects_with_hints(): 

1813 for e in objects: 

1814 if isinstance(e, ShaFile): 

1815 yield (e, (e.type_num, None)) 

1816 else: 

1817 yield (e[0], (e[0].type_num, e[1])) 

1818 

1819 yield from deltas_from_sorted_objects( 

1820 sort_objects_for_delta(objects_with_hints()), 

1821 window_size=window_size, 

1822 progress=progress, 

1823 ) 

1824 

1825 

1826def sort_objects_for_delta( 

1827 objects: Union[Iterator[ShaFile], Iterator[Tuple[ShaFile, Optional[PackHint]]]], 

1828) -> Iterator[ShaFile]: 

1829 magic = [] 

1830 for entry in objects: 

1831 if isinstance(entry, tuple): 

1832 obj, hint = entry 

1833 if hint is None: 

1834 type_num = None 

1835 path = None 

1836 else: 

1837 (type_num, path) = hint 

1838 else: 

1839 obj, type_num, path = entry, None, None # bare object, no hint given 

1840 magic.append((type_num, path, -obj.raw_length(), obj)) 

1841 # Build a list of objects ordered by the magic Linus heuristic 

1842 # This helps us find good objects to diff against us 

1843 magic.sort() 

1844 return (x[3] for x in magic) 

1845 

1846 

1847def deltas_from_sorted_objects( 

1848 objects, window_size: Optional[int] = None, progress=None 

1849): 

1850 # TODO(jelmer): Use threads 

1851 if window_size is None: 

1852 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE 

1853 

1854 possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque() 

1855 for i, o in enumerate(objects): 

1856 if progress is not None and i % 1000 == 0: 

1857 progress(("generating deltas: %d\r" % (i,)).encode("utf-8")) 

1858 raw = o.as_raw_chunks() 

1859 winner = raw 

1860 winner_len = sum(map(len, winner)) 

1861 winner_base = None 

1862 for base_id, base_type_num, base in possible_bases: 

1863 if base_type_num != o.type_num: 

1864 continue 

1865 delta_len = 0 

1866 delta = [] 

1867 for chunk in create_delta(base, raw): 

1868 delta_len += len(chunk) 

1869 if delta_len >= winner_len: 

1870 break 

1871 delta.append(chunk) 

1872 else: 

1873 winner_base = base_id 

1874 winner = delta 

1875 winner_len = sum(map(len, winner)) 

1876 yield UnpackedObject( 

1877 o.type_num, 

1878 sha=o.sha().digest(), 

1879 delta_base=winner_base, 

1880 decomp_len=winner_len, 

1881 decomp_chunks=winner, 

1882 ) 

1883 possible_bases.appendleft((o.sha().digest(), o.type_num, raw)) 

1884 while len(possible_bases) > window_size: 

1885 possible_bases.pop() 

1886 

1887 
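# Illustrative sketch of delta generation between two similar blobs; the helper
# is hypothetical and the exact outcome depends on the SequenceMatcher, but the
# second object is normally stored as a delta against the first when that delta
# is smaller than its full text.
def _example_deltas_from_sorted_objects():
    from dulwich.objects import Blob
    base = Blob.from_string(b"a" * 64)
    target = Blob.from_string(b"a" * 64 + b"b")
    unpacked = list(deltas_from_sorted_objects(iter([base, target])))
    assert unpacked[0].delta_base is None
    assert unpacked[1].delta_base in (None, base.sha().digest())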

1888def pack_objects_to_data( 

1889 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]], 

1890 *, 

1891 deltify: Optional[bool] = None, 

1892 delta_window_size: Optional[int] = None, 

1893 ofs_delta: bool = True, 

1894 progress=None, 

1895) -> Tuple[int, Iterator[UnpackedObject]]: 

1896 """Create pack data from objects. 

1897 

1898 Args: 

1899 objects: Pack objects 

1900 Returns: Tuple with the number of objects and an iterator over UnpackedObject entries 

1901 """ 

1902 # TODO(jelmer): support deltaifying 

1903 count = len(objects) 

1904 if deltify is None: 

1905 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too 

1906 # slow at the moment. 

1907 deltify = False 

1908 if deltify: 

1909 return ( 

1910 count, 

1911 deltify_pack_objects( 

1912 iter(objects), # type: ignore 

1913 window_size=delta_window_size, 

1914 progress=progress, 

1915 ), 

1916 ) 

1917 else: 

1918 

1919 def iter_without_path(): 

1920 for o in objects: 

1921 if isinstance(o, tuple): 

1922 yield full_unpacked_object(o[0]) 

1923 else: 

1924 yield full_unpacked_object(o) 

1925 

1926 return (count, iter_without_path()) 

1927 

1928 

1929def generate_unpacked_objects( 

1930 container: PackedObjectContainer, 

1931 object_ids: Sequence[Tuple[ObjectID, Optional[PackHint]]], 

1932 delta_window_size: Optional[int] = None, 

1933 deltify: Optional[bool] = None, 

1934 reuse_deltas: bool = True, 

1935 ofs_delta: bool = True, 

1936 other_haves: Optional[Set[bytes]] = None, 

1937 progress=None, 

1938) -> Iterator[UnpackedObject]: 

1939 """Create pack data from objects. 

1940 

1941 Args: 

1942 container: PackedObjectContainer to read from; object_ids: ids (with hints) to pack 

1943 Returns: Iterator over UnpackedObject entries 

1944 """ 

1945 todo = dict(object_ids) 

1946 if reuse_deltas: 

1947 for unpack in find_reusable_deltas( 

1948 container, set(todo), other_haves=other_haves, progress=progress 

1949 ): 

1950 del todo[sha_to_hex(unpack.sha())] 

1951 yield unpack 

1952 if deltify is None: 

1953 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too 

1954 # slow at the moment. 

1955 deltify = False 

1956 if deltify: 

1957 objects_to_delta = container.iterobjects_subset( 

1958 todo.keys(), allow_missing=False 

1959 ) 

1960 yield from deltas_from_sorted_objects( 

1961 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta), 

1962 window_size=delta_window_size, 

1963 progress=progress, 

1964 ) 

1965 else: 

1966 for oid in todo: 

1967 yield full_unpacked_object(container[oid]) 

1968 

1969 

1970def full_unpacked_object(o: ShaFile) -> UnpackedObject: 

1971 return UnpackedObject( 

1972 o.type_num, 

1973 delta_base=None, 

1974 crc32=None, 

1975 decomp_chunks=o.as_raw_chunks(), 

1976 sha=o.sha().digest(), 

1977 ) 

1978 

1979 

1980def write_pack_from_container( 

1981 write, 

1982 container: PackedObjectContainer, 

1983 object_ids: Sequence[Tuple[ObjectID, Optional[PackHint]]], 

1984 delta_window_size: Optional[int] = None, 

1985 deltify: Optional[bool] = None, 

1986 reuse_deltas: bool = True, 

1987 compression_level: int = -1, 

1988 other_haves: Optional[Set[bytes]] = None, 

1989): 

1990 """Write a new pack data file. 

1991 

1992 Args: 

1993 write: write function to use 

1994 container: PackedObjectContainer 

1995 object_ids: Sequence of (object_id, hint) tuples to write 

1996 delta_window_size: Sliding window size for searching for deltas; 

1997 Set to None for default window size. 

1998 deltify: Whether to deltify objects 

1999 compression_level: the zlib compression level to use 

2000 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2001 """ 

2002 pack_contents_count = len(object_ids) 

2003 pack_contents = generate_unpacked_objects( 

2004 container, 

2005 object_ids, 

2006 delta_window_size=delta_window_size, 

2007 deltify=deltify, 

2008 reuse_deltas=reuse_deltas, 

2009 other_haves=other_haves, 

2010 ) 

2011 

2012 return write_pack_data( 

2013 write, 

2014 pack_contents, 

2015 num_records=pack_contents_count, 

2016 compression_level=compression_level, 

2017 ) 

2018 

2019 

2020def write_pack_objects( 

2021 write, 

2022 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]], 

2023 *, 

2024 delta_window_size: Optional[int] = None, 

2025 deltify: Optional[bool] = None, 

2026 compression_level: int = -1, 

2027): 

2028 """Write a new pack data file. 

2029 

2030 Args: 

2031 write: write function to use 

2032 objects: Sequence of objects to write, either ShaFile instances or (object, path) tuples 

2033 delta_window_size: Sliding window size for searching for deltas; 

2034 Set to None for default window size. 

2035 deltify: Whether to deltify objects 

2036 compression_level: the zlib compression level to use 

2037 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2038 """ 

2039 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify) 

2040 

2041 return write_pack_data( 

2042 write, 

2043 pack_contents, 

2044 num_records=pack_contents_count, 

2045 compression_level=compression_level, 

2046 ) 

2047 

2048 

2049class PackChunkGenerator: 

2050 def __init__( 

2051 self, 

2052 num_records=None, 

2053 records=None, 

2054 progress=None, 

2055 compression_level=-1, 

2056 reuse_compressed=True, 

2057 ) -> None: 

2058 self.cs = sha1(b"") 

2059 self.entries: Dict[Union[int, bytes], Tuple[int, int]] = {} 

2060 self._it = self._pack_data_chunks( 

2061 num_records=num_records, 

2062 records=records, 

2063 progress=progress, 

2064 compression_level=compression_level, 

2065 reuse_compressed=reuse_compressed, 

2066 ) 

2067 

2068 def sha1digest(self): 

2069 return self.cs.digest() 

2070 

2071 def __iter__(self): 

2072 return self._it 

2073 

2074 def _pack_data_chunks( 

2075 self, 

2076 records: Iterator[UnpackedObject], 

2077 *, 

2078 num_records=None, 

2079 progress=None, 

2080 compression_level: int = -1, 

2081 reuse_compressed: bool = True, 

2082 ) -> Iterator[bytes]: 

2083 """Iterate pack data file chunks. 

2084 

2085 Args: 

2086 records: Iterator over UnpackedObject 

2087 num_records: Number of records (defaults to len(records) if not specified) 

2088 progress: Function to report progress to 

2089 compression_level: the zlib compression level 

2090 Returns: Iterator over pack data chunks; object entries are recorded in self.entries 

2091 """ 

2092 # Write the pack 

2093 if num_records is None: 

2094 num_records = len(records) # type: ignore 

2095 offset = 0 

2096 for chunk in pack_header_chunks(num_records): 

2097 yield chunk 

2098 self.cs.update(chunk) 

2099 offset += len(chunk) 

2100 actual_num_records = 0 

2101 for i, unpacked in enumerate(records): 

2102 type_num = unpacked.pack_type_num 

2103 if progress is not None and i % 1000 == 0: 

2104 progress( 

2105 ("writing pack data: %d/%d\r" % (i, num_records)).encode("ascii") 

2106 ) 

2107 raw: Union[List[bytes], Tuple[int, List[bytes]], Tuple[bytes, List[bytes]]] 

2108 if unpacked.delta_base is not None: 

2109 try: 

2110 base_offset, base_crc32 = self.entries[unpacked.delta_base] 

2111 except KeyError: 

2112 type_num = REF_DELTA 

2113 assert isinstance(unpacked.delta_base, bytes) 

2114 raw = (unpacked.delta_base, unpacked.decomp_chunks) 

2115 else: 

2116 type_num = OFS_DELTA 

2117 raw = (offset - base_offset, unpacked.decomp_chunks) 

2118 else: 

2119 raw = unpacked.decomp_chunks 

2120 if unpacked.comp_chunks is not None and reuse_compressed: 

2121 chunks = unpacked.comp_chunks 

2122 else: 

2123 chunks = pack_object_chunks( 

2124 type_num, raw, compression_level=compression_level 

2125 ) 

2126 crc32 = 0 

2127 object_size = 0 

2128 for chunk in chunks: 

2129 yield chunk 

2130 crc32 = binascii.crc32(chunk, crc32) 

2131 self.cs.update(chunk) 

2132 object_size += len(chunk) 

2133 actual_num_records += 1 

2134 self.entries[unpacked.sha()] = (offset, crc32) 

2135 offset += object_size 

2136 if actual_num_records != num_records: 

2137 raise AssertionError( 

2138 "actual records written differs: %d != %d" 

2139 % (actual_num_records, num_records) 

2140 ) 

2141 

2142 yield self.cs.digest() 

2143 

2144 

2145def write_pack_data( 

2146 write, 

2147 records: Iterator[UnpackedObject], 

2148 *, 

2149 num_records=None, 

2150 progress=None, 

2151 compression_level=-1, 

2152): 

2153 """Write a new pack data file. 

2154 

2155 Args: 

2156 write: Write function to use 

2157 num_records: Number of records (defaults to len(records) if None) 

2158 records: Iterator over UnpackedObject records to write 

2159 progress: Function to report progress to 

2160 compression_level: the zlib compression level 

2161 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2162 """ 

2163 chunk_generator = PackChunkGenerator( 

2164 num_records=num_records, 

2165 records=records, 

2166 progress=progress, 

2167 compression_level=compression_level, 

2168 ) 

2169 for chunk in chunk_generator: 

2170 write(chunk) 

2171 return chunk_generator.entries, chunk_generator.sha1digest() 

2172 

2173 
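# Illustrative sketch (hypothetical helper): write_pack_data consumes
# UnpackedObject records (here built with full_unpacked_object) and returns the
# per-object entries plus the pack checksum.
def _example_write_pack_data():
    from dulwich.objects import Blob
    blob = Blob.from_string(b"hello")
    buf = BytesIO()
    entries, data_sum = write_pack_data(
        buf.write, iter([full_unpacked_object(blob)]), num_records=1
    )
    assert blob.sha().digest() in entries
    assert len(data_sum) == 20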

2174def write_pack_index_v1(f, entries, pack_checksum): 

2175 """Write a new pack index file. 

2176 

2177 Args: 

2178 f: A file-like object to write to 

2179 entries: List of tuples with object name (sha), offset_in_pack, 

2180 and crc32_checksum. 

2181 pack_checksum: Checksum of the pack file. 

2182 Returns: The SHA of the written index file 

2183 """ 

2184 f = SHA1Writer(f) 

2185 fan_out_table = defaultdict(lambda: 0) 

2186 for name, offset, entry_checksum in entries: 

2187 fan_out_table[ord(name[:1])] += 1 

2188 # Fan-out table 

2189 for i in range(0x100): 

2190 f.write(struct.pack(">L", fan_out_table[i])) 

2191 fan_out_table[i + 1] += fan_out_table[i] 

2192 for name, offset, entry_checksum in entries: 

2193 if not (offset <= 0xFFFFFFFF): 

2194 raise TypeError("pack format 1 only supports offsets < 4 GiB") 

2195 f.write(struct.pack(">L20s", offset, name)) 

2196 assert len(pack_checksum) == 20 

2197 f.write(pack_checksum) 

2198 return f.write_sha() 

2199 

2200 

2201def _delta_encode_size(size) -> bytes: 

2202 ret = bytearray() 

2203 c = size & 0x7F 

2204 size >>= 7 

2205 while size: 

2206 ret.append(c | 0x80) 

2207 c = size & 0x7F 

2208 size >>= 7 

2209 ret.append(c) 

2210 return bytes(ret) 

2211 

2212 
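# Illustrative sketch (hypothetical helper): sizes in the delta header are
# encoded as little-endian base-128 varints, 7 bits per byte with the high bit
# as a continuation flag.
def _example_delta_encode_size():
    assert _delta_encode_size(10) == bytes([10])
    assert _delta_encode_size(300) == bytes([0xAC, 0x02])  # 300 = 44 + 2 * 128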

2213# The length of delta compression copy operations in version 2 packs is limited 

2214# to 64K. To copy more, we use several copy operations. Version 3 packs allow 

2215# 24-bit lengths in copy operations, but we always make version 2 packs. 

2216_MAX_COPY_LEN = 0xFFFF 

2217 

2218 

2219def _encode_copy_operation(start, length): 

2220 scratch = bytearray([0x80]) 

2221 for i in range(4): 

2222 if start & 0xFF << i * 8: 

2223 scratch.append((start >> i * 8) & 0xFF) 

2224 scratch[0] |= 1 << i 

2225 for i in range(2): 

2226 if length & 0xFF << i * 8: 

2227 scratch.append((length >> i * 8) & 0xFF) 

2228 scratch[0] |= 1 << (4 + i) 

2229 return bytes(scratch) 

2230 

2231 
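# Illustrative sketch (hypothetical helper): a copy opcode sets bit 7, then
# flags which offset and length bytes follow; copying 0x1234 bytes from offset
# 0x10 needs one offset byte and two length bytes.
def _example_encode_copy_operation():
    op = _encode_copy_operation(0x10, 0x1234)
    assert op == bytes([0x80 | 0x01 | 0x10 | 0x20, 0x10, 0x34, 0x12])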

2232def create_delta(base_buf, target_buf): 

2233 """Use python difflib to work out how to transform base_buf to target_buf. 

2234 

2235 Args: 

2236 base_buf: Base buffer 

2237 target_buf: Target buffer 

2238 """ 

2239 if isinstance(base_buf, list): 

2240 base_buf = b"".join(base_buf) 

2241 if isinstance(target_buf, list): 

2242 target_buf = b"".join(target_buf) 

2243 assert isinstance(base_buf, bytes) 

2244 assert isinstance(target_buf, bytes) 

2245 # write delta header 

2246 yield _delta_encode_size(len(base_buf)) 

2247 yield _delta_encode_size(len(target_buf)) 

2248 # write out delta opcodes 

2249 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf) 

2250 for opcode, i1, i2, j1, j2 in seq.get_opcodes(): 

2251 # Git patch opcodes don't care about deletes! 

2252 # if opcode == 'replace' or opcode == 'delete': 

2253 # pass 

2254 if opcode == "equal": 

2255 # If they are equal, unpacker will use data from base_buf 

2256 # Write out an opcode that says what range to use 

2257 copy_start = i1 

2258 copy_len = i2 - i1 

2259 while copy_len > 0: 

2260 to_copy = min(copy_len, _MAX_COPY_LEN) 

2261 yield _encode_copy_operation(copy_start, to_copy) 

2262 copy_start += to_copy 

2263 copy_len -= to_copy 

2264 if opcode == "replace" or opcode == "insert": 

2265 # If we are replacing a range or adding one, then we just 

2266 # output it to the stream (prefixed by its size) 

2267 s = j2 - j1 

2268 o = j1 

2269 while s > 127: 

2270 yield bytes([127]) 

2271 yield memoryview(target_buf)[o : o + 127] 

2272 s -= 127 

2273 o += 127 

2274 yield bytes([s]) 

2275 yield memoryview(target_buf)[o : o + s] 

2276 

2277 

2278def apply_delta(src_buf, delta): 

2279 """Based on the similar function in git's patch-delta.c. 

2280 

2281 Args: 

2282 src_buf: Source buffer 

2283 delta: Delta instructions 

2284 """ 

2285 if not isinstance(src_buf, bytes): 

2286 src_buf = b"".join(src_buf) 

2287 if not isinstance(delta, bytes): 

2288 delta = b"".join(delta) 

2289 out = [] 

2290 index = 0 

2291 delta_length = len(delta) 

2292 

2293 def get_delta_header_size(delta, index): 

2294 size = 0 

2295 i = 0 

2296 while delta: 

2297 cmd = ord(delta[index : index + 1]) 

2298 index += 1 

2299 size |= (cmd & ~0x80) << i 

2300 i += 7 

2301 if not cmd & 0x80: 

2302 break 

2303 return size, index 

2304 

2305 src_size, index = get_delta_header_size(delta, index) 

2306 dest_size, index = get_delta_header_size(delta, index) 

2307 assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf)) 

2308 while index < delta_length: 

2309 cmd = ord(delta[index : index + 1]) 

2310 index += 1 

2311 if cmd & 0x80: 

2312 cp_off = 0 

2313 for i in range(4): 

2314 if cmd & (1 << i): 

2315 x = ord(delta[index : index + 1]) 

2316 index += 1 

2317 cp_off |= x << (i * 8) 

2318 cp_size = 0 

2319 # Version 3 packs can contain copy sizes larger than 64K. 

2320 for i in range(3): 

2321 if cmd & (1 << (4 + i)): 

2322 x = ord(delta[index : index + 1]) 

2323 index += 1 

2324 cp_size |= x << (i * 8) 

2325 if cp_size == 0: 

2326 cp_size = 0x10000 

2327 if ( 

2328 cp_off + cp_size < cp_size 

2329 or cp_off + cp_size > src_size 

2330 or cp_size > dest_size 

2331 ): 

2332 break 

2333 out.append(src_buf[cp_off : cp_off + cp_size]) 

2334 elif cmd != 0: 

2335 out.append(delta[index : index + cmd]) 

2336 index += cmd 

2337 else: 

2338 raise ApplyDeltaError("Invalid opcode 0") 

2339 

2340 if index != delta_length: 

2341 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}") 

2342 

2343 if dest_size != chunks_length(out): 

2344 raise ApplyDeltaError("dest size incorrect") 

2345 

2346 return out 

2347 

2348 
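# Illustrative round-trip sketch (hypothetical helper): applying a delta
# produced by create_delta to the base buffer reconstructs the target.
def _example_delta_roundtrip():
    base = b"the quick brown fox jumps over the lazy dog"
    target = b"the quick brown fox leaps over the lazy cat"
    delta = b"".join(create_delta(base, target))
    assert b"".join(apply_delta(base, delta)) == target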

2349def write_pack_index_v2( 

2350 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes 

2351) -> bytes: 

2352 """Write a new pack index file. 

2353 

2354 Args: 

2355 f: File-like object to write to 

2356 entries: List of tuples with object name (sha), offset_in_pack, and 

2357 crc32_checksum. 

2358 pack_checksum: Checksum of the pack file. 

2359 Returns: The SHA of the index file written 

2360 """ 

2361 f = SHA1Writer(f) 

2362 f.write(b"\377tOc") # Magic! 

2363 f.write(struct.pack(">L", 2)) 

2364 fan_out_table: Dict[int, int] = defaultdict(lambda: 0) 

2365 for name, offset, entry_checksum in entries: 

2366 fan_out_table[ord(name[:1])] += 1 

2367 # Fan-out table 

2368 largetable: List[int] = [] 

2369 for i in range(0x100): 

2370 f.write(struct.pack(b">L", fan_out_table[i])) 

2371 fan_out_table[i + 1] += fan_out_table[i] 

2372 for name, offset, entry_checksum in entries: 

2373 f.write(name) 

2374 for name, offset, entry_checksum in entries: 

2375 f.write(struct.pack(b">L", entry_checksum)) 

2376 for name, offset, entry_checksum in entries: 

2377 if offset < 2**31: 

2378 f.write(struct.pack(b">L", offset)) 

2379 else: 

2380 f.write(struct.pack(b">L", 2**31 + len(largetable))) 

2381 largetable.append(offset) 

2382 for offset in largetable: 

2383 f.write(struct.pack(b">Q", offset)) 

2384 assert len(pack_checksum) == 20 

2385 f.write(pack_checksum) 

2386 return f.write_sha() 

2387 

2388 

2389write_pack_index = write_pack_index_v2 

2390 
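# Illustrative sketch (hypothetical helper): a v2 index starts with the \377tOc
# magic and version 2, followed by the fan-out table; entries must be sorted by
# SHA.
def _example_write_pack_index_v2():
    buf = BytesIO()
    entries = [(bytes([i]) * 20, i * 100, 0) for i in range(3)]
    write_pack_index_v2(buf, entries, b"\x00" * 20)
    assert buf.getvalue()[:8] == b"\377tOc" + struct.pack(">L", 2)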

2391 

2392class Pack: 

2393 """A Git pack object.""" 

2394 

2395 _data_load: Optional[Callable[[], PackData]] 

2396 _idx_load: Optional[Callable[[], PackIndex]] 

2397 

2398 _data: Optional[PackData] 

2399 _idx: Optional[PackIndex] 

2400 

2401 def __init__( 

2402 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None 

2403 ) -> None: 

2404 self._basename = basename 

2405 self._data = None 

2406 self._idx = None 

2407 self._idx_path = self._basename + ".idx" 

2408 self._data_path = self._basename + ".pack" 

2409 self._data_load = lambda: PackData(self._data_path) 

2410 self._idx_load = lambda: load_pack_index(self._idx_path) 

2411 self.resolve_ext_ref = resolve_ext_ref 

2412 

2413 @classmethod 

2414 def from_lazy_objects(cls, data_fn, idx_fn): 

2415 """Create a new pack object from callables to load pack data and 

2416 index objects. 

2417 """ 

2418 ret = cls("") 

2419 ret._data_load = data_fn 

2420 ret._idx_load = idx_fn 

2421 return ret 

2422 

2423 @classmethod 

2424 def from_objects(cls, data, idx): 

2425 """Create a new pack object from pack data and index objects.""" 

2426 ret = cls("") 

2427 ret._data = data 

2428 ret._data_load = None 

2429 ret._idx = idx 

2430 ret._idx_load = None 

2431 ret.check_length_and_checksum() 

2432 return ret 

2433 

2434 def name(self): 

2435 """The SHA over the SHAs of the objects in this pack.""" 

2436 return self.index.objects_sha1() 

2437 

2438 @property 

2439 def data(self) -> PackData: 

2440 """The pack data object being used.""" 

2441 if self._data is None: 

2442 assert self._data_load 

2443 self._data = self._data_load() 

2444 self.check_length_and_checksum() 

2445 return self._data 

2446 

2447 @property 

2448 def index(self) -> PackIndex: 

2449 """The index being used. 

2450 

2451 Note: This may be an in-memory index 

2452 """ 

2453 if self._idx is None: 

2454 assert self._idx_load 

2455 self._idx = self._idx_load() 

2456 return self._idx 

2457 

2458 def close(self): 

2459 if self._data is not None: 

2460 self._data.close() 

2461 if self._idx is not None: 

2462 self._idx.close() 

2463 

2464 def __enter__(self): 

2465 return self 

2466 

2467 def __exit__(self, exc_type, exc_val, exc_tb): 

2468 self.close() 

2469 

2470 def __eq__(self, other): 

2471 return isinstance(self, type(other)) and self.index == other.index 

2472 

2473 def __len__(self) -> int: 

2474 """Number of entries in this pack.""" 

2475 return len(self.index) 

2476 

2477 def __repr__(self) -> str: 

2478 return f"{self.__class__.__name__}({self._basename!r})" 

2479 

2480 def __iter__(self): 

2481 """Iterate over all the sha1s of the objects in this pack.""" 

2482 return iter(self.index) 

2483 

2484 def check_length_and_checksum(self) -> None: 

2485 """Sanity check the length and checksum of the pack index and data.""" 

2486 assert len(self.index) == len( 

2487 self.data 

2488 ), f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)" 

2489 idx_stored_checksum = self.index.get_pack_checksum() 

2490 data_stored_checksum = self.data.get_stored_checksum() 

2491 if idx_stored_checksum != data_stored_checksum: 

2492 raise ChecksumMismatch( 

2493 sha_to_hex(idx_stored_checksum), 

2494 sha_to_hex(data_stored_checksum), 

2495 ) 

2496 

2497 def check(self) -> None: 

2498 """Check the integrity of this pack. 

2499 

2500 Raises: 

2501 ChecksumMismatch: if a checksum for the index or data is wrong 

2502 """ 

2503 self.index.check() 

2504 self.data.check() 

2505 for obj in self.iterobjects(): 

2506 obj.check() 

2507 # TODO: object connectivity checks 

2508 

2509 def get_stored_checksum(self) -> bytes: 

2510 return self.data.get_stored_checksum() 

2511 

2512 def pack_tuples(self): 

2513 return [(o, None) for o in self.iterobjects()] 

2514 

2515 def __contains__(self, sha1: bytes) -> bool: 

2516 """Check whether this pack contains a particular SHA1.""" 

2517 try: 

2518 self.index.object_offset(sha1) 

2519 return True 

2520 except KeyError: 

2521 return False 

2522 

2523 def get_raw(self, sha1: bytes) -> Tuple[int, bytes]: 

2524 offset = self.index.object_offset(sha1) 

2525 obj_type, obj = self.data.get_object_at(offset) 

2526 type_num, chunks = self.resolve_object(offset, obj_type, obj) 

2527 return type_num, b"".join(chunks) 

2528 

2529 def __getitem__(self, sha1: bytes) -> ShaFile: 

2530 """Retrieve the specified SHA1.""" 

2531 type, uncomp = self.get_raw(sha1) 

2532 return ShaFile.from_raw_string(type, uncomp, sha=sha1) 

2533 

2534 def iterobjects(self) -> Iterator[ShaFile]: 

2535 """Iterate over the objects in this pack.""" 

2536 return iter( 

2537 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref) 

2538 ) 

2539 

2540 def iterobjects_subset( 

2541 self, shas: Iterable[ObjectID], *, allow_missing: bool = False 

2542 ) -> Iterator[ShaFile]: 

2543 return ( 

2544 uo 

2545 for uo in PackInflater.for_pack_subset( 

2546 self, 

2547 shas, 

2548 allow_missing=allow_missing, 

2549 resolve_ext_ref=self.resolve_ext_ref, 

2550 ) 

2551 if uo.id in shas 

2552 ) 

2553 

2554 def iter_unpacked_subset( 

2555 self, 

2556 shas: Iterable[ObjectID], 

2557 *, 

2558 include_comp: bool = False, 

2559 allow_missing: bool = False, 

2560 convert_ofs_delta: bool = False, 

2561 ) -> Iterator[UnpackedObject]: 

2562 ofs_pending: Dict[int, List[UnpackedObject]] = defaultdict(list) 

2563 ofs: Dict[bytes, int] = {} 

2564 todo = set(shas) 

2565 for unpacked in self.iter_unpacked(include_comp=include_comp): 

2566 sha = unpacked.sha() 

2567 ofs[unpacked.offset] = sha 

2568 hexsha = sha_to_hex(sha) 

2569 if hexsha in todo: 

2570 if unpacked.pack_type_num == OFS_DELTA: 

2571 assert isinstance(unpacked.delta_base, int) 

2572 base_offset = unpacked.offset - unpacked.delta_base 

2573 try: 

2574 unpacked.delta_base = ofs[base_offset] 

2575 except KeyError: 

2576 ofs_pending[base_offset].append(unpacked) 

2577 continue 

2578 else: 

2579 unpacked.pack_type_num = REF_DELTA 

2580 yield unpacked 

2581 todo.remove(hexsha) 

2582 for child in ofs_pending.pop(unpacked.offset, []): 

2583 child.pack_type_num = REF_DELTA 

2584 child.delta_base = sha 

2585 yield child 

2586 assert not ofs_pending 

2587 if not allow_missing and todo: 

2588 raise UnresolvedDeltas(todo) 

2589 

2590 def iter_unpacked(self, include_comp=False): 

2591 ofs_to_entries = { 

2592 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries() 

2593 } 

2594 for unpacked in self.data.iter_unpacked(include_comp=include_comp): 

2595 (sha, crc32) = ofs_to_entries[unpacked.offset] 

2596 unpacked._sha = sha 

2597 unpacked.crc32 = crc32 

2598 yield unpacked 

2599 

2600 def keep(self, msg: Optional[bytes] = None) -> str: 

2601 """Add a .keep file for the pack, preventing git from garbage collecting it. 

2602 

2603 Args: 

2604 msg: A message written inside the .keep file; can be used later 

2605 to determine whether or not a .keep file is obsolete. 

2606 Returns: The path of the .keep file, as a string. 

2607 """ 

2608 keepfile_name = f"{self._basename}.keep" 

2609 with GitFile(keepfile_name, "wb") as keepfile: 

2610 if msg: 

2611 keepfile.write(msg) 

2612 keepfile.write(b"\n") 

2613 return keepfile_name 

2614 

2615 def get_ref(self, sha: bytes) -> Tuple[Optional[int], int, OldUnpackedObject]: 

2616 """Get the object for a ref SHA, only looking in this pack.""" 

2617 # TODO: cache these results 

2618 try: 

2619 offset = self.index.object_offset(sha) 

2620 except KeyError: 

2621 offset = None 

2622 if offset: 

2623 type, obj = self.data.get_object_at(offset) 

2624 elif self.resolve_ext_ref: 

2625 type, obj = self.resolve_ext_ref(sha) 

2626 else: 

2627 raise KeyError(sha) 

2628 return offset, type, obj 

2629 

2630 def resolve_object( 

2631 self, offset: int, type: int, obj, get_ref=None 

2632 ) -> Tuple[int, Iterable[bytes]]: 

2633 """Resolve an object, possibly resolving deltas when necessary. 

2634 

2635 Returns: Tuple with object type and contents. 

2636 """ 

2637 # Walk down the delta chain, building a stack of deltas to reach 

2638 # the requested object. 

2639 base_offset = offset 

2640 base_type = type 

2641 base_obj = obj 

2642 delta_stack = [] 

2643 while base_type in DELTA_TYPES: 

2644 prev_offset = base_offset 

2645 if get_ref is None: 

2646 get_ref = self.get_ref 

2647 if base_type == OFS_DELTA: 

2648 (delta_offset, delta) = base_obj 

2649 # TODO: clean up asserts and replace with nicer error messages 

2650 base_offset = base_offset - delta_offset 

2651 base_type, base_obj = self.data.get_object_at(base_offset) 

2652 assert isinstance(base_type, int) 

2653 elif base_type == REF_DELTA: 

2654 (basename, delta) = base_obj 

2655 assert isinstance(basename, bytes) and len(basename) == 20 

2656 base_offset, base_type, base_obj = get_ref(basename) 

2657 assert isinstance(base_type, int) 

2658 if base_offset == prev_offset: # object is based on itself 

2659 raise UnresolvedDeltas(sha_to_hex(basename)) 

2660 delta_stack.append((prev_offset, base_type, delta)) 

2661 

2662 # Now grab the base object (mustn't be a delta) and apply the 

2663 # deltas all the way up the stack. 

2664 chunks = base_obj 

2665 for prev_offset, delta_type, delta in reversed(delta_stack): 

2666 chunks = apply_delta(chunks, delta) 

2667 # TODO(dborowitz): This can result in poor performance if 

2668 # large base objects are separated from deltas in the pack. 

2669 # We should reorganize so that we apply deltas to all 

2670 # objects in a chain one after the other to optimize cache 

2671 # performance. 

2672 if prev_offset is not None: 

2673 self.data._offset_cache[prev_offset] = base_type, chunks 

2674 return base_type, chunks 

2675 

2676 def entries( 

2677 self, progress: Optional[ProgressFn] = None 

2678 ) -> Iterator[PackIndexEntry]: 

2679 """Yield entries summarizing the contents of this pack. 

2680 

2681 Args: 

2682 progress: Progress function, called with current and total 

2683 object count. 

2684 Returns: iterator of tuples with (sha, offset, crc32) 

2685 """ 

2686 return self.data.iterentries( 

2687 progress=progress, resolve_ext_ref=self.resolve_ext_ref 

2688 ) 

2689 

2690 def sorted_entries( 

2691 self, progress: Optional[ProgressFn] = None 

2692 ) -> Iterator[PackIndexEntry]: 

2693 """Return entries in this pack, sorted by SHA. 

2694 

2695 Args: 

2696 progress: Progress function, called with current and total 

2697 object count 

2698 Returns: Iterator of tuples with (sha, offset, crc32) 

2699 """ 

2700 return self.data.sorted_entries( 

2701 progress=progress, resolve_ext_ref=self.resolve_ext_ref 

2702 ) 

2703 

2704 def get_unpacked_object( 

2705 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True 

2706 ) -> UnpackedObject: 

2707 """Get the unpacked object for a sha. 

2708 

2709 Args: 

2710 sha: SHA of object to fetch 

2711 include_comp: Whether to include compression data in UnpackedObject 

2712 """ 

2713 offset = self.index.object_offset(sha) 

2714 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp) 

2715 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta: 

2716 assert isinstance(unpacked.delta_base, int) 

2717 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base) 

2718 unpacked.pack_type_num = REF_DELTA 

2719 return unpacked 

2720 

2721 
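# A minimal usage sketch for Pack, assuming `basename` points at an existing
# pack/index pair without extension (e.g. ".git/objects/pack/pack-<sha>"); the
# helper is hypothetical.
def _example_iterate_pack(basename):
    with Pack(basename) as p:
        print("pack contains %d objects" % len(p))
        for obj in p.iterobjects():  # inflate each object, resolving deltas
            obj.check()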

2722def extend_pack( 

2723 f: BinaryIO, 

2724 object_ids: Set[ObjectID], 

2725 get_raw, 

2726 *, 

2727 compression_level=-1, 

2728 progress=None, 

2729) -> Tuple[bytes, List]: 

2730 """Extend a pack file with more objects. 

2731 

2732 The caller should make sure that object_ids does not contain any objects 

2733 that are already in the pack 

2734 """ 

2735 # Update the header with the new number of objects. 

2736 f.seek(0) 

2737 _version, num_objects = read_pack_header(f.read) 

2738 

2739 if object_ids: 

2740 f.seek(0) 

2741 write_pack_header(f.write, num_objects + len(object_ids)) 

2742 

2743 # Must flush before reading (http://bugs.python.org/issue3207) 

2744 f.flush() 

2745 

2746 # Rescan the rest of the pack, computing the SHA with the new header. 

2747 new_sha = compute_file_sha(f, end_ofs=-20) 

2748 

2749 # Must reposition before writing (http://bugs.python.org/issue3207) 

2750 f.seek(0, os.SEEK_CUR) 

2751 

2752 extra_entries = [] 

2753 

2754 # Complete the pack. 

2755 for i, object_id in enumerate(object_ids): 

2756 if progress is not None: 

2757 progress( 

2758 ("writing extra base objects: %d/%d\r" % (i, len(object_ids))).encode( 

2759 "ascii" 

2760 ) 

2761 ) 

2762 assert len(object_id) == 20 

2763 type_num, data = get_raw(object_id) 

2764 offset = f.tell() 

2765 crc32 = write_pack_object( 

2766 f.write, 

2767 type_num, 

2768 data, 

2769 sha=new_sha, 

2770 compression_level=compression_level, 

2771 ) 

2772 extra_entries.append((object_id, offset, crc32)) 

2773 pack_sha = new_sha.digest() 

2774 f.write(pack_sha) 

2775 return pack_sha, extra_entries 

2776 

2777 

2778try: 

2779 from dulwich._pack import ( # type: ignore 

2780 apply_delta, # type: ignore 

2781 bisect_find_sha, # type: ignore 

2782 ) 

2783except ImportError: 

2784 pass