Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 23%

1667 statements  

1# pack.py -- For dealing with packed git objects. 

2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net> 

3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk> 

4# 

5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 

6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

7# General Public License as published by the Free Software Foundation; version 2.0 

8# or (at your option) any later version. You can redistribute it and/or 

9# modify it under the terms of either of these two licenses. 

10# 

11# Unless required by applicable law or agreed to in writing, software 

12# distributed under the License is distributed on an "AS IS" BASIS, 

13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

14# See the License for the specific language governing permissions and 

15# limitations under the License. 

16# 

17# You should have received a copy of the licenses; if not, see 

18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

20# License, Version 2.0. 

21# 

22 

23"""Classes for dealing with packed git objects. 

24 

25A pack is a compact representation of a bunch of objects, stored 

26using deltas where possible. 

27 

28They have two parts, the pack file, which stores the data, and an index 

29that tells you where the data is. 

30 

31To find an object you look in each of the index files until you find a

32match for the object name. The offset recorded there is then used as a

33pointer into the corresponding pack file.

34""" 

35 
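# Illustrative usage sketch (hypothetical paths, not part of this module)
# showing how a pack and its index are typically used together:
#
#     from dulwich.pack import Pack
#     pack = Pack("objects/pack/pack-abc123")  # basename, no .pack/.idx suffix
#     if hex_sha in pack:                      # membership check goes via the index
#         type_num, raw = pack.get_raw(hex_sha)
#     pack.close()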

36__all__ = [ 

37 "DEFAULT_PACK_DELTA_WINDOW_SIZE", 

38 "DEFAULT_PACK_INDEX_VERSION", 

39 "DELTA_TYPES", 

40 "OFS_DELTA", 

41 "PACK_SPOOL_FILE_MAX_SIZE", 

42 "REF_DELTA", 

43 "DeltaChainIterator", 

44 "FilePackIndex", 

45 "MemoryPackIndex", 

46 "ObjectContainer", 

47 "Pack", 

48 "PackChunkGenerator", 

49 "PackData", 

50 "PackFileDisappeared", 

51 "PackHint", 

52 "PackIndex", 

53 "PackIndex1", 

54 "PackIndex2", 

55 "PackIndex3", 

56 "PackIndexEntry", 

57 "PackIndexer", 

58 "PackInflater", 

59 "PackStreamCopier", 

60 "PackStreamReader", 

61 "PackedObjectContainer", 

62 "SHA1Reader", 

63 "SHA1Writer", 

64 "UnpackedObject", 

65 "UnpackedObjectIterator", 

66 "UnpackedObjectStream", 

67 "UnresolvedDeltas", 

68 "apply_delta", 

69 "bisect_find_sha", 

70 "chunks_length", 

71 "compute_file_sha", 

72 "deltas_from_sorted_objects", 

73 "deltify_pack_objects", 

74 "extend_pack", 

75 "find_reusable_deltas", 

76 "full_unpacked_object", 

77 "generate_unpacked_objects", 

78 "iter_sha1", 

79 "load_pack_index", 

80 "load_pack_index_file", 

81 "obj_sha", 

82 "pack_header_chunks", 

83 "pack_object_chunks", 

84 "pack_object_header", 

85 "pack_objects_to_data", 

86 "read_pack_header", 

87 "read_zlib_chunks", 

88 "sort_objects_for_delta", 

89 "take_msb_bytes", 

90 "unpack_object", 

91 "write_pack", 

92 "write_pack_data", 

93 "write_pack_from_container", 

94 "write_pack_header", 

95 "write_pack_index", 

96 "write_pack_object", 

97 "write_pack_objects", 

98] 

99 

100import binascii 

101from collections import defaultdict, deque 

102from contextlib import suppress 

103from io import BytesIO, UnsupportedOperation 

104 

105try: 

106 from cdifflib import CSequenceMatcher as SequenceMatcher 

107except ModuleNotFoundError: 

108 from difflib import SequenceMatcher 

109 

110import os 

111import struct 

112import sys 

113import warnings 

114import zlib 

115from collections.abc import Callable, Iterable, Iterator, Sequence, Set 

116from hashlib import sha1 

117from itertools import chain 

118from os import SEEK_CUR, SEEK_END 

119from struct import unpack_from 

120from types import TracebackType 

121from typing import ( 

122 IO, 

123 TYPE_CHECKING, 

124 Any, 

125 BinaryIO, 

126 Generic, 

127 Protocol, 

128 TypeVar, 

129 cast, 

130) 

131 

132try: 

133 import mmap 

134except ImportError: 

135 has_mmap = False 

136else: 

137 has_mmap = True 

138 

139if TYPE_CHECKING: 

140 from _hashlib import HASH as HashObject 

141 

142 from .bitmap import PackBitmap 

143 from .commit_graph import CommitGraph 

144 from .object_store import BaseObjectStore 

145 from .refs import Ref 

146 

147# For some reason the above try, except fails to set has_mmap = False for plan9 

148if sys.platform == "Plan9": 

149 has_mmap = False 

150 

151from . import replace_me 

152from .errors import ApplyDeltaError, ChecksumMismatch 

153from .file import GitFile, _GitFile 

154from .lru_cache import LRUSizeCache 

155from .objects import ( 

156 ObjectID, 

157 RawObjectID, 

158 ShaFile, 

159 hex_to_sha, 

160 object_header, 

161 sha_to_hex, 

162) 

163 

164OFS_DELTA = 6 

165REF_DELTA = 7 

166 

167DELTA_TYPES = (OFS_DELTA, REF_DELTA) 

168 

169 

170DEFAULT_PACK_DELTA_WINDOW_SIZE = 10 

171 

172# Keep pack files under 16Mb in memory, otherwise write them out to disk 

173PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024 

174 

175# Default pack index version to use when none is specified 

176DEFAULT_PACK_INDEX_VERSION = 2 

177 

178 

179OldUnpackedObject = tuple[bytes | int, list[bytes]] | list[bytes] 

180ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]] 

181ProgressFn = Callable[[int, str], None] 

182PackHint = tuple[int, bytes | None] 

183 

184 

185class UnresolvedDeltas(Exception): 

186 """Delta objects could not be resolved.""" 

187 

188 def __init__(self, shas: list[bytes]) -> None: 

189 """Initialize UnresolvedDeltas exception. 

190 

191 Args: 

192 shas: List of SHA hashes for unresolved delta objects 

193 """ 

194 self.shas = shas 

195 

196 

197class ObjectContainer(Protocol): 

198 """Protocol for objects that can contain git objects.""" 

199 

200 def add_object(self, obj: ShaFile) -> None: 

201 """Add a single object to this object store.""" 

202 

203 def add_objects( 

204 self, 

205 objects: Sequence[tuple[ShaFile, str | None]], 

206 progress: Callable[..., None] | None = None, 

207 ) -> "Pack | None": 

208 """Add a set of objects to this object store. 

209 

210 Args: 

211 objects: Iterable over a list of (object, path) tuples 

212 progress: Progress callback for object insertion 

213 Returns: Optional Pack object of the objects written. 

214 """ 

215 

216 def __contains__(self, sha1: "ObjectID") -> bool: 

217 """Check if a hex sha is present.""" 

218 

219 def __getitem__(self, sha1: "ObjectID | RawObjectID") -> ShaFile: 

220 """Retrieve an object.""" 

221 

222 def get_commit_graph(self) -> "CommitGraph | None": 

223 """Get the commit graph for this object store. 

224 

225 Returns: 

226 CommitGraph object if available, None otherwise 

227 """ 

228 return None 

229 

230 

231class PackedObjectContainer(ObjectContainer): 

232 """Container for objects packed in a pack file.""" 

233 

234 def get_unpacked_object( 

235 self, sha1: "ObjectID | RawObjectID", *, include_comp: bool = False 

236 ) -> "UnpackedObject": 

237 """Get a raw unresolved object. 

238 

239 Args: 

240 sha1: SHA-1 hash of the object 

241 include_comp: Whether to include compressed data 

242 

243 Returns: 

244 UnpackedObject instance 

245 """ 

246 raise NotImplementedError(self.get_unpacked_object) 

247 

248 def iterobjects_subset( 

249 self, shas: Iterable["ObjectID"], *, allow_missing: bool = False 

250 ) -> Iterator[ShaFile]: 

251 """Iterate over a subset of objects. 

252 

253 Args: 

254 shas: Iterable of object SHAs to retrieve 

255 allow_missing: If True, skip missing objects 

256 

257 Returns: 

258 Iterator of ShaFile objects 

259 """ 

260 raise NotImplementedError(self.iterobjects_subset) 

261 

262 def iter_unpacked_subset( 

263 self, 

264 shas: Iterable["ObjectID | RawObjectID"], 

265 *, 

266 include_comp: bool = False, 

267 allow_missing: bool = False, 

268 convert_ofs_delta: bool = True, 

269 ) -> Iterator["UnpackedObject"]: 

270 """Iterate over unpacked objects from a subset of SHAs. 

271 

272 Args: 

273 shas: Set of object SHAs to retrieve 

274 include_comp: Include compressed data if True 

275 allow_missing: If True, skip missing objects 

276 convert_ofs_delta: If True, convert offset deltas to ref deltas 

277 

278 Returns: 

279 Iterator of UnpackedObject instances 

280 """ 

281 raise NotImplementedError(self.iter_unpacked_subset) 

282 

283 

284class UnpackedObjectStream: 

285 """Abstract base class for a stream of unpacked objects.""" 

286 

287 def __iter__(self) -> Iterator["UnpackedObject"]: 

288 """Iterate over unpacked objects.""" 

289 raise NotImplementedError(self.__iter__) 

290 

291 def __len__(self) -> int: 

292 """Return the number of objects in the stream.""" 

293 raise NotImplementedError(self.__len__) 

294 

295 

296def take_msb_bytes( 

297 read: Callable[[int], bytes], crc32: int | None = None 

298) -> tuple[list[int], int | None]: 

299 """Read bytes marked with most significant bit. 

300 

301 Args: 

302 read: Read function 

303 crc32: Optional CRC32 checksum to update 

304 

305 Returns: 

306 Tuple of (list of bytes read, updated CRC32 or None) 

307 """ 

308 ret: list[int] = [] 

309 while len(ret) == 0 or ret[-1] & 0x80: 

310 b = read(1) 

311 if crc32 is not None: 

312 crc32 = binascii.crc32(b, crc32) 

313 ret.append(ord(b[:1])) 

314 return ret, crc32 

315 

316 
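# Illustrative sketch (hypothetical helper, not part of dulwich): pack headers
# encode sizes as MSB-continuation bytes, and take_msb_bytes reads single
# bytes until it sees one with the high bit clear.
def _demo_take_msb_bytes() -> None:
    stream = BytesIO(bytes([0x91, 0x2E, 0xFF]))  # 0x91 continues, 0x2E stops
    raw, crc = take_msb_bytes(stream.read)
    assert raw == [0x91, 0x2E]
    assert crc is None  # no starting CRC32 was supplied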

317class PackFileDisappeared(Exception): 

318 """Raised when a pack file unexpectedly disappears.""" 

319 

320 def __init__(self, obj: object) -> None: 

321 """Initialize PackFileDisappeared exception. 

322 

323 Args: 

324 obj: The object that triggered the exception 

325 """ 

326 self.obj = obj 

327 

328 

329class UnpackedObject: 

330 """Class encapsulating an object unpacked from a pack file. 

331 

332 These objects should only be created from within unpack_object. Most 

333 members start out as empty and are filled in at various points by 

334 read_zlib_chunks, unpack_object, DeltaChainIterator, etc. 

335 

336 End users of this object should take care that the function they're getting 

337 this object from is guaranteed to set the members they need. 

338 """ 

339 

340 __slots__ = [ 

341 "_sha", # Cached binary SHA. 

342 "comp_chunks", # Compressed object chunks. 

343 "crc32", # CRC32. 

344 "decomp_chunks", # Decompressed object chunks. 

345 "decomp_len", # Decompressed length of this object. 

346 "delta_base", # Delta base offset or SHA. 

347 "obj_chunks", # Decompressed and delta-resolved chunks. 

348 "obj_type_num", # Type of this object. 

349 "offset", # Offset in its pack. 

350 "pack_type_num", # Type of this object in the pack (may be a delta). 

351 ] 

352 

353 obj_type_num: int | None 

354 obj_chunks: list[bytes] | None 

355 delta_base: None | bytes | int 

356 decomp_chunks: list[bytes] 

357 comp_chunks: list[bytes] | None 

358 decomp_len: int | None 

359 crc32: int | None 

360 offset: int | None 

361 pack_type_num: int 

362 _sha: bytes | None 

363 

364 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be 

365 # methods of this object. 

366 def __init__( 

367 self, 

368 pack_type_num: int, 

369 *, 

370 delta_base: None | bytes | int = None, 

371 decomp_len: int | None = None, 

372 crc32: int | None = None, 

373 sha: bytes | None = None, 

374 decomp_chunks: list[bytes] | None = None, 

375 offset: int | None = None, 

376 ) -> None: 

377 """Initialize an UnpackedObject. 

378 

379 Args: 

380 pack_type_num: Type number of this object in the pack 

381 delta_base: Delta base (offset or SHA) if this is a delta object 

382 decomp_len: Decompressed length of this object 

383 crc32: CRC32 checksum 

384 sha: SHA-1 hash of the object 

385 decomp_chunks: Decompressed chunks 

386 offset: Offset in the pack file 

387 """ 

388 self.offset = offset 

389 self._sha = sha 

390 self.pack_type_num = pack_type_num 

391 self.delta_base = delta_base 

392 self.comp_chunks = None 

393 self.decomp_chunks: list[bytes] = decomp_chunks or [] 

394 if decomp_chunks is not None and decomp_len is None: 

395 self.decomp_len = sum(map(len, decomp_chunks)) 

396 else: 

397 self.decomp_len = decomp_len 

398 self.crc32 = crc32 

399 

400 if pack_type_num in DELTA_TYPES: 

401 self.obj_type_num = None 

402 self.obj_chunks = None 

403 else: 

404 self.obj_type_num = pack_type_num 

405 self.obj_chunks = self.decomp_chunks 

406 self.delta_base = delta_base 

407 

408 def sha(self) -> RawObjectID: 

409 """Return the binary SHA of this object.""" 

410 if self._sha is None: 

411 assert self.obj_type_num is not None and self.obj_chunks is not None 

412 self._sha = obj_sha(self.obj_type_num, self.obj_chunks) 

413 return RawObjectID(self._sha) 

414 

415 def sha_file(self) -> ShaFile: 

416 """Return a ShaFile from this object.""" 

417 assert self.obj_type_num is not None and self.obj_chunks is not None 

418 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks) 

419 

420 # Only provided for backwards compatibility with code that expects either 

421 # chunks or a delta tuple. 

422 def _obj(self) -> OldUnpackedObject: 

423 """Return the decompressed chunks, or (delta base, delta chunks).""" 

424 if self.pack_type_num in DELTA_TYPES: 

425 assert isinstance(self.delta_base, (bytes, int)) 

426 return (self.delta_base, self.decomp_chunks) 

427 else: 

428 return self.decomp_chunks 

429 

430 def __eq__(self, other: object) -> bool: 

431 """Check equality with another UnpackedObject.""" 

432 if not isinstance(other, UnpackedObject): 

433 return False 

434 for slot in self.__slots__: 

435 if getattr(self, slot) != getattr(other, slot): 

436 return False 

437 return True 

438 

439 def __ne__(self, other: object) -> bool: 

440 """Check inequality with another UnpackedObject.""" 

441 return not (self == other) 

442 

443 def __repr__(self) -> str: 

444 """Return string representation of this UnpackedObject.""" 

445 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__] 

446 return "{}({})".format(self.__class__.__name__, ", ".join(data)) 

447 

448 
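# Illustrative sketch (hypothetical helper, not part of dulwich): a non-delta
# UnpackedObject exposes its chunks directly and can compute its own SHA.
def _demo_unpacked_object() -> None:
    blob = b"hello"
    unpacked = UnpackedObject(3, decomp_chunks=[blob])  # 3 = blob type number
    assert unpacked.obj_chunks == [blob]
    assert bytes(unpacked.sha()) == obj_sha(3, [blob])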

449_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance 

450 

451 

452def read_zlib_chunks( 

453 read_some: Callable[[int], bytes], 

454 unpacked: UnpackedObject, 

455 include_comp: bool = False, 

456 buffer_size: int = _ZLIB_BUFSIZE, 

457) -> bytes: 

458 """Read zlib data from a buffer. 

459 

460 This function requires that the buffer have additional data following the 

461 compressed data, which is guaranteed to be the case for git pack files. 

462 

463 Args: 

464 read_some: Read function that returns at least one byte, but may 

465 return less than the requested size. 

466 unpacked: An UnpackedObject to write result data to. If its crc32 

467 attr is not None, the CRC32 of the compressed bytes will be computed 

468 using this starting CRC32. 

469 After this function, will have the following attrs set: 

470 * comp_chunks (if include_comp is True) 

471 * decomp_chunks 

472 * decomp_len 

473 * crc32 

474 include_comp: If True, include compressed data in the result. 

475 buffer_size: Size of the read buffer. 

476 Returns: Leftover unused data from the decompression. 

477 

478 Raises: 

479 zlib.error: if a decompression error occurred. 

480 """ 

481 if unpacked.decomp_len is None or unpacked.decomp_len <= -1: 

482 raise ValueError("non-negative zlib data stream size expected") 

483 decomp_obj = zlib.decompressobj() 

484 

485 comp_chunks = [] 

486 decomp_chunks = unpacked.decomp_chunks 

487 decomp_len = 0 

488 crc32 = unpacked.crc32 

489 

490 while True: 

491 add = read_some(buffer_size) 

492 if not add: 

493 raise zlib.error("EOF before end of zlib stream") 

494 comp_chunks.append(add) 

495 decomp = decomp_obj.decompress(add) 

496 decomp_len += len(decomp) 

497 decomp_chunks.append(decomp) 

498 unused = decomp_obj.unused_data 

499 if unused: 

500 left = len(unused) 

501 if crc32 is not None: 

502 crc32 = binascii.crc32(add[:-left], crc32) 

503 if include_comp: 

504 comp_chunks[-1] = add[:-left] 

505 break 

506 elif crc32 is not None: 

507 crc32 = binascii.crc32(add, crc32) 

508 if crc32 is not None: 

509 crc32 &= 0xFFFFFFFF 

510 

511 if decomp_len != unpacked.decomp_len: 

512 raise zlib.error("decompressed data does not match expected size") 

513 

514 unpacked.crc32 = crc32 

515 if include_comp: 

516 unpacked.comp_chunks = comp_chunks 

517 return unused 

518 

519 
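# Illustrative sketch (hypothetical helper, not part of dulwich): the stream
# must contain bytes after the deflated payload, as a pack file always does,
# otherwise read_zlib_chunks cannot detect the end of the zlib stream.
def _demo_read_zlib_chunks() -> None:
    payload = b"hello, pack"
    stream = BytesIO(zlib.compress(payload) + b"TRAILER")
    unpacked = UnpackedObject(3, decomp_len=len(payload))  # 3 = blob type number
    leftover = read_zlib_chunks(stream.read, unpacked)
    assert b"".join(unpacked.decomp_chunks) == payload
    assert leftover == b"TRAILER"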

520def iter_sha1(iter: Iterable[bytes]) -> bytes: 

521 """Return the hexdigest of the SHA1 over a set of names. 

522 

523 Args: 

524 iter: Iterator over bytestrings 

525 Returns: 40-byte hex sha1 digest 

526 """ 

527 sha = sha1() 

528 for name in iter: 

529 sha.update(name) 

530 return sha.hexdigest().encode("ascii") 

531 

532 
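# Illustrative sketch (hypothetical helper, not part of dulwich): iter_sha1
# hashes the concatenation of the names it is given.
def _demo_iter_sha1() -> None:
    expected = sha1(b"abcdef").hexdigest().encode("ascii")
    assert iter_sha1([b"abc", b"def"]) == expected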

533def load_pack_index(path: str | os.PathLike[str]) -> "PackIndex": 

534 """Load an index file by path. 

535 

536 Args: 

537 path: Path to the index file 

538 Returns: A PackIndex loaded from the given path 

539 """ 

540 with GitFile(path, "rb") as f: 

541 return load_pack_index_file(path, f) 

542 

543 

544def _load_file_contents( 

545 f: IO[bytes] | _GitFile, size: int | None = None 

546) -> tuple[bytes | Any, int]: 

547 """Load contents from a file, preferring mmap when possible. 

548 

549 Args: 

550 f: File-like object to load 

551 size: Expected size, or None to determine from file 

552 Returns: Tuple of (contents, size) 

553 """ 

554 try: 

555 fd = f.fileno() 

556 except (UnsupportedOperation, AttributeError): 

557 fd = None 

558 # Attempt to use mmap if possible 

559 if fd is not None: 

560 if size is None: 

561 size = os.fstat(fd).st_size 

562 if has_mmap: 

563 try: 

564 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ) 

565 except (OSError, ValueError): 

566 # Can't mmap - perhaps a socket or invalid file descriptor 

567 pass 

568 else: 

569 return contents, size 

570 contents_bytes = f.read() 

571 size = len(contents_bytes) 

572 return contents_bytes, size 

573 

574 

575def load_pack_index_file( 

576 path: str | os.PathLike[str], f: IO[bytes] | _GitFile 

577) -> "PackIndex": 

578 """Load an index file from a file-like object. 

579 

580 Args: 

581 path: Path for the index file 

582 f: File-like object 

583 Returns: A PackIndex loaded from the given file 

584 """ 

585 contents, size = _load_file_contents(f) 

586 if contents[:4] == b"\377tOc": 

587 version = struct.unpack(b">L", contents[4:8])[0] 

588 if version == 2: 

589 return PackIndex2(path, file=f, contents=contents, size=size) 

590 elif version == 3: 

591 return PackIndex3(path, file=f, contents=contents, size=size) 

592 else: 

593 raise KeyError(f"Unknown pack index format {version}") 

594 else: 

595 return PackIndex1(path, file=f, contents=contents, size=size) 

596 

597 

598def bisect_find_sha( 

599 start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes] 

600) -> int | None: 

601 """Find a SHA in a data blob with sorted SHAs. 

602 

603 Args: 

604 start: Start index of range to search 

605 end: End index of range to search 

606 sha: Sha to find 

607 unpack_name: Callback to retrieve SHA by index 

608 Returns: Index of the SHA, or None if it wasn't found 

609 """ 

610 assert start <= end 

611 while start <= end: 

612 i = (start + end) // 2 

613 file_sha = unpack_name(i) 

614 if file_sha < sha: 

615 start = i + 1 

616 elif file_sha > sha: 

617 end = i - 1 

618 else: 

619 return i 

620 return None 

621 

622 
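# Illustrative sketch (hypothetical helper, not part of dulwich): searching a
# blob of concatenated, sorted 20-byte names with bisect_find_sha.
def _demo_bisect_find_sha() -> None:
    names = sorted(sha1(bytes([i])).digest() for i in range(8))
    blob = b"".join(names)

    def unpack_name(i: int) -> bytes:
        return blob[i * 20 : (i + 1) * 20]

    assert bisect_find_sha(0, len(names) - 1, names[3], unpack_name) == 3
    assert bisect_find_sha(0, len(names) - 1, b"\x00" * 20, unpack_name) is None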

623PackIndexEntry = tuple[RawObjectID, int, int | None] 

624 

625 

626class PackIndex: 

627 """An index in to a packfile. 

628 

629 Given a sha id of an object a pack index can tell you the location in the 

630 packfile of that object if it has it. 

631 """ 

632 

633 # Default to SHA-1 for backward compatibility 

634 hash_algorithm = 1 

635 hash_size = 20 

636 

637 def __eq__(self, other: object) -> bool: 

638 """Check equality with another PackIndex.""" 

639 if not isinstance(other, PackIndex): 

640 return False 

641 

642 for (name1, _, _), (name2, _, _) in zip( 

643 self.iterentries(), other.iterentries() 

644 ): 

645 if name1 != name2: 

646 return False 

647 return True 

648 

649 def __ne__(self, other: object) -> bool: 

650 """Check if this pack index is not equal to another.""" 

651 return not self.__eq__(other) 

652 

653 def __len__(self) -> int: 

654 """Return the number of entries in this pack index.""" 

655 raise NotImplementedError(self.__len__) 

656 

657 def __iter__(self) -> Iterator[ObjectID]: 

658 """Iterate over the SHAs in this pack.""" 

659 return map(lambda sha: sha_to_hex(RawObjectID(sha)), self._itersha()) 

660 

661 def iterentries(self) -> Iterator[PackIndexEntry]: 

662 """Iterate over the entries in this pack index. 

663 

664 Returns: iterator over tuples with object name, offset in packfile and 

665 crc32 checksum. 

666 """ 

667 raise NotImplementedError(self.iterentries) 

668 

669 def get_pack_checksum(self) -> bytes | None: 

670 """Return the SHA1 checksum stored for the corresponding packfile. 

671 

672 Returns: 20-byte binary digest, or None if not available 

673 """ 

674 raise NotImplementedError(self.get_pack_checksum) 

675 

676 @replace_me(since="0.21.0", remove_in="0.23.0") 

677 def object_index(self, sha: ObjectID | RawObjectID) -> int: 

678 """Return the index for the given SHA. 

679 

680 Args: 

681 sha: SHA-1 hash 

682 

683 Returns: 

684 Index position 

685 """ 

686 return self.object_offset(sha) 

687 

688 def object_offset(self, sha: ObjectID | RawObjectID) -> int: 

689 """Return the offset in to the corresponding packfile for the object. 

690 

691 Given the name of an object it will return the offset that object 

692 lives within the corresponding pack file. If the pack file doesn't 

693 have the object, a KeyError is raised. 

694 """ 

695 raise NotImplementedError(self.object_offset) 

696 

697 def object_sha1(self, index: int) -> bytes: 

698 """Return the SHA1 corresponding to the index in the pack file.""" 

699 for name, offset, _crc32 in self.iterentries(): 

700 if offset == index: 

701 return name 

702 else: 

703 raise KeyError(index) 

704 

705 def _object_offset(self, sha: bytes) -> int: 

706 """See object_offset. 

707 

708 Args: 

709 sha: A *binary* SHA string (20 bytes long). 

710 """ 

711 raise NotImplementedError(self._object_offset) 

712 

713 def objects_sha1(self) -> bytes: 

714 """Return the hex SHA1 over all the shas of all objects in this pack. 

715 

716 Note: This is used for the filename of the pack. 

717 """ 

718 return iter_sha1(self._itersha()) 

719 

720 def _itersha(self) -> Iterator[bytes]: 

721 """Yield all the SHA1's of the objects in the index, sorted.""" 

722 raise NotImplementedError(self._itersha) 

723 

724 def iter_prefix(self, prefix: bytes) -> Iterator[RawObjectID]: 

725 """Iterate over all SHA1s with the given prefix. 

726 

727 Args: 

728 prefix: Binary prefix to match 

729 Returns: Iterator of matching SHA1s 

730 """ 

731 # Default implementation for PackIndex classes that don't override 

732 for sha, _, _ in self.iterentries(): 

733 if sha.startswith(prefix): 

734 yield RawObjectID(sha) 

735 

736 def close(self) -> None: 

737 """Close any open files.""" 

738 

739 def check(self) -> None: 

740 """Check the consistency of this pack index.""" 

741 

742 

743class MemoryPackIndex(PackIndex): 

744 """Pack index that is stored entirely in memory.""" 

745 

746 def __init__( 

747 self, 

748 entries: list[PackIndexEntry], 

749 pack_checksum: bytes | None = None, 

750 ) -> None: 

751 """Create a new MemoryPackIndex. 

752 

753 Args: 

754 entries: Sequence of name, idx, crc32 (sorted) 

755 pack_checksum: Optional pack checksum 

756 """ 

757 self._by_sha = {} 

758 self._by_offset = {} 

759 for name, offset, _crc32 in entries: 

760 self._by_sha[name] = offset 

761 self._by_offset[offset] = name 

762 self._entries = entries 

763 self._pack_checksum = pack_checksum 

764 

765 def get_pack_checksum(self) -> bytes | None: 

766 """Return the SHA checksum stored for the corresponding packfile.""" 

767 return self._pack_checksum 

768 

769 def __len__(self) -> int: 

770 """Return the number of entries in this pack index.""" 

771 return len(self._entries) 

772 

773 def object_offset(self, sha: ObjectID | RawObjectID) -> int: 

774 """Return the offset for the given SHA. 

775 

776 Args: 

777 sha: SHA to look up (binary or hex) 

778 Returns: Offset in the pack file 

779 """ 

780 if len(sha) == 40: 

781 sha = hex_to_sha(cast(ObjectID, sha)) 

782 return self._by_sha[cast(RawObjectID, sha)] 

783 

784 def object_sha1(self, offset: int) -> bytes: 

785 """Return the SHA1 for the object at the given offset.""" 

786 return self._by_offset[offset] 

787 

788 def _itersha(self) -> Iterator[bytes]: 

789 """Iterate over all SHA1s in the index.""" 

790 return iter(self._by_sha) 

791 

792 def iterentries(self) -> Iterator[PackIndexEntry]: 

793 """Iterate over all index entries.""" 

794 return iter(self._entries) 

795 

796 @classmethod 

797 def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex": 

798 """Create a MemoryPackIndex from a PackData object.""" 

799 return MemoryPackIndex( 

800 list(pack_data.sorted_entries()), pack_data.get_stored_checksum() 

801 ) 

802 

803 @classmethod 

804 def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex": 

805 """Create a copy of another PackIndex in memory.""" 

806 return cls(list(other_index.iterentries()), other_index.get_pack_checksum()) 

807 

808 
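# Illustrative sketch (hypothetical helper, not part of dulwich): entries are
# (binary name, pack offset, crc32) tuples sorted by name; plain bytes are
# used here where RawObjectID would normally appear.
def _demo_memory_pack_index() -> None:
    name = sha1(b"object-1").digest()
    index = MemoryPackIndex([(name, 42, None)])
    assert len(index) == 1
    assert index.object_offset(name) == 42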

809class FilePackIndex(PackIndex): 

810 """Pack index that is based on a file. 

811 

812 The index starts with a fan-out table: 256 four-byte entries, one per 

813 possible first byte of a sha id. Each entry records the end of the group 

814 of objects whose shas start with that byte, and the entry for the 

815 preceding byte gives the start of the group. Within a group the shas 

816 are sorted, so computing the start and end offsets and bisecting 

817 between them determines whether the value is 

818 present. 

819 """ 

820 

821 _fan_out_table: list[int] 

822 _file: IO[bytes] | _GitFile 

823 

824 def __init__( 

825 self, 

826 filename: str | os.PathLike[str], 

827 file: IO[bytes] | _GitFile | None = None, 

828 contents: "bytes | mmap.mmap | None" = None, 

829 size: int | None = None, 

830 ) -> None: 

831 """Create a pack index object. 

832 

833 Provide it with the name of the index file to consider, and it will map 

834 it whenever required. 

835 """ 

836 self._filename = filename 

837 # Take the size now, so it can be checked each time we map the file to 

838 # ensure that it hasn't changed. 

839 if file is None: 

840 self._file = GitFile(filename, "rb") 

841 else: 

842 self._file = file 

843 if contents is None: 

844 self._contents, self._size = _load_file_contents(self._file, size) 

845 else: 

846 self._contents = contents 

847 self._size = size if size is not None else len(contents) 

848 

849 @property 

850 def path(self) -> str: 

851 """Return the path to this index file.""" 

852 return os.fspath(self._filename) 

853 

854 def __eq__(self, other: object) -> bool: 

855 """Check equality with another FilePackIndex.""" 

856 # Quick optimization: 

857 if ( 

858 isinstance(other, FilePackIndex) 

859 and self._fan_out_table != other._fan_out_table 

860 ): 

861 return False 

862 

863 return super().__eq__(other) 

864 

865 def close(self) -> None: 

866 """Close the underlying file and any mmap.""" 

867 self._file.close() 

868 close_fn = getattr(self._contents, "close", None) 

869 if close_fn is not None: 

870 close_fn() 

871 

872 def __len__(self) -> int: 

873 """Return the number of entries in this pack index.""" 

874 return self._fan_out_table[-1] 

875 

876 def _unpack_entry(self, i: int) -> PackIndexEntry: 

877 """Unpack the i-th entry in the index file. 

878 

879 Returns: Tuple with object name (SHA), offset in pack file and CRC32 

880 checksum (if known). 

881 """ 

882 raise NotImplementedError(self._unpack_entry) 

883 

884 def _unpack_name(self, i: int) -> bytes: 

885 """Unpack the i-th name from the index file.""" 

886 raise NotImplementedError(self._unpack_name) 

887 

888 def _unpack_offset(self, i: int) -> int: 

889 """Unpack the i-th object offset from the index file.""" 

890 raise NotImplementedError(self._unpack_offset) 

891 

892 def _unpack_crc32_checksum(self, i: int) -> int | None: 

893 """Unpack the crc32 checksum for the ith object from the index file.""" 

894 raise NotImplementedError(self._unpack_crc32_checksum) 

895 

896 def _itersha(self) -> Iterator[bytes]: 

897 """Iterate over all SHA1s in the index.""" 

898 for i in range(len(self)): 

899 yield self._unpack_name(i) 

900 

901 def iterentries(self) -> Iterator[PackIndexEntry]: 

902 """Iterate over the entries in this pack index. 

903 

904 Returns: iterator over tuples with object name, offset in packfile and 

905 crc32 checksum. 

906 """ 

907 for i in range(len(self)): 

908 yield self._unpack_entry(i) 

909 

910 def _read_fan_out_table(self, start_offset: int) -> list[int]: 

911 """Read the fan-out table from the index. 

912 

913 The fan-out table contains 256 entries mapping first byte values 

914 to the number of objects with SHA1s less than or equal to that byte. 

915 

916 Args: 

917 start_offset: Offset in the file where the fan-out table starts 

918 Returns: List of 256 integers 

919 """ 

920 ret = [] 

921 for i in range(0x100): 

922 fanout_entry = self._contents[ 

923 start_offset + i * 4 : start_offset + (i + 1) * 4 

924 ] 

925 ret.append(struct.unpack(">L", fanout_entry)[0]) 

926 return ret 

927 

928 def check(self) -> None: 

929 """Check that the stored checksum matches the actual checksum.""" 

930 actual = self.calculate_checksum() 

931 stored = self.get_stored_checksum() 

932 if actual != stored: 

933 raise ChecksumMismatch(stored, actual) 

934 

935 def calculate_checksum(self) -> bytes: 

936 """Calculate the SHA1 checksum over this pack index. 

937 

938 Returns: This is a 20-byte binary digest 

939 """ 

940 return sha1(self._contents[:-20]).digest() 

941 

942 def get_pack_checksum(self) -> bytes: 

943 """Return the SHA1 checksum stored for the corresponding packfile. 

944 

945 Returns: 20-byte binary digest 

946 """ 

947 return bytes(self._contents[-40:-20]) 

948 

949 def get_stored_checksum(self) -> bytes: 

950 """Return the SHA1 checksum stored for this index. 

951 

952 Returns: 20-byte binary digest 

953 """ 

954 return bytes(self._contents[-20:]) 

955 

956 def object_offset(self, sha: ObjectID | RawObjectID) -> int: 

957 """Return the offset in to the corresponding packfile for the object. 

958 

959 Given the name of an object it will return the offset that object 

960 lives within the corresponding pack file. If the pack file doesn't 

961 have the object, a KeyError is raised. 

962 """ 

963 if len(sha) == 40: 

964 sha = hex_to_sha(cast(ObjectID, sha)) 

965 try: 

966 return self._object_offset(sha) 

967 except ValueError as exc: 

968 closed = getattr(self._contents, "closed", None) 

969 if closed in (None, True): 

970 raise PackFileDisappeared(self) from exc 

971 raise 

972 

973 def _object_offset(self, sha: bytes) -> int: 

974 """See object_offset. 

975 

976 Args: 

977 sha: A *binary* SHA string (20 bytes long). 

978 """ 

979 assert len(sha) == 20 

980 idx = ord(sha[:1]) 

981 if idx == 0: 

982 start = 0 

983 else: 

984 start = self._fan_out_table[idx - 1] 

985 end = self._fan_out_table[idx] 

986 i = bisect_find_sha(start, end, sha, self._unpack_name) 

987 if i is None: 

988 raise KeyError(sha) 

989 return self._unpack_offset(i) 

990 

991 def iter_prefix(self, prefix: bytes) -> Iterator[RawObjectID]: 

992 """Iterate over all SHA1s with the given prefix.""" 

993 start = ord(prefix[:1]) 

994 if start == 0: 

995 start = 0 

996 else: 

997 start = self._fan_out_table[start - 1] 

998 end = ord(prefix[:1]) + 1 

999 if end == 0x100: 

1000 end = len(self) 

1001 else: 

1002 end = self._fan_out_table[end] 

1003 assert start <= end 

1004 started = False 

1005 for i in range(start, end): 

1006 name: bytes = self._unpack_name(i) 

1007 if name.startswith(prefix): 

1008 yield RawObjectID(name) 

1009 started = True 

1010 elif started: 

1011 break 

1012 

1013 
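# Illustrative sketch (hypothetical helper, not part of dulwich): the fan-out
# logic used by _object_offset and iter_prefix above. For a SHA whose first
# byte is b, the candidate range is [fan_out[b - 1], fan_out[b]), with
# fan_out[-1] treated as 0.
def _fan_out_range(fan_out_table: list[int], first_byte: int) -> tuple[int, int]:
    start = 0 if first_byte == 0 else fan_out_table[first_byte - 1]
    return start, fan_out_table[first_byte]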

1014class PackIndex1(FilePackIndex): 

1015 """Version 1 Pack Index file.""" 

1016 

1017 def __init__( 

1018 self, 

1019 filename: str | os.PathLike[str], 

1020 file: IO[bytes] | _GitFile | None = None, 

1021 contents: bytes | None = None, 

1022 size: int | None = None, 

1023 ) -> None: 

1024 """Initialize a version 1 pack index. 

1025 

1026 Args: 

1027 filename: Path to the index file 

1028 file: Optional file object 

1029 contents: Optional mmap'd contents 

1030 size: Optional size of the index 

1031 """ 

1032 super().__init__(filename, file, contents, size) 

1033 self.version = 1 

1034 self._fan_out_table = self._read_fan_out_table(0) 

1035 

1036 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, None]: 

1037 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24)) 

1038 return (RawObjectID(name), offset, None) 

1039 

1040 def _unpack_name(self, i: int) -> bytes: 

1041 offset = (0x100 * 4) + (i * 24) + 4 

1042 return self._contents[offset : offset + 20] 

1043 

1044 def _unpack_offset(self, i: int) -> int: 

1045 offset = (0x100 * 4) + (i * 24) 

1046 result = unpack_from(">L", self._contents, offset)[0] 

1047 assert isinstance(result, int) 

1048 return result 

1049 

1050 def _unpack_crc32_checksum(self, i: int) -> None: 

1051 # Not stored in v1 index files 

1052 return None 

1053 

1054 

1055class PackIndex2(FilePackIndex): 

1056 """Version 2 Pack Index file.""" 

1057 

1058 def __init__( 

1059 self, 

1060 filename: str | os.PathLike[str], 

1061 file: IO[bytes] | _GitFile | None = None, 

1062 contents: bytes | None = None, 

1063 size: int | None = None, 

1064 ) -> None: 

1065 """Initialize a version 2 pack index. 

1066 

1067 Args: 

1068 filename: Path to the index file 

1069 file: Optional file object 

1070 contents: Optional mmap'd contents 

1071 size: Optional size of the index 

1072 """ 

1073 super().__init__(filename, file, contents, size) 

1074 if self._contents[:4] != b"\377tOc": 

1075 raise AssertionError("Not a v2 pack index file") 

1076 (self.version,) = unpack_from(b">L", self._contents, 4) 

1077 if self.version != 2: 

1078 raise AssertionError(f"Version was {self.version}") 

1079 self._fan_out_table = self._read_fan_out_table(8) 

1080 self._name_table_offset = 8 + 0x100 * 4 

1081 self._crc32_table_offset = self._name_table_offset + 20 * len(self) 

1082 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self) 

1083 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len( 

1084 self 

1085 ) 

1086 

1087 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, int]: 

1088 return ( 

1089 RawObjectID(self._unpack_name(i)), 

1090 self._unpack_offset(i), 

1091 self._unpack_crc32_checksum(i), 

1092 ) 

1093 

1094 def _unpack_name(self, i: int) -> bytes: 

1095 offset = self._name_table_offset + i * 20 

1096 return self._contents[offset : offset + 20] 

1097 

1098 def _unpack_offset(self, i: int) -> int: 

1099 offset_pos = self._pack_offset_table_offset + i * 4 

1100 offset = unpack_from(">L", self._contents, offset_pos)[0] 

1101 assert isinstance(offset, int) 

1102 if offset & (2**31): 

1103 large_offset_pos = ( 

1104 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8 

1105 ) 

1106 offset = unpack_from(">Q", self._contents, large_offset_pos)[0] 

1107 assert isinstance(offset, int) 

1108 return offset 

1109 

1110 def _unpack_crc32_checksum(self, i: int) -> int: 

1111 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0] 

1112 assert isinstance(result, int) 

1113 return result 

1114 

1115 

1116class PackIndex3(FilePackIndex): 

1117 """Version 3 Pack Index file. 

1118 

1119 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes). 

1120 """ 

1121 

1122 def __init__( 

1123 self, 

1124 filename: str | os.PathLike[str], 

1125 file: IO[bytes] | _GitFile | None = None, 

1126 contents: bytes | None = None, 

1127 size: int | None = None, 

1128 ) -> None: 

1129 """Initialize a version 3 pack index. 

1130 

1131 Args: 

1132 filename: Path to the index file 

1133 file: Optional file object 

1134 contents: Optional mmap'd contents 

1135 size: Optional size of the index 

1136 """ 

1137 super().__init__(filename, file, contents, size) 

1138 if self._contents[:4] != b"\377tOc": 

1139 raise AssertionError("Not a v3 pack index file") 

1140 (self.version,) = unpack_from(b">L", self._contents, 4) 

1141 if self.version != 3: 

1142 raise AssertionError(f"Version was {self.version}") 

1143 

1144 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256) 

1145 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8) 

1146 if self.hash_algorithm == 1: 

1147 self.hash_size = 20 # SHA-1 

1148 elif self.hash_algorithm == 2: 

1149 self.hash_size = 32 # SHA-256 

1150 else: 

1151 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}") 

1152 

1153 # Read length of shortened object names 

1154 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12) 

1155 

1156 # Calculate offsets based on variable hash size 

1157 self._fan_out_table = self._read_fan_out_table( 

1158 16 

1159 ) # After header (4 + 4 + 4 + 4) 

1160 self._name_table_offset = 16 + 0x100 * 4 

1161 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self) 

1162 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self) 

1163 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len( 

1164 self 

1165 ) 

1166 

1167 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, int]: 

1168 return ( 

1169 RawObjectID(self._unpack_name(i)), 

1170 self._unpack_offset(i), 

1171 self._unpack_crc32_checksum(i), 

1172 ) 

1173 

1174 def _unpack_name(self, i: int) -> bytes: 

1175 offset = self._name_table_offset + i * self.hash_size 

1176 return self._contents[offset : offset + self.hash_size] 

1177 

1178 def _unpack_offset(self, i: int) -> int: 

1179 offset_pos = self._pack_offset_table_offset + i * 4 

1180 offset = unpack_from(">L", self._contents, offset_pos)[0] 

1181 assert isinstance(offset, int) 

1182 if offset & (2**31): 

1183 large_offset_pos = ( 

1184 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8 

1185 ) 

1186 offset = unpack_from(">Q", self._contents, large_offset_pos)[0] 

1187 assert isinstance(offset, int) 

1188 return offset 

1189 

1190 def _unpack_crc32_checksum(self, i: int) -> int: 

1191 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0] 

1192 assert isinstance(result, int) 

1193 return result 

1194 

1195 

1196def read_pack_header(read: Callable[[int], bytes]) -> tuple[int, int]: 

1197 """Read the header of a pack file. 

1198 

1199 Args: 

1200 read: Read function 

1201 Returns: Tuple of (pack version, number of objects). Raises 

1202 AssertionError if the header is missing or invalid. 

1203 """ 

1204 header = read(12) 

1205 if not header: 

1206 raise AssertionError("file too short to contain pack") 

1207 if header[:4] != b"PACK": 

1208 raise AssertionError(f"Invalid pack header {header!r}") 

1209 (version,) = unpack_from(b">L", header, 4) 

1210 if version not in (2, 3): 

1211 raise AssertionError(f"Version was {version}") 

1212 (num_objects,) = unpack_from(b">L", header, 8) 

1213 return (version, num_objects) 

1214 

1215 
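# Illustrative sketch (hypothetical helper, not part of dulwich): a pack
# header is the magic b"PACK" followed by two big-endian 32-bit integers,
# the version and the object count.
def _demo_read_pack_header() -> None:
    header = struct.pack(">4sLL", b"PACK", 2, 3)
    assert read_pack_header(BytesIO(header).read) == (2, 3)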

1216def chunks_length(chunks: bytes | Iterable[bytes]) -> int: 

1217 """Get the total length of a sequence of chunks. 

1218 

1219 Args: 

1220 chunks: Either a single bytes object or an iterable of bytes 

1221 Returns: Total length in bytes 

1222 """ 

1223 if isinstance(chunks, bytes): 

1224 return len(chunks) 

1225 else: 

1226 return sum(map(len, chunks)) 

1227 

1228 

1229def unpack_object( 

1230 read_all: Callable[[int], bytes], 

1231 read_some: Callable[[int], bytes] | None = None, 

1232 compute_crc32: bool = False, 

1233 include_comp: bool = False, 

1234 zlib_bufsize: int = _ZLIB_BUFSIZE, 

1235) -> tuple[UnpackedObject, bytes]: 

1236 """Unpack a Git object. 

1237 

1238 Args: 

1239 read_all: Read function that blocks until the number of requested 

1240 bytes are read. 

1241 read_some: Read function that returns at least one byte, but may not 

1242 return the number of bytes requested. 

1243 compute_crc32: If True, compute the CRC32 of the compressed data. If 

1244 False, the returned CRC32 will be None. 

1245 include_comp: If True, include compressed data in the result. 

1246 zlib_bufsize: An optional buffer size for zlib operations. 

1247 Returns: A tuple of (unpacked, unused), where unused is the unused data 

1248 leftover from decompression, and unpacked in an UnpackedObject with 

1249 the following attrs set: 

1250 

1251 * obj_chunks (for non-delta types) 

1252 * pack_type_num 

1253 * delta_base (for delta types) 

1254 * comp_chunks (if include_comp is True) 

1255 * decomp_chunks 

1256 * decomp_len 

1257 * crc32 (if compute_crc32 is True) 

1258 """ 

1259 if read_some is None: 

1260 read_some = read_all 

1261 if compute_crc32: 

1262 crc32 = 0 

1263 else: 

1264 crc32 = None 

1265 

1266 raw, crc32 = take_msb_bytes(read_all, crc32=crc32) 

1267 type_num = (raw[0] >> 4) & 0x07 

1268 size = raw[0] & 0x0F 

1269 for i, byte in enumerate(raw[1:]): 

1270 size += (byte & 0x7F) << ((i * 7) + 4) 

1271 

1272 delta_base: int | bytes | None 

1273 raw_base = len(raw) 

1274 if type_num == OFS_DELTA: 

1275 raw, crc32 = take_msb_bytes(read_all, crc32=crc32) 

1276 raw_base += len(raw) 

1277 if raw[-1] & 0x80: 

1278 raise AssertionError 

1279 delta_base_offset = raw[0] & 0x7F 

1280 for byte in raw[1:]: 

1281 delta_base_offset += 1 

1282 delta_base_offset <<= 7 

1283 delta_base_offset += byte & 0x7F 

1284 delta_base = delta_base_offset 

1285 elif type_num == REF_DELTA: 

1286 delta_base_obj = read_all(20) 

1287 if crc32 is not None: 

1288 crc32 = binascii.crc32(delta_base_obj, crc32) 

1289 delta_base = delta_base_obj 

1290 raw_base += 20 

1291 else: 

1292 delta_base = None 

1293 

1294 unpacked = UnpackedObject( 

1295 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32 

1296 ) 

1297 unused = read_zlib_chunks( 

1298 read_some, 

1299 unpacked, 

1300 buffer_size=zlib_bufsize, 

1301 include_comp=include_comp, 

1302 ) 

1303 return unpacked, unused 

1304 

1305 
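# Illustrative sketch (hypothetical helper, not part of dulwich): a minimal
# non-delta pack entry is a one-byte type/size header followed by the
# deflated body; the trailing bytes stand in for the rest of the pack.
def _demo_unpack_object() -> None:
    body = b"example"
    header = bytes([(3 << 4) | len(body)])  # type 3 (blob), size 7, MSB clear
    stream = BytesIO(header + zlib.compress(body) + b"rest-of-pack")
    unpacked, _unused = unpack_object(stream.read)
    assert unpacked.obj_type_num == 3
    assert b"".join(unpacked.obj_chunks or []) == body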

1306def _compute_object_size(value: tuple[int, Any]) -> int: 

1307 """Compute the size of a unresolved object for use with LRUSizeCache.""" 

1308 (num, obj) = value 

1309 if num in DELTA_TYPES: 

1310 return chunks_length(obj[1]) 

1311 return chunks_length(obj) 

1312 

1313 

1314class PackStreamReader: 

1315 """Class to read a pack stream. 

1316 

1317 The pack is read from a ReceivableProtocol using read() or recv() as 

1318 appropriate. 

1319 """ 

1320 

1321 def __init__( 

1322 self, 

1323 read_all: Callable[[int], bytes], 

1324 read_some: Callable[[int], bytes] | None = None, 

1325 zlib_bufsize: int = _ZLIB_BUFSIZE, 

1326 ) -> None: 

1327 """Initialize pack stream reader. 

1328 

1329 Args: 

1330 read_all: Function to read all requested bytes 

1331 read_some: Function to read some bytes (optional) 

1332 zlib_bufsize: Buffer size for zlib decompression 

1333 """ 

1334 self.read_all = read_all 

1335 if read_some is None: 

1336 self.read_some = read_all 

1337 else: 

1338 self.read_some = read_some 

1339 self.sha = sha1() 

1340 self._offset = 0 

1341 self._rbuf = BytesIO() 

1342 # trailer is a deque to avoid memory allocation on small reads 

1343 self._trailer: deque[int] = deque() 

1344 self._zlib_bufsize = zlib_bufsize 

1345 

1346 def _read(self, read: Callable[[int], bytes], size: int) -> bytes: 

1347 """Read up to size bytes using the given callback. 

1348 

1349 As a side effect, update the verifier's hash (excluding the last 20 

1350 bytes read). 

1351 

1352 Args: 

1353 read: The read callback to read from. 

1354 size: The maximum number of bytes to read; the particular 

1355 behavior is callback-specific. 

1356 Returns: Bytes read 

1357 """ 

1358 data = read(size) 

1359 

1360 # maintain a trailer of the last 20 bytes we've read 

1361 n = len(data) 

1362 self._offset += n 

1363 tn = len(self._trailer) 

1364 if n >= 20: 

1365 to_pop = tn 

1366 to_add = 20 

1367 else: 

1368 to_pop = max(n + tn - 20, 0) 

1369 to_add = n 

1370 self.sha.update( 

1371 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])) 

1372 ) 

1373 self._trailer.extend(data[-to_add:]) 

1374 

1375 # hash everything but the trailer 

1376 self.sha.update(data[:-to_add]) 

1377 return data 

1378 

1379 def _buf_len(self) -> int: 

1380 buf = self._rbuf 

1381 start = buf.tell() 

1382 buf.seek(0, SEEK_END) 

1383 end = buf.tell() 

1384 buf.seek(start) 

1385 return end - start 

1386 

1387 @property 

1388 def offset(self) -> int: 

1389 """Return current offset in the stream.""" 

1390 return self._offset - self._buf_len() 

1391 

1392 def read(self, size: int) -> bytes: 

1393 """Read, blocking until size bytes are read.""" 

1394 buf_len = self._buf_len() 

1395 if buf_len >= size: 

1396 return self._rbuf.read(size) 

1397 buf_data = self._rbuf.read() 

1398 self._rbuf = BytesIO() 

1399 return buf_data + self._read(self.read_all, size - buf_len) 

1400 

1401 def recv(self, size: int) -> bytes: 

1402 """Read up to size bytes, blocking until one byte is read.""" 

1403 buf_len = self._buf_len() 

1404 if buf_len: 

1405 data = self._rbuf.read(size) 

1406 if size >= buf_len: 

1407 self._rbuf = BytesIO() 

1408 return data 

1409 return self._read(self.read_some, size) 

1410 

1411 def __len__(self) -> int: 

1412 """Return the number of objects in this pack.""" 

1413 return self._num_objects 

1414 

1415 def read_objects(self, compute_crc32: bool = False) -> Iterator[UnpackedObject]: 

1416 """Read the objects in this pack file. 

1417 

1418 Args: 

1419 compute_crc32: If True, compute the CRC32 of the compressed 

1420 data. If False, the returned CRC32 will be None. 

1421 Returns: Iterator over UnpackedObjects with the following members set: 

1422 offset 

1423 obj_type_num 

1424 obj_chunks (for non-delta types) 

1425 delta_base (for delta types) 

1426 decomp_chunks 

1427 decomp_len 

1428 crc32 (if compute_crc32 is True) 

1429 

1430 Raises: 

1431 ChecksumMismatch: if the checksum of the pack contents does not 

1432 match the checksum in the pack trailer. 

1433 zlib.error: if an error occurred during zlib decompression. 

1434 IOError: if an error occurred writing to the output file. 

1435 """ 

1436 _pack_version, self._num_objects = read_pack_header(self.read) 

1437 

1438 for _ in range(self._num_objects): 

1439 offset = self.offset 

1440 unpacked, unused = unpack_object( 

1441 self.read, 

1442 read_some=self.recv, 

1443 compute_crc32=compute_crc32, 

1444 zlib_bufsize=self._zlib_bufsize, 

1445 ) 

1446 unpacked.offset = offset 

1447 

1448 # prepend any unused data to current read buffer 

1449 buf = BytesIO() 

1450 buf.write(unused) 

1451 buf.write(self._rbuf.read()) 

1452 buf.seek(0) 

1453 self._rbuf = buf 

1454 

1455 yield unpacked 

1456 

1457 if self._buf_len() < 20: 

1458 # If the read buffer is full, then the last read() got the whole 

1459 # trailer off the wire. If not, it means there is still some of the 

1460 # trailer to read. We need to read() all 20 bytes; N come from the 

1461 # read buffer and (20 - N) come from the wire. 

1462 self.read(20) 

1463 

1464 pack_sha = bytearray(self._trailer) 

1465 if pack_sha != self.sha.digest(): 

1466 raise ChecksumMismatch( 

1467 sha_to_hex(RawObjectID(bytes(pack_sha))), self.sha.hexdigest() 

1468 ) 

1469 

1470 

1471class PackStreamCopier(PackStreamReader): 

1472 """Class to verify a pack stream as it is being read. 

1473 

1474 The pack is read from a ReceivableProtocol using read() or recv() as 

1475 appropriate and written out to the given file-like object. 

1476 """ 

1477 

1478 def __init__( 

1479 self, 

1480 read_all: Callable[[int], bytes], 

1481 read_some: Callable[[int], bytes] | None, 

1482 outfile: IO[bytes], 

1483 delta_iter: "DeltaChainIterator[UnpackedObject] | None" = None, 

1484 ) -> None: 

1485 """Initialize the copier. 

1486 

1487 Args: 

1488 read_all: Read function that blocks until the number of 

1489 requested bytes are read. 

1490 read_some: Read function that returns at least one byte, but may 

1491 not return the number of bytes requested. 

1492 outfile: File-like object to write output through. 

1493 delta_iter: Optional DeltaChainIterator to record deltas as we 

1494 read them. 

1495 """ 

1496 super().__init__(read_all, read_some=read_some) 

1497 self.outfile = outfile 

1498 self._delta_iter = delta_iter 

1499 

1500 def _read(self, read: Callable[[int], bytes], size: int) -> bytes: 

1501 """Read data from the read callback and write it to the file.""" 

1502 data = super()._read(read, size) 

1503 self.outfile.write(data) 

1504 return data 

1505 

1506 def verify(self, progress: Callable[..., None] | None = None) -> None: 

1507 """Verify a pack stream and write it to the output file. 

1508 

1509 See PackStreamReader.iterobjects for a list of exceptions this may 

1510 throw. 

1511 """ 

1512 i = 0 # default count of entries if read_objects() is empty 

1513 for i, unpacked in enumerate(self.read_objects()): 

1514 if self._delta_iter: 

1515 self._delta_iter.record(unpacked) 

1516 if progress is not None: 

1517 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii")) 

1518 if progress is not None: 

1519 progress(f"copied {i} pack entries\n".encode("ascii")) 

1520 

1521 

1522def obj_sha(type: int, chunks: bytes | Iterable[bytes]) -> bytes: 

1523 """Compute the SHA for a numeric type and object chunks.""" 

1524 sha = sha1() 

1525 sha.update(object_header(type, chunks_length(chunks))) 

1526 if isinstance(chunks, bytes): 

1527 sha.update(chunks) 

1528 else: 

1529 for chunk in chunks: 

1530 sha.update(chunk) 

1531 return sha.digest() 

1532 

1533 
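# Illustrative sketch (hypothetical helper, not part of dulwich): obj_sha
# prepends the canonical "<type> <length>\0" header before hashing, matching
# how git names loose objects.
def _demo_obj_sha() -> None:
    content = b"data"
    expected = sha1(b"blob 4\x00" + content).digest()
    assert obj_sha(3, [content]) == expected  # 3 is the blob type number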

1534def compute_file_sha( 

1535 f: IO[bytes], start_ofs: int = 0, end_ofs: int = 0, buffer_size: int = 1 << 16 

1536) -> "HashObject": 

1537 """Hash a portion of a file into a new SHA. 

1538 

1539 Args: 

1540 f: A file-like object to read from that supports seek(). 

1541 start_ofs: The offset in the file to start reading at. 

1542 end_ofs: The offset in the file to end reading at, relative to the 

1543 end of the file. 

1544 buffer_size: A buffer size for reading. 

1545 Returns: A new SHA object updated with data read from the file. 

1546 """ 

1547 sha = sha1() 

1548 f.seek(0, SEEK_END) 

1549 length = f.tell() 

1550 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length: 

1551 raise AssertionError( 

1552 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}" 

1553 ) 

1554 todo = length + end_ofs - start_ofs 

1555 f.seek(start_ofs) 

1556 while todo: 

1557 data = f.read(min(todo, buffer_size)) 

1558 sha.update(data) 

1559 todo -= len(data) 

1560 return sha 

1561 

1562 
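# Illustrative sketch (hypothetical helper, not part of dulwich): hashing all
# but the last 20 bytes mirrors how pack checksums exclude the trailer.
def _demo_compute_file_sha() -> None:
    data = b"x" * 100 + b"\0" * 20  # pretend the final 20 bytes are a trailer
    digest = compute_file_sha(BytesIO(data), end_ofs=-20).digest()
    assert digest == sha1(data[:-20]).digest()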

1563class PackData: 

1564 """The data contained in a packfile. 

1565 

1566 Pack files can be accessed both sequentially for exploding a pack, and 

1567 directly with the help of an index to retrieve a specific object. 

1568 

1569 The objects within are either complete or a delta against another. 

1570 

1571 The header is variable length. If the MSB of each byte is set then it 

1572 indicates that the subsequent byte is still part of the header. 

1573 For the first byte the next three MS bits are the type, which tells you the 

1574 type of object, and whether it is a delta. The four LS bits are the lowest bits of the 

1575 size. For each subsequent byte the LS 7 bits are the next MS bits of the 

1576 size, i.e. the last byte of the header contains the MS bits of the size. 

1577 

1578 For the complete objects the data is stored as zlib deflated data. 

1579 The size in the header is the uncompressed object size, so to uncompress 

1580 you need to just keep feeding data to zlib until you get an object back, 

1581 or it errors on bad data. This is done here by just giving the complete 

1582 buffer from the start of the deflated object on. This is bad, but until I 

1583 get mmap sorted out it will have to do. 

1584 

1585 Currently there are no integrity checks done. Also no attempt is made to 

1586 try and detect the delta case, or a request for an object at the wrong 

1587 position. It will all just throw a zlib or KeyError. 

1588 """ 

1589 

1590 def __init__( 

1591 self, 

1592 filename: str | os.PathLike[str], 

1593 file: IO[bytes] | None = None, 

1594 size: int | None = None, 

1595 *, 

1596 delta_window_size: int | None = None, 

1597 window_memory: int | None = None, 

1598 delta_cache_size: int | None = None, 

1599 depth: int | None = None, 

1600 threads: int | None = None, 

1601 big_file_threshold: int | None = None, 

1602 ) -> None: 

1603 """Create a PackData object representing the pack in the given filename. 

1604 

1605 The file must exist and stay readable until the object is disposed of. 

1606 It must also stay the same size. It will be mapped whenever needed. 

1607 

1608 Currently there is a restriction on the size of the pack as the python 

1609 mmap implementation is flawed. 

1610 """ 

1611 self._filename = filename 

1612 self._size = size 

1613 self._header_size = 12 

1614 self.delta_window_size = delta_window_size 

1615 self.window_memory = window_memory 

1616 self.delta_cache_size = delta_cache_size 

1617 self.depth = depth 

1618 self.threads = threads 

1619 self.big_file_threshold = big_file_threshold 

1620 self._file: IO[bytes] 

1621 

1622 if file is None: 

1623 self._file = GitFile(self._filename, "rb") 

1624 else: 

1625 self._file = file 

1626 (_version, self._num_objects) = read_pack_header(self._file.read) 

1627 

1628 # Use delta_cache_size config if available, otherwise default 

1629 cache_size = delta_cache_size or (1024 * 1024 * 20) 

1630 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]]( 

1631 cache_size, compute_size=_compute_object_size 

1632 ) 

1633 

1634 @property 

1635 def filename(self) -> str: 

1636 """Get the filename of the pack file. 

1637 

1638 Returns: 

1639 Base filename without directory path 

1640 """ 

1641 return os.path.basename(self._filename) 

1642 

1643 @property 

1644 def path(self) -> str | os.PathLike[str]: 

1645 """Get the full path of the pack file. 

1646 

1647 Returns: 

1648 Full path to the pack file 

1649 """ 

1650 return self._filename 

1651 

1652 @classmethod 

1653 def from_file(cls, file: IO[bytes], size: int | None = None) -> "PackData": 

1654 """Create a PackData object from an open file. 

1655 

1656 Args: 

1657 file: Open file object 

1658 size: Optional file size 

1659 

1660 Returns: 

1661 PackData instance 

1662 """ 

1663 return cls(str(file), file=file, size=size) 

1664 

1665 @classmethod 

1666 def from_path(cls, path: str | os.PathLike[str]) -> "PackData": 

1667 """Create a PackData object from a file path. 

1668 

1669 Args: 

1670 path: Path to the pack file 

1671 

1672 Returns: 

1673 PackData instance 

1674 """ 

1675 return cls(filename=path) 

1676 

1677 def close(self) -> None: 

1678 """Close the underlying pack file.""" 

1679 self._file.close() 

1680 

1681 def __enter__(self) -> "PackData": 

1682 """Enter context manager.""" 

1683 return self 

1684 

1685 def __exit__( 

1686 self, 

1687 exc_type: type | None, 

1688 exc_val: BaseException | None, 

1689 exc_tb: TracebackType | None, 

1690 ) -> None: 

1691 """Exit context manager.""" 

1692 self.close() 

1693 

1694 def __eq__(self, other: object) -> bool: 

1695 """Check equality with another object.""" 

1696 if isinstance(other, PackData): 

1697 return self.get_stored_checksum() == other.get_stored_checksum() 

1698 return False 

1699 

1700 def _get_size(self) -> int: 

1701 if self._size is not None: 

1702 return self._size 

1703 self._size = os.path.getsize(self._filename) 

1704 if self._size < self._header_size: 

1705 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})" 

1706 raise AssertionError(errmsg) 

1707 return self._size 

1708 

1709 def __len__(self) -> int: 

1710 """Returns the number of objects in this pack.""" 

1711 return self._num_objects 

1712 

1713 def calculate_checksum(self) -> bytes: 

1714 """Calculate the checksum for this pack. 

1715 

1716 Returns: 20-byte binary SHA1 digest 

1717 """ 

1718 return compute_file_sha(self._file, end_ofs=-20).digest() 

1719 

1720 def iter_unpacked(self, *, include_comp: bool = False) -> Iterator[UnpackedObject]: 

1721 """Iterate over unpacked objects in the pack.""" 

1722 self._file.seek(self._header_size) 

1723 

1724 if self._num_objects is None: 

1725 return 

1726 

1727 for _ in range(self._num_objects): 

1728 offset = self._file.tell() 

1729 unpacked, unused = unpack_object( 

1730 self._file.read, compute_crc32=False, include_comp=include_comp 

1731 ) 

1732 unpacked.offset = offset 

1733 yield unpacked 

1734 # Back up over unused data. 

1735 self._file.seek(-len(unused), SEEK_CUR) 

1736 

1737 def iterentries( 

1738 self, 

1739 progress: Callable[[int, int], None] | None = None, 

1740 resolve_ext_ref: ResolveExtRefFn | None = None, 

1741 ) -> Iterator[PackIndexEntry]: 

1742 """Yield entries summarizing the contents of this pack. 

1743 

1744 Args: 

1745 progress: Progress function, called with current and total 

1746 object count. 

1747 resolve_ext_ref: Optional function to resolve external references 

1748 Returns: iterator of tuples with (sha, offset, crc32) 

1749 """ 

1750 num_objects = self._num_objects 

1751 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref) 

1752 for i, result in enumerate(indexer): 

1753 if progress is not None: 

1754 progress(i, num_objects) 

1755 yield result 

1756 

1757 def sorted_entries( 

1758 self, 

1759 progress: ProgressFn | None = None, 

1760 resolve_ext_ref: ResolveExtRefFn | None = None, 

1761 ) -> list[tuple[RawObjectID, int, int]]: 

1762 """Return entries in this pack, sorted by SHA. 

1763 

1764 Args: 

1765 progress: Progress function, called with current and total 

1766 object count 

1767 resolve_ext_ref: Optional function to resolve external references 

1768 Returns: List of tuples with (sha, offset, crc32), sorted by SHA 

1769 """ 

1770 return sorted( 

1771 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref) # type: ignore 

1772 ) 

1773 

1774 def create_index_v1( 

1775 self, 

1776 filename: str, 

1777 progress: Callable[..., None] | None = None, 

1778 resolve_ext_ref: ResolveExtRefFn | None = None, 

1779 ) -> bytes: 

1780 """Create a version 1 file for this data file. 

1781 

1782 Args: 

1783 filename: Index filename. 

1784 progress: Progress report function 

1785 resolve_ext_ref: Optional function to resolve external references 

1786 Returns: Checksum of index file 

1787 """ 

1788 entries = self.sorted_entries( 

1789 progress=progress, resolve_ext_ref=resolve_ext_ref 

1790 ) 

1791 checksum = self.calculate_checksum() 

1792 with GitFile(filename, "wb") as f: 

1793 write_pack_index_v1( 

1794 f, 

1795 entries, 

1796 checksum, 

1797 ) 

1798 return checksum 

1799 

1800 def create_index_v2( 

1801 self, 

1802 filename: str, 

1803 progress: Callable[..., None] | None = None, 

1804 resolve_ext_ref: ResolveExtRefFn | None = None, 

1805 ) -> bytes: 

1806 """Create a version 2 index file for this data file. 

1807 

1808 Args: 

1809 filename: Index filename. 

1810 progress: Progress report function 

1811 resolve_ext_ref: Optional function to resolve external references 

1812 Returns: Checksum of index file 

1813 """ 

1814 entries = self.sorted_entries( 

1815 progress=progress, resolve_ext_ref=resolve_ext_ref 

1816 ) 

1817 with GitFile(filename, "wb") as f: 

1818 return write_pack_index_v2(f, entries, self.calculate_checksum()) 

1819 

1820 def create_index_v3( 

1821 self, 

1822 filename: str, 

1823 progress: Callable[..., None] | None = None, 

1824 resolve_ext_ref: ResolveExtRefFn | None = None, 

1825 hash_algorithm: int = 1, 

1826 ) -> bytes: 

1827 """Create a version 3 index file for this data file. 

1828 

1829 Args: 

1830 filename: Index filename. 

1831 progress: Progress report function 

1832 resolve_ext_ref: Function to resolve external references 

1833 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256) 

1834 Returns: Checksum of index file 

1835 """ 

1836 entries = self.sorted_entries( 

1837 progress=progress, resolve_ext_ref=resolve_ext_ref 

1838 ) 

1839 with GitFile(filename, "wb") as f: 

1840 return write_pack_index_v3( 

1841 f, entries, self.calculate_checksum(), hash_algorithm 

1842 ) 

1843 

1844 def create_index( 

1845 self, 

1846 filename: str, 

1847 progress: Callable[..., None] | None = None, 

1848 version: int = 2, 

1849 resolve_ext_ref: ResolveExtRefFn | None = None, 

1850 hash_algorithm: int = 1, 

1851 ) -> bytes: 

1852 """Create an index file for this data file. 

1853 

1854 Args: 

1855 filename: Index filename. 

1856 progress: Progress report function 

1857 version: Index version (1, 2, or 3) 

1858 resolve_ext_ref: Function to resolve external references 

1859 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256) 

1860 Returns: Checksum of index file 

1861 """ 

1862 if version == 1: 

1863 return self.create_index_v1( 

1864 filename, progress, resolve_ext_ref=resolve_ext_ref 

1865 ) 

1866 elif version == 2: 

1867 return self.create_index_v2( 

1868 filename, progress, resolve_ext_ref=resolve_ext_ref 

1869 ) 

1870 elif version == 3: 

1871 return self.create_index_v3( 

1872 filename, 

1873 progress, 

1874 resolve_ext_ref=resolve_ext_ref, 

1875 hash_algorithm=hash_algorithm, 

1876 ) 

1877 else: 

1878 raise ValueError(f"unknown index format {version}") 

1879 

1880 def get_stored_checksum(self) -> bytes: 

1881 """Return the expected checksum stored in this pack.""" 

1882 self._file.seek(-20, SEEK_END) 

1883 return self._file.read(20) 

1884 

1885 def check(self) -> None: 

1886 """Check the consistency of this pack.""" 

1887 actual = self.calculate_checksum() 

1888 stored = self.get_stored_checksum() 

1889 if actual != stored: 

1890 raise ChecksumMismatch(stored, actual) 

1891 

1892 def get_unpacked_object_at( 

1893 self, offset: int, *, include_comp: bool = False 

1894 ) -> UnpackedObject: 

1895 """Given offset in the packfile return a UnpackedObject.""" 

1896 assert offset >= self._header_size 

1897 self._file.seek(offset) 

1898 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp) 

1899 unpacked.offset = offset 

1900 return unpacked 

1901 

1902 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]: 

1903 """Given an offset in to the packfile return the object that is there. 

1904 

1905 Using the associated index the location of an object can be looked up, 

1906 and then the packfile can be asked directly for that object using this 

1907 function. 

1908 """ 

1909 try: 

1910 return self._offset_cache[offset] 

1911 except KeyError: 

1912 pass 

1913 unpacked = self.get_unpacked_object_at(offset, include_comp=False) 

1914 return (unpacked.pack_type_num, unpacked._obj()) 

1915 

1916 
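# Editorial usage sketch (not part of the dulwich source): open a pack data
# file, verify its trailing checksum, and write a v2 index beside it. The
# pack path used here is purely illustrative.
from dulwich.pack import PackData

with PackData("objects/pack/pack-1234.pack") as data:
    data.check()  # compares the computed SHA-1 with the stored trailer
    print(len(data), "objects in pack")
    data.create_index("objects/pack/pack-1234.idx", version=2)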

1917T = TypeVar("T") 

1918 

1919 

1920class DeltaChainIterator(Generic[T]): 

1921 """Abstract iterator over pack data based on delta chains. 

1922 

1923 Each object in the pack is guaranteed to be inflated exactly once, 

1924 regardless of how many objects reference it as a delta base. As a result, 

1925 memory usage is proportional to the length of the longest delta chain. 

1926 

1927 Subclasses can override _result to define the result type of the iterator. 

1928 By default, results are UnpackedObjects with the following members set: 

1929 

1930 * offset 

1931 * obj_type_num 

1932 * obj_chunks 

1933 * pack_type_num 

1934 * delta_base (for delta types) 

1935 * comp_chunks (if _include_comp is True) 

1936 * decomp_chunks 

1937 * decomp_len 

1938 * crc32 (if _compute_crc32 is True) 

1939 """ 

1940 

1941 _compute_crc32 = False 

1942 _include_comp = False 

1943 

1944 def __init__( 

1945 self, 

1946 file_obj: IO[bytes] | None, 

1947 *, 

1948 resolve_ext_ref: ResolveExtRefFn | None = None, 

1949 ) -> None: 

1950 """Initialize DeltaChainIterator. 

1951 

1952 Args: 

1953 file_obj: File object to read pack data from 

1954 resolve_ext_ref: Optional function to resolve external references 

1955 """ 

1956 self._file = file_obj 

1957 self._resolve_ext_ref = resolve_ext_ref 

1958 self._pending_ofs: dict[int, list[int]] = defaultdict(list) 

1959 self._pending_ref: dict[bytes, list[int]] = defaultdict(list) 

1960 self._full_ofs: list[tuple[int, int]] = [] 

1961 self._ext_refs: list[RawObjectID] = [] 

1962 

1963 @classmethod 

1964 def for_pack_data( 

1965 cls, pack_data: PackData, resolve_ext_ref: ResolveExtRefFn | None = None 

1966 ) -> "DeltaChainIterator[T]": 

1967 """Create a DeltaChainIterator from pack data. 

1968 

1969 Args: 

1970 pack_data: PackData object to iterate 

1971 resolve_ext_ref: Optional function to resolve external refs 

1972 

1973 Returns: 

1974 DeltaChainIterator instance 

1975 """ 

1976 walker = cls(None, resolve_ext_ref=resolve_ext_ref) 

1977 walker.set_pack_data(pack_data) 

1978 for unpacked in pack_data.iter_unpacked(include_comp=False): 

1979 walker.record(unpacked) 

1980 return walker 

1981 

1982 @classmethod 

1983 def for_pack_subset( 

1984 cls, 

1985 pack: "Pack", 

1986 shas: Iterable[ObjectID | RawObjectID], 

1987 *, 

1988 allow_missing: bool = False, 

1989 resolve_ext_ref: ResolveExtRefFn | None = None, 

1990 ) -> "DeltaChainIterator[T]": 

1991 """Create a DeltaChainIterator for a subset of objects. 

1992 

1993 Args: 

1994 pack: Pack object containing the data 

1995 shas: Iterable of object SHAs to include 

1996 allow_missing: If True, skip missing objects 

1997 resolve_ext_ref: Optional function to resolve external refs 

1998 

1999 Returns: 

2000 DeltaChainIterator instance 

2001 """ 

2002 walker = cls(None, resolve_ext_ref=resolve_ext_ref) 

2003 walker.set_pack_data(pack.data) 

2004 todo = set() 

2005 for sha in shas: 

2006 try: 

2007 off = pack.index.object_offset(sha) 

2008 except KeyError: 

2009 if not allow_missing: 

2010 raise 

2011 else: 

2012 todo.add(off) 

2013 done = set() 

2014 while todo: 

2015 off = todo.pop() 

2016 unpacked = pack.data.get_unpacked_object_at(off) 

2017 walker.record(unpacked) 

2018 done.add(off) 

2019 base_ofs = None 

2020 if unpacked.pack_type_num == OFS_DELTA: 

2021 assert unpacked.offset is not None 

2022 assert unpacked.delta_base is not None 

2023 assert isinstance(unpacked.delta_base, int) 

2024 base_ofs = unpacked.offset - unpacked.delta_base 

2025 elif unpacked.pack_type_num == REF_DELTA: 

2026 with suppress(KeyError): 

2027 assert isinstance(unpacked.delta_base, bytes) 

2028 base_ofs = pack.index.object_index(RawObjectID(unpacked.delta_base)) 

2029 if base_ofs is not None and base_ofs not in done: 

2030 todo.add(base_ofs) 

2031 return walker 

2032 

2033 def record(self, unpacked: UnpackedObject) -> None: 

2034 """Record an unpacked object for later processing. 

2035 

2036 Args: 

2037 unpacked: UnpackedObject to record 

2038 """ 

2039 type_num = unpacked.pack_type_num 

2040 offset = unpacked.offset 

2041 assert offset is not None 

2042 if type_num == OFS_DELTA: 

2043 assert unpacked.delta_base is not None 

2044 assert isinstance(unpacked.delta_base, int) 

2045 base_offset = offset - unpacked.delta_base 

2046 self._pending_ofs[base_offset].append(offset) 

2047 elif type_num == REF_DELTA: 

2048 assert isinstance(unpacked.delta_base, bytes) 

2049 self._pending_ref[unpacked.delta_base].append(offset) 

2050 else: 

2051 self._full_ofs.append((offset, type_num)) 

2052 

2053 def set_pack_data(self, pack_data: PackData) -> None: 

2054 """Set the pack data for iteration. 

2055 

2056 Args: 

2057 pack_data: PackData object to use 

2058 """ 

2059 self._file = pack_data._file 

2060 

2061 def _walk_all_chains(self) -> Iterator[T]: 

2062 for offset, type_num in self._full_ofs: 

2063 yield from self._follow_chain(offset, type_num, None) 

2064 yield from self._walk_ref_chains() 

2065 assert not self._pending_ofs, repr(self._pending_ofs) 

2066 

2067 def _ensure_no_pending(self) -> None: 

2068 if self._pending_ref: 

2069 raise UnresolvedDeltas( 

2070 [sha_to_hex(RawObjectID(s)) for s in self._pending_ref] 

2071 ) 

2072 

2073 def _walk_ref_chains(self) -> Iterator[T]: 

2074 if not self._resolve_ext_ref: 

2075 self._ensure_no_pending() 

2076 return 

2077 

2078 for base_sha, pending in sorted(self._pending_ref.items()): 

2079 if base_sha not in self._pending_ref: 

2080 continue 

2081 try: 

2082 type_num, chunks = self._resolve_ext_ref(base_sha) 

2083 except KeyError: 

2084 # Not an external ref, but may depend on one. Either it will 

2085 # get popped via a _follow_chain call, or we will raise an 

2086 # error below. 

2087 continue 

2088 self._ext_refs.append(RawObjectID(base_sha)) 

2089 self._pending_ref.pop(base_sha) 

2090 for new_offset in pending: 

2091 yield from self._follow_chain(new_offset, type_num, chunks) # type: ignore[arg-type] 

2092 

2093 self._ensure_no_pending() 

2094 

2095 def _result(self, unpacked: UnpackedObject) -> T: 

2096 raise NotImplementedError 

2097 

2098 def _resolve_object( 

2099 self, offset: int, obj_type_num: int, base_chunks: list[bytes] | None 

2100 ) -> UnpackedObject: 

2101 assert self._file is not None 

2102 self._file.seek(offset) 

2103 unpacked, _ = unpack_object( 

2104 self._file.read, 

2105 include_comp=self._include_comp, 

2106 compute_crc32=self._compute_crc32, 

2107 ) 

2108 unpacked.offset = offset 

2109 if base_chunks is None: 

2110 assert unpacked.pack_type_num == obj_type_num 

2111 else: 

2112 assert unpacked.pack_type_num in DELTA_TYPES 

2113 unpacked.obj_type_num = obj_type_num 

2114 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks) 

2115 return unpacked 

2116 

2117 def _follow_chain( 

2118 self, offset: int, obj_type_num: int, base_chunks: list[bytes] | None 

2119 ) -> Iterator[T]: 

2120 # Unlike PackData.get_object_at, there is no need to cache offsets as 

2121 # this approach by design inflates each object exactly once. 

2122 todo = [(offset, obj_type_num, base_chunks)] 

2123 while todo: 

2124 (offset, obj_type_num, base_chunks) = todo.pop() 

2125 unpacked = self._resolve_object(offset, obj_type_num, base_chunks) 

2126 yield self._result(unpacked) 

2127 

2128 assert unpacked.offset is not None 

2129 unblocked = chain( 

2130 self._pending_ofs.pop(unpacked.offset, []), 

2131 self._pending_ref.pop(unpacked.sha(), []), 

2132 ) 

2133 todo.extend( 

2134 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore 

2135 for new_offset in unblocked 

2136 ) 

2137 

2138 def __iter__(self) -> Iterator[T]: 

2139 """Iterate over objects in the pack.""" 

2140 return self._walk_all_chains() 

2141 

2142 def ext_refs(self) -> list[RawObjectID]: 

2143 """Return external references.""" 

2144 return self._ext_refs 

2145 

2146 
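# Editorial usage sketch (not part of the dulwich source): PackIndexer below
# is the DeltaChainIterator subclass behind PackData.iterentries(); driving
# it directly yields (sha, offset, crc32) tuples for a self-contained pack.
# The pack path is illustrative.
from dulwich.pack import PackData, PackIndexer

with PackData("pack-1234.pack") as data:
    for sha, offset, crc32 in PackIndexer.for_pack_data(data):
        print(sha.hex(), offset, crc32)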

2147class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]): 

2148 """Delta chain iterator that yield unpacked objects.""" 

2149 

2150 def _result(self, unpacked: UnpackedObject) -> UnpackedObject: 

2151 """Return the unpacked object. 

2152 

2153 Args: 

2154 unpacked: The unpacked object 

2155 

2156 Returns: 

2157 The unpacked object unchanged 

2158 """ 

2159 return unpacked 

2160 

2161 

2162class PackIndexer(DeltaChainIterator[PackIndexEntry]): 

2163 """Delta chain iterator that yields index entries.""" 

2164 

2165 _compute_crc32 = True 

2166 

2167 def _result(self, unpacked: UnpackedObject) -> PackIndexEntry: 

2168 """Convert unpacked object to pack index entry. 

2169 

2170 Args: 

2171 unpacked: The unpacked object 

2172 

2173 Returns: 

2174 Tuple of (sha, offset, crc32) for index entry 

2175 """ 

2176 assert unpacked.offset is not None 

2177 return unpacked.sha(), unpacked.offset, unpacked.crc32 

2178 

2179 

2180class PackInflater(DeltaChainIterator[ShaFile]): 

2181 """Delta chain iterator that yields ShaFile objects.""" 

2182 

2183 def _result(self, unpacked: UnpackedObject) -> ShaFile: 

2184 """Convert unpacked object to ShaFile. 

2185 

2186 Args: 

2187 unpacked: The unpacked object 

2188 

2189 Returns: 

2190 ShaFile object from the unpacked data 

2191 """ 

2192 return unpacked.sha_file() 

2193 

2194 

2195class SHA1Reader(BinaryIO): 

2196 """Wrapper for file-like object that remembers the SHA1 of its data.""" 

2197 

2198 def __init__(self, f: IO[bytes]) -> None: 

2199 """Initialize SHA1Reader. 

2200 

2201 Args: 

2202 f: File-like object to wrap 

2203 """ 

2204 self.f = f 

2205 self.sha1 = sha1(b"") 

2206 

2207 def read(self, size: int = -1) -> bytes: 

2208 """Read bytes and update SHA1. 

2209 

2210 Args: 

2211 size: Number of bytes to read, -1 for all 

2212 

2213 Returns: 

2214 Bytes read from file 

2215 """ 

2216 data = self.f.read(size) 

2217 self.sha1.update(data) 

2218 return data 

2219 

2220 def check_sha(self, allow_empty: bool = False) -> None: 

2221 """Check if the SHA1 matches the expected value. 

2222 

2223 Args: 

2224 allow_empty: Allow empty SHA1 hash 

2225 

2226 Raises: 

2227 ChecksumMismatch: If SHA1 doesn't match 

2228 """ 

2229 stored = self.f.read(20) 

2230 # If git option index.skipHash is set the index will be empty 

2231 if stored != self.sha1.digest() and ( 

2232 not allow_empty 

2233 or sha_to_hex(RawObjectID(stored)) 

2234 != b"0000000000000000000000000000000000000000" 

2235 ): 

2236 raise ChecksumMismatch( 

2237 self.sha1.hexdigest(), sha_to_hex(RawObjectID(stored)) 

2238 ) 

2239 

2240 def close(self) -> None: 

2241 """Close the underlying file.""" 

2242 return self.f.close() 

2243 

2244 def tell(self) -> int: 

2245 """Return current file position.""" 

2246 return self.f.tell() 

2247 

2248 # BinaryIO abstract methods 

2249 def readable(self) -> bool: 

2250 """Check if file is readable.""" 

2251 return True 

2252 

2253 def writable(self) -> bool: 

2254 """Check if file is writable.""" 

2255 return False 

2256 

2257 def seekable(self) -> bool: 

2258 """Check if file is seekable.""" 

2259 return getattr(self.f, "seekable", lambda: False)() 

2260 

2261 def seek(self, offset: int, whence: int = 0) -> int: 

2262 """Seek to position in file. 

2263 

2264 Args: 

2265 offset: Position offset 

2266 whence: Reference point (0=start, 1=current, 2=end) 

2267 

2268 Returns: 

2269 New file position 

2270 """ 

2271 return self.f.seek(offset, whence) 

2272 

2273 def flush(self) -> None: 

2274 """Flush the file buffer.""" 

2275 if hasattr(self.f, "flush"): 

2276 self.f.flush() 

2277 

2278 def readline(self, size: int = -1) -> bytes: 

2279 """Read a line from the file. 

2280 

2281 Args: 

2282 size: Maximum bytes to read 

2283 

2284 Returns: 

2285 Line read from file 

2286 """ 

2287 return self.f.readline(size) 

2288 

2289 def readlines(self, hint: int = -1) -> list[bytes]: 

2290 """Read all lines from the file. 

2291 

2292 Args: 

2293 hint: Approximate number of bytes to read 

2294 

2295 Returns: 

2296 List of lines 

2297 """ 

2298 return self.f.readlines(hint) 

2299 

2300 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override] 

2301 """Write multiple lines to the file (not supported).""" 

2302 raise UnsupportedOperation("writelines") 

2303 

2304 def write(self, data: bytes, /) -> int: # type: ignore[override] 

2305 """Write data to the file (not supported).""" 

2306 raise UnsupportedOperation("write") 

2307 

2308 def __enter__(self) -> "SHA1Reader": 

2309 """Enter context manager.""" 

2310 return self 

2311 

2312 def __exit__( 

2313 self, 

2314 type: type | None, 

2315 value: BaseException | None, 

2316 traceback: TracebackType | None, 

2317 ) -> None: 

2318 """Exit context manager and close file.""" 

2319 self.close() 

2320 

2321 def __iter__(self) -> "SHA1Reader": 

2322 """Return iterator for reading file lines.""" 

2323 return self 

2324 

2325 def __next__(self) -> bytes: 

2326 """Get next line from file. 

2327 

2328 Returns: 

2329 Next line 

2330 

2331 Raises: 

2332 StopIteration: When no more lines 

2333 """ 

2334 line = self.readline() 

2335 if not line: 

2336 raise StopIteration 

2337 return line 

2338 

2339 def fileno(self) -> int: 

2340 """Return file descriptor number.""" 

2341 return self.f.fileno() 

2342 

2343 def isatty(self) -> bool: 

2344 """Check if file is a terminal.""" 

2345 return getattr(self.f, "isatty", lambda: False)() 

2346 

2347 def truncate(self, size: int | None = None) -> int: 

2348 """Not supported for read-only file. 

2349 

2350 Raises: 

2351 UnsupportedOperation: Always raised 

2352 """ 

2353 raise UnsupportedOperation("truncate") 

2354 

2355 
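# Editorial usage sketch (not part of the dulwich source): SHA1Reader wraps a
# file whose last 20 bytes are the SHA-1 of everything preceding them, as in
# pack and index files; check_sha() verifies that trailer. The path is
# illustrative.
import os

from dulwich.pack import SHA1Reader

path = "pack-1234.idx"
with open(path, "rb") as f:
    reader = SHA1Reader(f)
    reader.read(os.path.getsize(path) - 20)  # hash everything but the trailer
    reader.check_sha()  # raises ChecksumMismatch if the trailer does not match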

2356class SHA1Writer(BinaryIO): 

2357 """Wrapper for file-like object that remembers the SHA1 of its data.""" 

2358 

2359 def __init__(self, f: BinaryIO | IO[bytes]) -> None: 

2360 """Initialize SHA1Writer. 

2361 

2362 Args: 

2363 f: File-like object to wrap 

2364 """ 

2365 self.f = f 

2366 self.length = 0 

2367 self.sha1 = sha1(b"") 

2368 self.digest: bytes | None = None 

2369 

2370 def write(self, data: bytes | bytearray | memoryview, /) -> int: # type: ignore[override] 

2371 """Write data and update SHA1. 

2372 

2373 Args: 

2374 data: Data to write 

2375 

2376 Returns: 

2377 Number of bytes written 

2378 """ 

2379 self.sha1.update(data) 

2380 written = self.f.write(data) 

2381 self.length += written 

2382 return written 

2383 

2384 def write_sha(self) -> bytes: 

2385 """Write the SHA1 digest to the file. 

2386 

2387 Returns: 

2388 The SHA1 digest bytes 

2389 """ 

2390 sha = self.sha1.digest() 

2391 assert len(sha) == 20 

2392 self.f.write(sha) 

2393 self.length += len(sha) 

2394 return sha 

2395 

2396 def close(self) -> None: 

2397 """Close the pack file and finalize the SHA.""" 

2398 self.digest = self.write_sha() 

2399 self.f.close() 

2400 

2401 def offset(self) -> int: 

2402 """Get the total number of bytes written. 

2403 

2404 Returns: 

2405 Total bytes written 

2406 """ 

2407 return self.length 

2408 

2409 def tell(self) -> int: 

2410 """Return current file position.""" 

2411 return self.f.tell() 

2412 

2413 # BinaryIO abstract methods 

2414 def readable(self) -> bool: 

2415 """Check if file is readable.""" 

2416 return False 

2417 

2418 def writable(self) -> bool: 

2419 """Check if file is writable.""" 

2420 return True 

2421 

2422 def seekable(self) -> bool: 

2423 """Check if file is seekable.""" 

2424 return getattr(self.f, "seekable", lambda: False)() 

2425 

2426 def seek(self, offset: int, whence: int = 0) -> int: 

2427 """Seek to position in file. 

2428 

2429 Args: 

2430 offset: Position offset 

2431 whence: Reference point (0=start, 1=current, 2=end) 

2432 

2433 Returns: 

2434 New file position 

2435 """ 

2436 return self.f.seek(offset, whence) 

2437 

2438 def flush(self) -> None: 

2439 """Flush the file buffer.""" 

2440 if hasattr(self.f, "flush"): 

2441 self.f.flush() 

2442 

2443 def readline(self, size: int = -1) -> bytes: 

2444 """Not supported for write-only file. 

2445 

2446 Raises: 

2447 UnsupportedOperation: Always raised 

2448 """ 

2449 raise UnsupportedOperation("readline") 

2450 

2451 def readlines(self, hint: int = -1) -> list[bytes]: 

2452 """Not supported for write-only file. 

2453 

2454 Raises: 

2455 UnsupportedOperation: Always raised 

2456 """ 

2457 raise UnsupportedOperation("readlines") 

2458 

2459 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override] 

2460 """Write multiple lines to the file. 

2461 

2462 Args: 

2463 lines: Iterable of lines to write 

2464 """ 

2465 for line in lines: 

2466 self.write(line) 

2467 

2468 def read(self, size: int = -1) -> bytes: 

2469 """Not supported for write-only file. 

2470 

2471 Raises: 

2472 UnsupportedOperation: Always raised 

2473 """ 

2474 raise UnsupportedOperation("read") 

2475 

2476 def __enter__(self) -> "SHA1Writer": 

2477 """Enter context manager.""" 

2478 return self 

2479 

2480 def __exit__( 

2481 self, 

2482 type: type | None, 

2483 value: BaseException | None, 

2484 traceback: TracebackType | None, 

2485 ) -> None: 

2486 """Exit context manager and close file.""" 

2487 self.close() 

2488 

2489 def __iter__(self) -> "SHA1Writer": 

2490 """Return iterator.""" 

2491 return self 

2492 

2493 def __next__(self) -> bytes: 

2494 """Not supported for write-only file. 

2495 

2496 Raises: 

2497 UnsupportedOperation: Always raised 

2498 """ 

2499 raise UnsupportedOperation("__next__") 

2500 

2501 def fileno(self) -> int: 

2502 """Return file descriptor number.""" 

2503 return self.f.fileno() 

2504 

2505 def isatty(self) -> bool: 

2506 """Check if file is a terminal.""" 

2507 return getattr(self.f, "isatty", lambda: False)() 

2508 

2509 def truncate(self, size: int | None = None) -> int: 

2510 """Not supported for write-only file. 

2511 

2512 Raises: 

2513 UnsupportedOperation: Always raised 

2514 """ 

2515 raise UnsupportedOperation("truncate") 

2516 

2517 
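# Editorial usage sketch (not part of the dulwich source): SHA1Writer is what
# the pack-writing helpers wrap their output in, so the trailing checksum can
# be appended with write_sha(). The output filename is illustrative.
from dulwich.pack import SHA1Writer

with open("example.pack", "wb") as out:
    writer = SHA1Writer(out)
    writer.write(b"some payload")
    digest = writer.write_sha()  # appends and returns the 20-byte SHA-1
    print(digest.hex(), writer.offset())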

2518def pack_object_header( 

2519 type_num: int, delta_base: bytes | int | None, size: int 

2520) -> bytearray: 

2521 """Create a pack object header for the given object info. 

2522 

2523 Args: 

2524 type_num: Numeric type of the object. 

2525 delta_base: Delta base offset or ref, or None for whole objects. 

2526 size: Uncompressed object size. 

2527 Returns: A header for a packed object. 

2528 """ 

2529 header = [] 

2530 c = (type_num << 4) | (size & 15) 

2531 size >>= 4 

2532 while size: 

2533 header.append(c | 0x80) 

2534 c = size & 0x7F 

2535 size >>= 7 

2536 header.append(c) 

2537 if type_num == OFS_DELTA: 

2538 assert isinstance(delta_base, int) 

2539 ret = [delta_base & 0x7F] 

2540 delta_base >>= 7 

2541 while delta_base: 

2542 delta_base -= 1 

2543 ret.insert(0, 0x80 | (delta_base & 0x7F)) 

2544 delta_base >>= 7 

2545 header.extend(ret) 

2546 elif type_num == REF_DELTA: 

2547 assert isinstance(delta_base, bytes) 

2548 assert len(delta_base) == 20 

2549 header += delta_base 

2550 return bytearray(header) 

2551 
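# Editorial worked example (not part of the dulwich source): the header packs
# the 3-bit type next to the low 4 size bits, then continues the size as a
# 7-bits-per-byte varint. A blob (type 3) of 100 bytes therefore encodes as
# 0xb4 0x06.
from dulwich.pack import pack_object_header

assert bytes(pack_object_header(3, None, 100)) == b"\xb4\x06"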

2552 

2553def pack_object_chunks( 

2554 type: int, 

2555 object: list[bytes] | tuple[bytes | int, list[bytes]], 

2556 compression_level: int = -1, 

2557) -> Iterator[bytes]: 

2558 """Generate chunks for a pack object. 

2559 

2560 Args: 

2561 type: Numeric type of the object 

2562 object: Object to write 

2563 compression_level: the zlib compression level 

2564 Returns: Iterator of byte chunks (object header followed by zlib-compressed data) 

2565 """ 

2566 if type in DELTA_TYPES: 

2567 if isinstance(object, tuple): 

2568 delta_base, object = object 

2569 else: 

2570 raise TypeError("Delta types require a tuple of (delta_base, object)") 

2571 else: 

2572 delta_base = None 

2573 

2574 # Convert object to list of bytes chunks 

2575 if isinstance(object, bytes): 

2576 chunks = [object] 

2577 elif isinstance(object, list): 

2578 chunks = object 

2579 elif isinstance(object, ShaFile): 

2580 chunks = object.as_raw_chunks() 

2581 else: 

2582 # Shouldn't reach here with proper typing 

2583 raise TypeError(f"Unexpected object type: {object.__class__.__name__}") 

2584 

2585 yield bytes(pack_object_header(type, delta_base, sum(map(len, chunks)))) 

2586 compressor = zlib.compressobj(level=compression_level) 

2587 for data in chunks: 

2588 yield compressor.compress(data) 

2589 yield compressor.flush() 

2590 

2591 

2592def write_pack_object( 

2593 write: Callable[[bytes], int], 

2594 type: int, 

2595 object: list[bytes] | tuple[bytes | int, list[bytes]], 

2596 sha: "HashObject | None" = None, 

2597 compression_level: int = -1, 

2598) -> int: 

2599 """Write pack object to a file. 

2600 

2601 Args: 

2602 write: Write function to use 

2603 type: Numeric type of the object 

2604 object: Object to write 

2605 sha: Optional SHA-1 hasher to update 

2606 compression_level: the zlib compression level 

2607 Returns: CRC32 checksum of the written object 

2608 """ 

2609 crc32 = 0 

2610 for chunk in pack_object_chunks(type, object, compression_level=compression_level): 

2611 write(chunk) 

2612 if sha is not None: 

2613 sha.update(chunk) 

2614 crc32 = binascii.crc32(chunk, crc32) 

2615 return crc32 & 0xFFFFFFFF 

2616 

2617 

2618def write_pack( 

2619 filename: str, 

2620 objects: Sequence[ShaFile] | Sequence[tuple[ShaFile, bytes | None]], 

2621 *, 

2622 deltify: bool | None = None, 

2623 delta_window_size: int | None = None, 

2624 compression_level: int = -1, 

2625) -> tuple[bytes, bytes]: 

2626 """Write a new pack data file. 

2627 

2628 Args: 

2629 filename: Path to the new pack file (without .pack extension) 

2630 objects: Objects to write to the pack 

2631 delta_window_size: Delta window size 

2632 deltify: Whether to deltify pack objects 

2633 compression_level: the zlib compression level 

2634 Returns: Tuple with checksum of pack file and index file 

2635 """ 

2636 with GitFile(filename + ".pack", "wb") as f: 

2637 entries, data_sum = write_pack_objects( 

2638 f, 

2639 objects, 

2640 delta_window_size=delta_window_size, 

2641 deltify=deltify, 

2642 compression_level=compression_level, 

2643 ) 

2644 entries_list = sorted([(k, v[0], v[1]) for (k, v) in entries.items()]) 

2645 with GitFile(filename + ".idx", "wb") as f: 

2646 idx_sha = write_pack_index(f, entries_list, data_sum) 

2647 return data_sum, idx_sha 

2648 
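# Editorial usage sketch (not part of the dulwich source): write_pack() is the
# highest-level entry point here; given loose objects it produces a .pack/.idx
# pair. Blob comes from dulwich.objects; the basename "mypack" is illustrative.
from dulwich.objects import Blob
from dulwich.pack import write_pack

blobs = [Blob.from_string(b"hello"), Blob.from_string(b"world")]
pack_sha, idx_sha = write_pack("mypack", blobs)  # writes mypack.pack and mypack.idx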

2649 

2650def pack_header_chunks(num_objects: int) -> Iterator[bytes]: 

2651 """Yield chunks for a pack header.""" 

2652 yield b"PACK" # Pack header 

2653 yield struct.pack(b">L", 2) # Pack version 

2654 yield struct.pack(b">L", num_objects) # Number of objects in pack 

2655 

2656 

2657def write_pack_header( 

2658 write: Callable[[bytes], int] | IO[bytes], num_objects: int 

2659) -> None: 

2660 """Write a pack header for the given number of objects.""" 

2661 write_fn: Callable[[bytes], int] 

2662 if hasattr(write, "write"): 

2663 write_fn = write.write 

2664 warnings.warn( 

2665 "write_pack_header() now takes a write rather than file argument", 

2666 DeprecationWarning, 

2667 stacklevel=2, 

2668 ) 

2669 else: 

2670 write_fn = write 

2671 for chunk in pack_header_chunks(num_objects): 

2672 write_fn(chunk) 

2673 

2674 

2675def find_reusable_deltas( 

2676 container: PackedObjectContainer, 

2677 object_ids: Set[ObjectID], 

2678 *, 

2679 other_haves: Set[ObjectID] | None = None, 

2680 progress: Callable[..., None] | None = None, 

2681) -> Iterator[UnpackedObject]: 

2682 """Find deltas in a pack that can be reused. 

2683 

2684 Args: 

2685 container: Pack container to search for deltas 

2686 object_ids: Set of object IDs to find deltas for 

2687 other_haves: Set of other object IDs we have 

2688 progress: Optional progress reporting callback 

2689 

2690 Returns: 

2691 Iterator of UnpackedObject entries that can be reused 

2692 """ 

2693 if other_haves is None: 

2694 other_haves = set() 

2695 reused = 0 

2696 for i, unpacked in enumerate( 

2697 container.iter_unpacked_subset( 

2698 object_ids, allow_missing=True, convert_ofs_delta=True 

2699 ) 

2700 ): 

2701 if progress is not None and i % 1000 == 0: 

2702 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode()) 

2703 if unpacked.pack_type_num == REF_DELTA: 

2704 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore 

2705 if hexsha in object_ids or hexsha in other_haves: 

2706 yield unpacked 

2707 reused += 1 

2708 if progress is not None: 

2709 progress((f"found {reused} deltas to reuse\n").encode()) 

2710 

2711 

2712def deltify_pack_objects( 

2713 objects: Iterator[ShaFile] | Iterator[tuple[ShaFile, bytes | None]], 

2714 *, 

2715 window_size: int | None = None, 

2716 progress: Callable[..., None] | None = None, 

2717) -> Iterator[UnpackedObject]: 

2718 """Generate deltas for pack objects. 

2719 

2720 Args: 

2721 objects: An iterable of objects, or of (object, path) tuples, to deltify. 

2722 window_size: Window size; None for default 

2723 progress: Optional progress reporting callback 

2724 Returns: Iterator of UnpackedObject entries; delta_base is None for 

2725 full-text (non-delta) entries 

2726 """ 

2727 

2728 def objects_with_hints() -> Iterator[tuple[ShaFile, tuple[int, bytes | None]]]: 

2729 for e in objects: 

2730 if isinstance(e, ShaFile): 

2731 yield (e, (e.type_num, None)) 

2732 else: 

2733 yield (e[0], (e[0].type_num, e[1])) 

2734 

2735 sorted_objs = sort_objects_for_delta(objects_with_hints()) 

2736 yield from deltas_from_sorted_objects( 

2737 sorted_objs, 

2738 window_size=window_size, 

2739 progress=progress, 

2740 ) 

2741 

2742 

2743def sort_objects_for_delta( 

2744 objects: Iterator[ShaFile] | Iterator[tuple[ShaFile, PackHint | None]], 

2745) -> Iterator[tuple[ShaFile, bytes | None]]: 

2746 """Sort objects for optimal delta compression. 

2747 

2748 Args: 

2749 objects: Iterator of objects or (object, hint) tuples 

2750 

2751 Returns: 

2752 Iterator of sorted (ShaFile, path) tuples 

2753 """ 

2754 magic = [] 

2755 for entry in objects: 

2756 if isinstance(entry, tuple): 

2757 obj, hint = entry 

2758 if hint is None: 

2759 type_num = None 

2760 path = None 

2761 else: 

2762 (type_num, path) = hint 

2763 else: 

2764 obj = entry 

2765 type_num = None 

2766 path = None 

2767 magic.append((type_num, path, -obj.raw_length(), obj)) 

2768 # Build a list of objects ordered by the magic Linus heuristic. 

2769 # This helps us find good candidate objects to delta against. 

2770 magic.sort() 

2771 return ((x[3], x[1]) for x in magic) 

2772 

2773 

2774def deltas_from_sorted_objects( 

2775 objects: Iterator[tuple[ShaFile, bytes | None]], 

2776 window_size: int | None = None, 

2777 progress: Callable[..., None] | None = None, 

2778) -> Iterator[UnpackedObject]: 

2779 """Create deltas from sorted objects. 

2780 

2781 Args: 

2782 objects: Iterator of sorted objects to deltify 

2783 window_size: Delta window size; None for default 

2784 progress: Optional progress reporting callback 

2785 

2786 Returns: 

2787 Iterator of UnpackedObject entries 

2788 """ 

2789 # TODO(jelmer): Use threads 

2790 if window_size is None: 

2791 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE 

2792 

2793 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque() 

2794 for i, (o, path) in enumerate(objects): 

2795 if progress is not None and i % 1000 == 0: 

2796 progress((f"generating deltas: {i}\r").encode()) 

2797 raw = o.as_raw_chunks() 

2798 winner = raw 

2799 winner_len = sum(map(len, winner)) 

2800 winner_base = None 

2801 for base_id, base_type_num, base in possible_bases: 

2802 if base_type_num != o.type_num: 

2803 continue 

2804 delta_len = 0 

2805 delta = [] 

2806 for chunk in create_delta(b"".join(base), b"".join(raw)): 

2807 delta_len += len(chunk) 

2808 if delta_len >= winner_len: 

2809 break 

2810 delta.append(chunk) 

2811 else: 

2812 winner_base = base_id 

2813 winner = delta 

2814 winner_len = sum(map(len, winner)) 

2815 yield UnpackedObject( 

2816 o.type_num, 

2817 sha=o.sha().digest(), 

2818 delta_base=winner_base, 

2819 decomp_len=winner_len, 

2820 decomp_chunks=winner, 

2821 ) 

2822 possible_bases.appendleft((o.sha().digest(), o.type_num, raw)) 

2823 while len(possible_bases) > window_size: 

2824 possible_bases.pop() 

2825 

2826 

2827def pack_objects_to_data( 

2828 objects: Sequence[ShaFile] 

2829 | Sequence[tuple[ShaFile, bytes | None]] 

2830 | Sequence[tuple[ShaFile, PackHint | None]], 

2831 *, 

2832 deltify: bool | None = None, 

2833 delta_window_size: int | None = None, 

2834 ofs_delta: bool = True, 

2835 progress: Callable[..., None] | None = None, 

2836) -> tuple[int, Iterator[UnpackedObject]]: 

2837 """Create pack data from objects. 

2838 

2839 Args: 

2840 objects: Pack objects 

2841 deltify: Whether to deltify pack objects 

2842 delta_window_size: Delta window size 

2843 ofs_delta: Whether to use offset deltas 

2844 progress: Optional progress reporting callback 

2845 Returns: Tuple of (number of objects, iterator of UnpackedObject entries) 

2846 """ 

2847 count = len(objects) 

2848 if deltify is None: 

2849 # PERFORMANCE/TODO(jelmer): This should be enabled but the python 

2850 # implementation is *much* too slow at the moment. 

2851 # Maybe consider enabling it just if the rust extension is available? 

2852 deltify = False 

2853 if deltify: 

2854 return ( 

2855 count, 

2856 deltify_pack_objects( 

2857 iter(objects), # type: ignore 

2858 window_size=delta_window_size, 

2859 progress=progress, 

2860 ), 

2861 ) 

2862 else: 

2863 

2864 def iter_without_path() -> Iterator[UnpackedObject]: 

2865 for o in objects: 

2866 if isinstance(o, tuple): 

2867 yield full_unpacked_object(o[0]) 

2868 else: 

2869 yield full_unpacked_object(o) 

2870 

2871 return (count, iter_without_path()) 

2872 

2873 

2874def generate_unpacked_objects( 

2875 container: PackedObjectContainer, 

2876 object_ids: Sequence[tuple[ObjectID, PackHint | None]], 

2877 delta_window_size: int | None = None, 

2878 deltify: bool | None = None, 

2879 reuse_deltas: bool = True, 

2880 ofs_delta: bool = True, 

2881 other_haves: set[ObjectID] | None = None, 

2882 progress: Callable[..., None] | None = None, 

2883) -> Iterator[UnpackedObject]: 

2884 """Create pack data from objects. 

2885 

2886 Returns: Iterator of UnpackedObject entries 

2887 """ 

2888 todo = dict(object_ids) 

2889 if reuse_deltas: 

2890 for unpack in find_reusable_deltas( 

2891 container, set(todo), other_haves=other_haves, progress=progress 

2892 ): 

2893 del todo[sha_to_hex(RawObjectID(unpack.sha()))] 

2894 yield unpack 

2895 if deltify is None: 

2896 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too 

2897 # slow at the moment. 

2898 deltify = False 

2899 if deltify: 

2900 objects_to_delta = container.iterobjects_subset( 

2901 todo.keys(), allow_missing=False 

2902 ) 

2903 sorted_objs = sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta) 

2904 yield from deltas_from_sorted_objects( 

2905 sorted_objs, 

2906 window_size=delta_window_size, 

2907 progress=progress, 

2908 ) 

2909 else: 

2910 for oid in todo: 

2911 yield full_unpacked_object(container[oid]) 

2912 

2913 

2914def full_unpacked_object(o: ShaFile) -> UnpackedObject: 

2915 """Create an UnpackedObject from a ShaFile. 

2916 

2917 Args: 

2918 o: ShaFile object to convert 

2919 

2920 Returns: 

2921 UnpackedObject with full object data 

2922 """ 

2923 return UnpackedObject( 

2924 o.type_num, 

2925 delta_base=None, 

2926 crc32=None, 

2927 decomp_chunks=o.as_raw_chunks(), 

2928 sha=o.sha().digest(), 

2929 ) 

2930 

2931 

2932def write_pack_from_container( 

2933 write: Callable[[bytes], None] 

2934 | Callable[[bytes | bytearray | memoryview], int] 

2935 | IO[bytes], 

2936 container: PackedObjectContainer, 

2937 object_ids: Sequence[tuple[ObjectID, PackHint | None]], 

2938 delta_window_size: int | None = None, 

2939 deltify: bool | None = None, 

2940 reuse_deltas: bool = True, 

2941 compression_level: int = -1, 

2942 other_haves: set[ObjectID] | None = None, 

2943) -> tuple[dict[bytes, tuple[int, int]], bytes]: 

2944 """Write a new pack data file. 

2945 

2946 Args: 

2947 write: write function to use 

2948 container: PackedObjectContainer 

2949 object_ids: Sequence of (object_id, hint) tuples to write 

2950 delta_window_size: Sliding window size for searching for deltas; 

2951 Set to None for default window size. 

2952 deltify: Whether to deltify objects 

2953 reuse_deltas: Whether to reuse existing deltas 

2954 compression_level: the zlib compression level to use 

2955 other_haves: Set of additional object IDs the receiver has 

2956 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2957 """ 

2958 pack_contents_count = len(object_ids) 

2959 pack_contents = generate_unpacked_objects( 

2960 container, 

2961 object_ids, 

2962 delta_window_size=delta_window_size, 

2963 deltify=deltify, 

2964 reuse_deltas=reuse_deltas, 

2965 other_haves=other_haves, 

2966 ) 

2967 

2968 return write_pack_data( 

2969 write, 

2970 pack_contents, 

2971 num_records=pack_contents_count, 

2972 compression_level=compression_level, 

2973 ) 

2974 

2975 

2976def write_pack_objects( 

2977 write: Callable[[bytes], None] | IO[bytes], 

2978 objects: Sequence[ShaFile] | Sequence[tuple[ShaFile, bytes | None]], 

2979 *, 

2980 delta_window_size: int | None = None, 

2981 deltify: bool | None = None, 

2982 compression_level: int = -1, 

2983) -> tuple[dict[bytes, tuple[int, int]], bytes]: 

2984 """Write a new pack data file. 

2985 

2986 Args: 

2987 write: write function to use 

2988 objects: Sequence of (object, path) tuples to write 

2989 delta_window_size: Sliding window size for searching for deltas; 

2990 Set to None for default window size. 

2991 deltify: Whether to deltify objects 

2992 compression_level: the zlib compression level to use 

2993 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2994 """ 

2995 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify) 

2996 

2997 return write_pack_data( 

2998 write, 

2999 pack_contents, 

3000 num_records=pack_contents_count, 

3001 compression_level=compression_level, 

3002 ) 

3003 

3004 

3005class PackChunkGenerator: 

3006 """Generator for pack data chunks.""" 

3007 

3008 def __init__( 

3009 self, 

3010 num_records: int | None = None, 

3011 records: Iterator[UnpackedObject] | None = None, 

3012 progress: Callable[..., None] | None = None, 

3013 compression_level: int = -1, 

3014 reuse_compressed: bool = True, 

3015 ) -> None: 

3016 """Initialize PackChunkGenerator. 

3017 

3018 Args: 

3019 num_records: Expected number of records 

3020 records: Iterator of pack records 

3021 progress: Optional progress callback 

3022 compression_level: Compression level (-1 for default) 

3023 reuse_compressed: Whether to reuse compressed chunks 

3024 """ 

3025 self.cs = sha1(b"") 

3026 self.entries: dict[bytes, tuple[int, int]] = {} 

3027 if records is None: 

3028 records = iter([]) # Empty iterator if None 

3029 self._it = self._pack_data_chunks( 

3030 records=records, 

3031 num_records=num_records, 

3032 progress=progress, 

3033 compression_level=compression_level, 

3034 reuse_compressed=reuse_compressed, 

3035 ) 

3036 

3037 def sha1digest(self) -> bytes: 

3038 """Return the SHA1 digest of the pack data.""" 

3039 return self.cs.digest() 

3040 

3041 def __iter__(self) -> Iterator[bytes]: 

3042 """Iterate over pack data chunks.""" 

3043 return self._it 

3044 

3045 def _pack_data_chunks( 

3046 self, 

3047 records: Iterator[UnpackedObject], 

3048 *, 

3049 num_records: int | None = None, 

3050 progress: Callable[..., None] | None = None, 

3051 compression_level: int = -1, 

3052 reuse_compressed: bool = True, 

3053 ) -> Iterator[bytes]: 

3054 """Iterate pack data file chunks. 

3055 

3056 Args: 

3057 records: Iterator over UnpackedObject 

3058 num_records: Number of records (defaults to len(records) if not specified) 

3059 progress: Function to report progress to 

3060 compression_level: the zlib compression level 

3061 reuse_compressed: Whether to reuse compressed chunks 

3062 Yields: Pack data chunks; per-object (offset, crc32) entries accumulate in self.entries 

3063 """ 

3064 # Write the pack 

3065 if num_records is None: 

3066 num_records = len(records) # type: ignore 

3067 offset = 0 

3068 for chunk in pack_header_chunks(num_records): 

3069 yield chunk 

3070 self.cs.update(chunk) 

3071 offset += len(chunk) 

3072 actual_num_records = 0 

3073 for i, unpacked in enumerate(records): 

3074 type_num = unpacked.pack_type_num 

3075 if progress is not None and i % 1000 == 0: 

3076 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii")) 

3077 raw: list[bytes] | tuple[int, list[bytes]] | tuple[bytes, list[bytes]] 

3078 if unpacked.delta_base is not None: 

3079 assert isinstance(unpacked.delta_base, bytes), ( 

3080 f"Expected bytes, got {type(unpacked.delta_base)}" 

3081 ) 

3082 try: 

3083 base_offset, _base_crc32 = self.entries[unpacked.delta_base] 

3084 except KeyError: 

3085 type_num = REF_DELTA 

3086 assert isinstance(unpacked.delta_base, bytes) 

3087 raw = (unpacked.delta_base, unpacked.decomp_chunks) 

3088 else: 

3089 type_num = OFS_DELTA 

3090 raw = (offset - base_offset, unpacked.decomp_chunks) 

3091 else: 

3092 raw = unpacked.decomp_chunks 

3093 chunks: list[bytes] | Iterator[bytes] 

3094 if unpacked.comp_chunks is not None and reuse_compressed: 

3095 chunks = unpacked.comp_chunks 

3096 else: 

3097 chunks = pack_object_chunks( 

3098 type_num, raw, compression_level=compression_level 

3099 ) 

3100 crc32 = 0 

3101 object_size = 0 

3102 for chunk in chunks: 

3103 yield chunk 

3104 crc32 = binascii.crc32(chunk, crc32) 

3105 self.cs.update(chunk) 

3106 object_size += len(chunk) 

3107 actual_num_records += 1 

3108 self.entries[unpacked.sha()] = (offset, crc32) 

3109 offset += object_size 

3110 if actual_num_records != num_records: 

3111 raise AssertionError( 

3112 f"actual records written differs: {actual_num_records} != {num_records}" 

3113 ) 

3114 

3115 yield self.cs.digest() 

3116 

3117 

3118def write_pack_data( 

3119 write: Callable[[bytes], None] 

3120 | Callable[[bytes | bytearray | memoryview], int] 

3121 | IO[bytes], 

3122 records: Iterator[UnpackedObject], 

3123 *, 

3124 num_records: int | None = None, 

3125 progress: Callable[..., None] | None = None, 

3126 compression_level: int = -1, 

3127) -> tuple[dict[bytes, tuple[int, int]], bytes]: 

3128 """Write a new pack data file. 

3129 

3130 Args: 

3131 write: Write function to use 

3132 num_records: Number of records (defaults to len(records) if None) 

3133 records: Iterator over UnpackedObject entries to write 

3134 progress: Function to report progress to 

3135 compression_level: the zlib compression level 

3136 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

3137 """ 

3138 chunk_generator = PackChunkGenerator( 

3139 num_records=num_records, 

3140 records=records, 

3141 progress=progress, 

3142 compression_level=compression_level, 

3143 ) 

3144 for chunk in chunk_generator: 

3145 if callable(write): 

3146 write(chunk) 

3147 else: 

3148 write.write(chunk) 

3149 return chunk_generator.entries, chunk_generator.sha1digest() 

3150 
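# Editorial usage sketch (not part of the dulwich source): write_pack_data()
# drives PackChunkGenerator against any write callable, so pack bytes can be
# collected in memory. Blob and the buffer are illustrative; this assumes
# UnpackedObject instances from full_unpacked_object() carry no pre-compressed
# chunks.
from io import BytesIO

from dulwich.objects import Blob
from dulwich.pack import full_unpacked_object, write_pack_data

buf = BytesIO()
records = [full_unpacked_object(Blob.from_string(b"hello"))]
entries, pack_sha = write_pack_data(buf.write, iter(records), num_records=1)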

3151 

3152def write_pack_index_v1( 

3153 f: IO[bytes], 

3154 entries: Iterable[tuple[bytes, int, int | None]], 

3155 pack_checksum: bytes, 

3156) -> bytes: 

3157 """Write a new pack index file. 

3158 

3159 Args: 

3160 f: A file-like object to write to 

3161 entries: List of tuples with object name (sha), offset_in_pack, 

3162 and crc32_checksum. 

3163 pack_checksum: Checksum of the pack file. 

3164 Returns: The SHA of the written index file 

3165 """ 

3166 f = SHA1Writer(f) 

3167 fan_out_table: dict[int, int] = defaultdict(lambda: 0) 

3168 for name, _offset, _entry_checksum in entries: 

3169 fan_out_table[ord(name[:1])] += 1 

3170 # Fan-out table 

3171 for i in range(0x100): 

3172 f.write(struct.pack(">L", fan_out_table[i])) 

3173 fan_out_table[i + 1] += fan_out_table[i] 

3174 for name, offset, _entry_checksum in entries: 

3175 if not (offset <= 0xFFFFFFFF): 

3176 raise TypeError("pack format 1 only supports offsets < 2Gb") 

3177 f.write(struct.pack(">L20s", offset, name)) 

3178 assert len(pack_checksum) == 20 

3179 f.write(pack_checksum) 

3180 return f.write_sha() 

3181 

3182 

3183def _delta_encode_size(size: int) -> bytes: 

3184 ret = bytearray() 

3185 c = size & 0x7F 

3186 size >>= 7 

3187 while size: 

3188 ret.append(c | 0x80) 

3189 c = size & 0x7F 

3190 size >>= 7 

3191 ret.append(c) 

3192 return bytes(ret) 

3193 
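# Editorial worked example (not part of the dulwich source, evaluated in this
# module's scope): the delta header sizes use the same 7-bits-per-byte
# little-endian varint as git, so 100 fits in one byte while 1000 (0x3e8)
# needs two.
assert _delta_encode_size(100) == b"\x64"
assert _delta_encode_size(1000) == b"\xe8\x07"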

3194 

3195# The length of delta compression copy operations in version 2 packs is limited 

3196# to 64K. To copy more, we use several copy operations. Version 3 packs allow 

3197# 24-bit lengths in copy operations, but we always make version 2 packs. 

3198_MAX_COPY_LEN = 0xFFFF 

3199 

3200 

3201def _encode_copy_operation(start: int, length: int) -> bytes: 

3202 scratch = bytearray([0x80]) 

3203 for i in range(4): 

3204 if start & 0xFF << i * 8: 

3205 scratch.append((start >> i * 8) & 0xFF) 

3206 scratch[0] |= 1 << i 

3207 for i in range(2): 

3208 if length & 0xFF << i * 8: 

3209 scratch.append((length >> i * 8) & 0xFF) 

3210 scratch[0] |= 1 << (4 + i) 

3211 return bytes(scratch) 

3212 

3213 

3214def _create_delta_py(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]: 

3215 """Use python difflib to work out how to transform base_buf to target_buf. 

3216 

3217 Args: 

3218 base_buf: Base buffer 

3219 target_buf: Target buffer 

3220 """ 

3221 if isinstance(base_buf, list): 

3222 base_buf = b"".join(base_buf) 

3223 if isinstance(target_buf, list): 

3224 target_buf = b"".join(target_buf) 

3225 assert isinstance(base_buf, bytes) 

3226 assert isinstance(target_buf, bytes) 

3227 # write delta header 

3228 yield _delta_encode_size(len(base_buf)) 

3229 yield _delta_encode_size(len(target_buf)) 

3230 # write out delta opcodes 

3231 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf) 

3232 for opcode, i1, i2, j1, j2 in seq.get_opcodes(): 

3233 # Git patch opcodes don't care about deletes! 

3234 # if opcode == 'replace' or opcode == 'delete': 

3235 # pass 

3236 if opcode == "equal": 

3237 # If they are equal, unpacker will use data from base_buf 

3238 # Write out an opcode that says what range to use 

3239 copy_start = i1 

3240 copy_len = i2 - i1 

3241 while copy_len > 0: 

3242 to_copy = min(copy_len, _MAX_COPY_LEN) 

3243 yield _encode_copy_operation(copy_start, to_copy) 

3244 copy_start += to_copy 

3245 copy_len -= to_copy 

3246 if opcode == "replace" or opcode == "insert": 

3247 # If we are replacing a range or adding one, then we just 

3248 # output it to the stream (prefixed by its size) 

3249 s = j2 - j1 

3250 o = j1 

3251 while s > 127: 

3252 yield bytes([127]) 

3253 yield bytes(memoryview(target_buf)[o : o + 127]) 

3254 s -= 127 

3255 o += 127 

3256 yield bytes([s]) 

3257 yield bytes(memoryview(target_buf)[o : o + s]) 

3258 

3259 

3260# Default to pure Python implementation 

3261create_delta = _create_delta_py 

3262 

3263 

3264def apply_delta( 

3265 src_buf: bytes | list[bytes], delta: bytes | list[bytes] 

3266) -> list[bytes]: 

3267 """Based on the similar function in git's patch-delta.c. 

3268 

3269 Args: 

3270 src_buf: Source buffer 

3271 delta: Delta instructions 

3272 """ 

3273 if not isinstance(src_buf, bytes): 

3274 src_buf = b"".join(src_buf) 

3275 if not isinstance(delta, bytes): 

3276 delta = b"".join(delta) 

3277 out = [] 

3278 index = 0 

3279 delta_length = len(delta) 

3280 

3281 def get_delta_header_size(delta: bytes, index: int) -> tuple[int, int]: 

3282 size = 0 

3283 i = 0 

3284 while delta: 

3285 cmd = ord(delta[index : index + 1]) 

3286 index += 1 

3287 size |= (cmd & ~0x80) << i 

3288 i += 7 

3289 if not cmd & 0x80: 

3290 break 

3291 return size, index 

3292 

3293 src_size, index = get_delta_header_size(delta, index) 

3294 dest_size, index = get_delta_header_size(delta, index) 

3295 if src_size != len(src_buf): 

3296 raise ApplyDeltaError( 

3297 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}" 

3298 ) 

3299 while index < delta_length: 

3300 cmd = ord(delta[index : index + 1]) 

3301 index += 1 

3302 if cmd & 0x80: 

3303 cp_off = 0 

3304 for i in range(4): 

3305 if cmd & (1 << i): 

3306 x = ord(delta[index : index + 1]) 

3307 index += 1 

3308 cp_off |= x << (i * 8) 

3309 cp_size = 0 

3310 # Version 3 packs can contain copy sizes larger than 64K. 

3311 for i in range(3): 

3312 if cmd & (1 << (4 + i)): 

3313 x = ord(delta[index : index + 1]) 

3314 index += 1 

3315 cp_size |= x << (i * 8) 

3316 if cp_size == 0: 

3317 cp_size = 0x10000 

3318 if ( 

3319 cp_off + cp_size < cp_size 

3320 or cp_off + cp_size > src_size 

3321 or cp_size > dest_size 

3322 ): 

3323 break 

3324 out.append(src_buf[cp_off : cp_off + cp_size]) 

3325 elif cmd != 0: 

3326 out.append(delta[index : index + cmd]) 

3327 index += cmd 

3328 else: 

3329 raise ApplyDeltaError("Invalid opcode 0") 

3330 

3331 if index != delta_length: 

3332 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}") 

3333 

3334 if dest_size != chunks_length(out): 

3335 raise ApplyDeltaError("dest size incorrect") 

3336 

3337 return out 

3338 
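# Editorial usage sketch (not part of the dulwich source): create_delta() and
# apply_delta() are inverses, so a delta generated from a base/target pair
# reproduces the target when applied back to the base.
from dulwich.pack import apply_delta, create_delta

base = b"the quick brown fox jumps over the lazy dog"
target = b"the quick red fox jumps over the sleepy dog"
delta = b"".join(create_delta(base, target))
assert b"".join(apply_delta(base, delta)) == target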

3339 

3340def write_pack_index_v2( 

3341 f: IO[bytes], 

3342 entries: Iterable[tuple[bytes, int, int | None]], 

3343 pack_checksum: bytes, 

3344) -> bytes: 

3345 """Write a new pack index file. 

3346 

3347 Args: 

3348 f: File-like object to write to 

3349 entries: List of tuples with object name (sha), offset_in_pack, and 

3350 crc32_checksum. 

3351 pack_checksum: Checksum of the pack file. 

3352 Returns: The SHA of the index file written 

3353 """ 

3354 f = SHA1Writer(f) 

3355 f.write(b"\377tOc") # Magic! 

3356 f.write(struct.pack(">L", 2)) 

3357 fan_out_table: dict[int, int] = defaultdict(lambda: 0) 

3358 for name, offset, entry_checksum in entries: 

3359 fan_out_table[ord(name[:1])] += 1 

3360 # Fan-out table 

3361 largetable: list[int] = [] 

3362 for i in range(0x100): 

3363 f.write(struct.pack(b">L", fan_out_table[i])) 

3364 fan_out_table[i + 1] += fan_out_table[i] 

3365 for name, offset, entry_checksum in entries: 

3366 f.write(name) 

3367 for name, offset, entry_checksum in entries: 

3368 f.write(struct.pack(b">L", entry_checksum)) 

3369 for name, offset, entry_checksum in entries: 

3370 if offset < 2**31: 

3371 f.write(struct.pack(b">L", offset)) 

3372 else: 

3373 f.write(struct.pack(b">L", 2**31 + len(largetable))) 

3374 largetable.append(offset) 

3375 for offset in largetable: 

3376 f.write(struct.pack(b">Q", offset)) 

3377 assert len(pack_checksum) == 20 

3378 f.write(pack_checksum) 

3379 return f.write_sha() 

3380 

3381 

3382def write_pack_index_v3( 

3383 f: IO[bytes], 

3384 entries: Iterable[tuple[bytes, int, int | None]], 

3385 pack_checksum: bytes, 

3386 hash_algorithm: int = 1, 

3387) -> bytes: 

3388 """Write a new pack index file in v3 format. 

3389 

3390 Args: 

3391 f: File-like object to write to 

3392 entries: List of tuples with object name (sha), offset_in_pack, and 

3393 crc32_checksum. 

3394 pack_checksum: Checksum of the pack file. 

3395 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256) 

3396 Returns: The SHA of the index file written 

3397 """ 

3398 if hash_algorithm == 1: 

3399 hash_size = 20 # SHA-1 

3400 writer_cls = SHA1Writer 

3401 elif hash_algorithm == 2: 

3402 hash_size = 32 # SHA-256 

3403 # TODO: Add SHA256Writer when SHA-256 support is implemented 

3404 raise NotImplementedError("SHA-256 support not yet implemented") 

3405 else: 

3406 raise ValueError(f"Unknown hash algorithm {hash_algorithm}") 

3407 

3408 # Convert entries to list to allow multiple iterations 

3409 entries_list = list(entries) 

3410 

3411 # Calculate shortest unambiguous prefix length for object names 

3412 # For now, use full hash size (this could be optimized) 

3413 shortened_oid_len = hash_size 

3414 

3415 f = writer_cls(f) 

3416 f.write(b"\377tOc") # Magic! 

3417 f.write(struct.pack(">L", 3)) # Version 3 

3418 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm 

3419 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length 

3420 

3421 fan_out_table: dict[int, int] = defaultdict(lambda: 0) 

3422 for name, offset, entry_checksum in entries_list: 

3423 if len(name) != hash_size: 

3424 raise ValueError( 

3425 f"Object name has wrong length: expected {hash_size}, got {len(name)}" 

3426 ) 

3427 fan_out_table[ord(name[:1])] += 1 

3428 

3429 # Fan-out table 

3430 largetable: list[int] = [] 

3431 for i in range(0x100): 

3432 f.write(struct.pack(b">L", fan_out_table[i])) 

3433 fan_out_table[i + 1] += fan_out_table[i] 

3434 

3435 # Object names table 

3436 for name, offset, entry_checksum in entries_list: 

3437 f.write(name) 

3438 

3439 # CRC32 checksums table 

3440 for name, offset, entry_checksum in entries_list: 

3441 f.write(struct.pack(b">L", entry_checksum)) 

3442 

3443 # Offset table 

3444 for name, offset, entry_checksum in entries_list: 

3445 if offset < 2**31: 

3446 f.write(struct.pack(b">L", offset)) 

3447 else: 

3448 f.write(struct.pack(b">L", 2**31 + len(largetable))) 

3449 largetable.append(offset) 

3450 

3451 # Large offset table 

3452 for offset in largetable: 

3453 f.write(struct.pack(b">Q", offset)) 

3454 

3455 assert len(pack_checksum) == hash_size, ( 

3456 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}" 

3457 ) 

3458 f.write(pack_checksum) 

3459 return f.write_sha() 

3460 

3461 
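For comparison, a sketch of the v3 writer reusing the entries and pack_checksum from the v2 sketch above: only the extended header differs, and requesting hash_algorithm=2 (SHA-256) currently raises NotImplementedError.

    buf = io.BytesIO()
    idx_sha = write_pack_index_v3(buf, entries, pack_checksum, hash_algorithm=1)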

3462def write_pack_index( 

3463 f: IO[bytes], 

3464 entries: Iterable[tuple[bytes, int, int | None]], 

3465 pack_checksum: bytes, 

3466 progress: Callable[..., None] | None = None, 

3467 version: int | None = None, 

3468) -> bytes: 

3469 """Write a pack index file. 

3470 

3471 Args: 

3472 f: File-like object to write to. 

3473 entries: List of (sha, offset, crc32) tuples 

3474 pack_checksum: Checksum of the pack file. 

3475 progress: Progress function (not currently used) 

3476 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION. 

3477 

3478 Returns: 

3479 SHA of the written index file 

3480 """ 

3481 if version is None: 

3482 version = DEFAULT_PACK_INDEX_VERSION 

3483 

3484 if version == 1: 

3485 return write_pack_index_v1(f, entries, pack_checksum) 

3486 elif version == 2: 

3487 return write_pack_index_v2(f, entries, pack_checksum) 

3488 elif version == 3: 

3489 return write_pack_index_v3(f, entries, pack_checksum) 

3490 else: 

3491 raise ValueError(f"Unsupported pack index version: {version}") 

3492 

3493 
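Callers normally go through this dispatcher rather than the version-specific writers; a small sketch, again reusing the entries and pack_checksum from the sketches above:

    buf = io.BytesIO()
    # version=None falls back to DEFAULT_PACK_INDEX_VERSION
    idx_sha = write_pack_index(buf, entries, pack_checksum, version=2)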

3494class Pack: 

3495 """A Git pack object.""" 

3496 

3497 _data_load: Callable[[], PackData] | None 

3498 _idx_load: Callable[[], PackIndex] | None 

3499 

3500 _data: PackData | None 

3501 _idx: PackIndex | None 

3502 _bitmap: "PackBitmap | None" 

3503 

3504 def __init__( 

3505 self, 

3506 basename: str, 

3507 resolve_ext_ref: ResolveExtRefFn | None = None, 

3508 *, 

3509 delta_window_size: int | None = None, 

3510 window_memory: int | None = None, 

3511 delta_cache_size: int | None = None, 

3512 depth: int | None = None, 

3513 threads: int | None = None, 

3514 big_file_threshold: int | None = None, 

3515 ) -> None: 

3516 """Initialize a Pack object. 

3517 

3518 Args: 

3519 basename: Base path for pack files (without .pack/.idx extension) 

3520 resolve_ext_ref: Optional function to resolve external references 

3521 delta_window_size: Size of the delta compression window 

3522 window_memory: Memory limit for delta compression window 

3523 delta_cache_size: Size of the delta cache 

3524 depth: Maximum depth for delta chains 

3525 threads: Number of threads to use for operations 

3526 big_file_threshold: Size threshold for big file handling 

3527 """ 

3528 self._basename = basename 

3529 self._data = None 

3530 self._idx = None 

3531 self._bitmap = None 

3532 self._idx_path = self._basename + ".idx" 

3533 self._data_path = self._basename + ".pack" 

3534 self._bitmap_path = self._basename + ".bitmap" 

3535 self.delta_window_size = delta_window_size 

3536 self.window_memory = window_memory 

3537 self.delta_cache_size = delta_cache_size 

3538 self.depth = depth 

3539 self.threads = threads 

3540 self.big_file_threshold = big_file_threshold 

3541 self._data_load = lambda: PackData( 

3542 self._data_path, 

3543 delta_window_size=delta_window_size, 

3544 window_memory=window_memory, 

3545 delta_cache_size=delta_cache_size, 

3546 depth=depth, 

3547 threads=threads, 

3548 big_file_threshold=big_file_threshold, 

3549 ) 

3550 self._idx_load = lambda: load_pack_index(self._idx_path) 

3551 self.resolve_ext_ref = resolve_ext_ref 

3552 

3553 @classmethod 

3554 def from_lazy_objects( 

3555 cls, data_fn: Callable[[], PackData], idx_fn: Callable[[], PackIndex] 

3556 ) -> "Pack": 

3557 """Create a new pack object from callables to load pack data and index objects.""" 

3558 ret = cls("") 

3559 ret._data_load = data_fn 

3560 ret._idx_load = idx_fn 

3561 return ret 

3562 

3563 @classmethod 

3564 def from_objects(cls, data: PackData, idx: PackIndex) -> "Pack": 

3565 """Create a new pack object from pack data and index objects.""" 

3566 ret = cls("") 

3567 ret._data = data 

3568 ret._data_load = None 

3569 ret._idx = idx 

3570 ret._idx_load = None 

3571 ret.check_length_and_checksum() 

3572 return ret 

3573 
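A usage sketch for the constructors above (the basename and object id are hypothetical): opening an on-disk pack lazily loads the .idx and .pack files, and the Pack object supports the container protocol.

    from dulwich.pack import Pack

    some_sha = b"aa" * 20  # hex ObjectID, made up for illustration
    with Pack("objects/pack/pack-abc123") as p:
        print(len(p))          # number of objects in the pack
        if some_sha in p:      # looks the SHA up in the index
            obj = p[some_sha]  # ShaFile, with any delta chain resolved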

3574 def name(self) -> bytes: 

3575 """The SHA over the SHAs of the objects in this pack.""" 

3576 return self.index.objects_sha1() 

3577 

3578 @property 

3579 def data(self) -> PackData: 

3580 """The pack data object being used.""" 

3581 if self._data is None: 

3582 assert self._data_load 

3583 self._data = self._data_load() 

3584 self.check_length_and_checksum() 

3585 return self._data 

3586 

3587 @property 

3588 def index(self) -> PackIndex: 

3589 """The index being used. 

3590 

3591 Note: This may be an in-memory index 

3592 """ 

3593 if self._idx is None: 

3594 assert self._idx_load 

3595 self._idx = self._idx_load() 

3596 return self._idx 

3597 

3598 @property 

3599 def bitmap(self) -> "PackBitmap | None": 

3600 """The bitmap being used, if available. 

3601 

3602 Returns: 

3603 PackBitmap instance or None if no bitmap exists 

3604 

3605 Raises: 

3606 ValueError: If bitmap file is invalid or corrupt 

3607 """ 

3608 if self._bitmap is None: 

3609 from .bitmap import read_bitmap 

3610 

3611 self._bitmap = read_bitmap(self._bitmap_path, pack_index=self.index) 

3612 return self._bitmap 

3613 

3614 def ensure_bitmap( 

3615 self, 

3616 object_store: "BaseObjectStore", 

3617 refs: dict["Ref", "ObjectID"], 

3618 commit_interval: int | None = None, 

3619 progress: Callable[[str], None] | None = None, 

3620 ) -> "PackBitmap": 

3621 """Ensure a bitmap exists for this pack, generating one if needed. 

3622 

3623 Args: 

3624 object_store: Object store to read objects from 

3625 refs: Dictionary of ref names to commit SHAs 

3626 commit_interval: Include every Nth commit in bitmap index 

3627 progress: Optional progress reporting callback 

3628 

3629 Returns: 

3630 PackBitmap instance (either existing or newly generated) 

3631 """ 

3632 from .bitmap import generate_bitmap, write_bitmap 

3633 

3634 # Check if bitmap already exists 

3635 try: 

3636 existing = self.bitmap 

3637 if existing is not None: 

3638 return existing 

3639 except FileNotFoundError: 

3640 pass # No bitmap, we'll generate one 

3641 

3642 # Generate new bitmap 

3643 if progress: 

3644 progress(f"Generating bitmap for {self.name().decode('utf-8')}...\n") 

3645 

3646 pack_bitmap = generate_bitmap( 

3647 self.index, 

3648 object_store, 

3649 refs, 

3650 self.get_stored_checksum(), 

3651 commit_interval=commit_interval, 

3652 progress=progress, 

3653 ) 

3654 

3655 # Write bitmap file 

3656 write_bitmap(self._bitmap_path, pack_bitmap) 

3657 

3658 if progress: 

3659 progress(f"Wrote {self._bitmap_path}\n") 

3660 

3661 # Update cached bitmap 

3662 self._bitmap = pack_bitmap 

3663 

3664 return pack_bitmap 

3665 
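A sketch of ensure_bitmap(), assuming p is an open Pack and that the object store and refs come from the owning repository (e.g. repo.object_store and repo.get_refs()):

    bitmap = p.ensure_bitmap(
        repo.object_store,
        repo.get_refs(),
        progress=lambda msg: print(msg, end=""),  # messages already end in "\n"
    )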

3666 def close(self) -> None: 

3667 """Close the pack file and index.""" 

3668 if self._data is not None: 

3669 self._data.close() 

3670 if self._idx is not None: 

3671 self._idx.close() 

3672 

3673 def __enter__(self) -> "Pack": 

3674 """Enter context manager.""" 

3675 return self 

3676 

3677 def __exit__( 

3678 self, 

3679 exc_type: type | None, 

3680 exc_val: BaseException | None, 

3681 exc_tb: TracebackType | None, 

3682 ) -> None: 

3683 """Exit context manager.""" 

3684 self.close() 

3685 

3686 def __eq__(self, other: object) -> bool: 

3687 """Check equality with another pack.""" 

3688 if not isinstance(other, Pack): 

3689 return False 

3690 return self.index == other.index 

3691 

3692 def __len__(self) -> int: 

3693 """Number of entries in this pack.""" 

3694 return len(self.index) 

3695 

3696 def __repr__(self) -> str: 

3697 """Return string representation of this pack.""" 

3698 return f"{self.__class__.__name__}({self._basename!r})" 

3699 

3700 def __iter__(self) -> Iterator[ObjectID]: 

3701 """Iterate over all the sha1s of the objects in this pack.""" 

3702 return iter(self.index) 

3703 

3704 def check_length_and_checksum(self) -> None: 

3705 """Sanity check the length and checksum of the pack index and data.""" 

3706 assert len(self.index) == len(self.data), ( 

3707 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)" 

3708 ) 

3709 idx_stored_checksum = self.index.get_pack_checksum() 

3710 data_stored_checksum = self.data.get_stored_checksum() 

3711 if ( 

3712 idx_stored_checksum is not None 

3713 and idx_stored_checksum != data_stored_checksum 

3714 ): 

3715 raise ChecksumMismatch( 

3716 sha_to_hex(RawObjectID(idx_stored_checksum)), 

3717 sha_to_hex(RawObjectID(data_stored_checksum)), 

3718 ) 

3719 

3720 def check(self) -> None: 

3721 """Check the integrity of this pack. 

3722 

3723 Raises: 

3724 ChecksumMismatch: if a checksum for the index or data is wrong 

3725 """ 

3726 self.index.check() 

3727 self.data.check() 

3728 for obj in self.iterobjects(): 

3729 obj.check() 

3730 # TODO: object connectivity checks 

3731 

3732 def get_stored_checksum(self) -> bytes: 

3733 """Return the stored checksum of the pack data.""" 

3734 return self.data.get_stored_checksum() 

3735 

3736 def pack_tuples(self) -> list[tuple[ShaFile, None]]: 

3737 """Return pack tuples for all objects in pack.""" 

3738 return [(o, None) for o in self.iterobjects()] 

3739 

3740 def __contains__(self, sha1: ObjectID | RawObjectID) -> bool: 

3741 """Check whether this pack contains a particular SHA1.""" 

3742 try: 

3743 self.index.object_offset(sha1) 

3744 return True 

3745 except KeyError: 

3746 return False 

3747 

3748 def get_raw(self, sha1: RawObjectID | ObjectID) -> tuple[int, bytes]: 

3749 """Get raw object data by SHA1.""" 

3750 offset = self.index.object_offset(sha1) 

3751 obj_type, obj = self.data.get_object_at(offset) 

3752 type_num, chunks = self.resolve_object(offset, obj_type, obj) 

3753 return type_num, b"".join(chunks) # type: ignore[arg-type] 

3754 

3755 def __getitem__(self, sha1: "ObjectID | RawObjectID") -> ShaFile: 

3756 """Retrieve the specified SHA1.""" 

3757 type, uncomp = self.get_raw(sha1) 

3758 return ShaFile.from_raw_string(type, uncomp, sha=sha1) 

3759 

3760 def iterobjects(self) -> Iterator[ShaFile]: 

3761 """Iterate over the objects in this pack.""" 

3762 return iter( 

3763 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref) 

3764 ) 

3765 

3766 def iterobjects_subset( 

3767 self, shas: Iterable[ObjectID], *, allow_missing: bool = False 

3768 ) -> Iterator[ShaFile]: 

3769 """Iterate over a subset of objects in this pack.""" 

3770 return ( 

3771 uo 

3772 for uo in PackInflater.for_pack_subset( 

3773 self, 

3774 shas, 

3775 allow_missing=allow_missing, 

3776 resolve_ext_ref=self.resolve_ext_ref, 

3777 ) 

3778 if uo.id in shas 

3779 ) 

3780 

3781 def iter_unpacked_subset( 

3782 self, 

3783 shas: Iterable[ObjectID | RawObjectID], 

3784 *, 

3785 include_comp: bool = False, 

3786 allow_missing: bool = False, 

3787 convert_ofs_delta: bool = False, 

3788 ) -> Iterator[UnpackedObject]: 

3789 """Iterate over unpacked objects in subset.""" 

3790 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list) 

3791 ofs: dict[int, bytes] = {} 

3792 todo: set[ObjectID | RawObjectID] = set(shas) 

3793 for unpacked in self.iter_unpacked(include_comp=include_comp): 

3794 sha = unpacked.sha() 

3795 if unpacked.offset is not None: 

3796 ofs[unpacked.offset] = sha 

3797 hexsha = sha_to_hex(RawObjectID(sha)) 

3798 if hexsha in todo: 

3799 if unpacked.pack_type_num == OFS_DELTA: 

3800 assert isinstance(unpacked.delta_base, int) 

3801 assert unpacked.offset is not None 

3802 base_offset = unpacked.offset - unpacked.delta_base 

3803 try: 

3804 unpacked.delta_base = ofs[base_offset] 

3805 except KeyError: 

3806 ofs_pending[base_offset].append(unpacked) 

3807 continue 

3808 else: 

3809 unpacked.pack_type_num = REF_DELTA 

3810 yield unpacked 

3811 todo.remove(hexsha) 

3812 if unpacked.offset is not None: 

3813 for child in ofs_pending.pop(unpacked.offset, []): 

3814 child.pack_type_num = REF_DELTA 

3815 child.delta_base = sha 

3816 yield child 

3817 assert not ofs_pending 

3818 if not allow_missing and todo: 

3819 raise UnresolvedDeltas(list(todo)) 

3820 

3821 def iter_unpacked(self, include_comp: bool = False) -> Iterator[UnpackedObject]: 

3822 """Iterate over all unpacked objects in this pack.""" 

3823 ofs_to_entries = { 

3824 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries() 

3825 } 

3826 for unpacked in self.data.iter_unpacked(include_comp=include_comp): 

3827 assert unpacked.offset is not None 

3828 (sha, crc32) = ofs_to_entries[unpacked.offset] 

3829 unpacked._sha = sha 

3830 unpacked.crc32 = crc32 

3831 yield unpacked 

3832 

3833 def keep(self, msg: bytes | None = None) -> str: 

3834 """Add a .keep file for the pack, preventing git from garbage collecting it. 

3835 

3836 Args: 

3837 msg: A message written inside the .keep file; can be used later 

3838 to determine whether or not a .keep file is obsolete. 

3839 Returns: The path of the .keep file, as a string. 

3840 """ 

3841 keepfile_name = f"{self._basename}.keep" 

3842 with GitFile(keepfile_name, "wb") as keepfile: 

3843 if msg: 

3844 keepfile.write(msg) 

3845 keepfile.write(b"\n") 

3846 return keepfile_name 

3847 
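A sketch of keep(), assuming p is an open Pack: the returned path is the pack basename with a ".keep" suffix, and deleting that file makes the pack collectable again.

    keep_path = p.keep(b"kept while benchmarking repack")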

3848 def get_ref( 

3849 self, sha: RawObjectID | ObjectID 

3850 ) -> tuple[int | None, int, OldUnpackedObject]: 

3851 """Get the object for a ref SHA, only looking in this pack.""" 

3852 # TODO: cache these results 

3853 try: 

3854 offset = self.index.object_offset(sha) 

3855 except KeyError: 

3856 offset = None 

3857 if offset: 

3858 type, obj = self.data.get_object_at(offset) 

3859 elif self.resolve_ext_ref: 

3860 type, obj = self.resolve_ext_ref(sha) 

3861 else: 

3862 raise KeyError(sha) 

3863 return offset, type, obj 

3864 

3865 def resolve_object( 

3866 self, 

3867 offset: int, 

3868 type: int, 

3869 obj: OldUnpackedObject, 

3870 get_ref: Callable[ 

3871 [RawObjectID | ObjectID], tuple[int | None, int, OldUnpackedObject] 

3872 ] 

3873 | None = None, 

3874 ) -> tuple[int, OldUnpackedObject]: 

3875 """Resolve an object, possibly resolving deltas when necessary. 

3876 

3877 Returns: Tuple with object type and contents. 

3878 """ 

3879 # Walk down the delta chain, building a stack of deltas to reach 

3880 # the requested object. 

3881 base_offset: int | None = offset 

3882 base_type = type 

3883 base_obj = obj 

3884 delta_stack = [] 

3885 while base_type in DELTA_TYPES: 

3886 prev_offset = base_offset 

3887 if get_ref is None: 

3888 get_ref = self.get_ref 

3889 if base_type == OFS_DELTA: 

3890 (delta_offset, delta) = base_obj 

3891 # TODO: clean up asserts and replace with nicer error messages 

3892 assert isinstance(delta_offset, int), ( 

3893 f"Expected int, got {delta_offset.__class__}" 

3894 ) 

3895 assert base_offset is not None 

3896 base_offset = base_offset - delta_offset 

3897 base_type, base_obj = self.data.get_object_at(base_offset) 

3898 assert isinstance(base_type, int) 

3899 elif base_type == REF_DELTA: 

3900 (basename, delta) = base_obj 

3901 assert isinstance(basename, bytes) and len(basename) == 20 

3902 base_offset, base_type, base_obj = get_ref(cast(RawObjectID, basename)) 

3903 assert isinstance(base_type, int) 

3904 if base_offset == prev_offset: # object is based on itself 

3905 raise UnresolvedDeltas([basename]) 

3906 delta_stack.append((prev_offset, base_type, delta)) 

3907 

3908 # Now grab the base object (mustn't be a delta) and apply the 

3909 # deltas all the way up the stack. 

3910 chunks = base_obj 

3911 for prev_offset, _delta_type, delta in reversed(delta_stack): 

3912 # Convert chunks to bytes for apply_delta if needed 

3913 if isinstance(chunks, list): 

3914 chunks_bytes = b"".join(chunks) 

3915 elif isinstance(chunks, tuple): 

3916 # For tuple type, second element is the actual data 

3917 _, chunk_data = chunks 

3918 if isinstance(chunk_data, list): 

3919 chunks_bytes = b"".join(chunk_data) 

3920 else: 

3921 chunks_bytes = chunk_data 

3922 else: 

3923 chunks_bytes = chunks 

3924 

3925 # Apply delta and get result as list 

3926 chunks = apply_delta(chunks_bytes, delta) 

3927 

3928 if prev_offset is not None: 

3929 self.data._offset_cache[prev_offset] = base_type, chunks 

3930 return base_type, chunks 

3931 
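resolve_object() first walks the delta chain down to a plain base object, then replays the recorded deltas from the base outward. A sketch of just that replay step, assuming base_chunks and deltas were collected as above:

    data = base_chunks
    for delta in reversed(deltas):
        buf = b"".join(data) if isinstance(data, list) else data
        data = apply_delta(buf, delta)  # returns the reconstructed chunks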

3932 def entries( 

3933 self, progress: Callable[[int, int], None] | None = None 

3934 ) -> Iterator[PackIndexEntry]: 

3935 """Yield entries summarizing the contents of this pack. 

3936 

3937 Args: 

3938 progress: Progress function, called with current and total 

3939 object count. 

3940 Returns: iterator of tuples with (sha, offset, crc32) 

3941 """ 

3942 return self.data.iterentries( 

3943 progress=progress, resolve_ext_ref=self.resolve_ext_ref 

3944 ) 

3945 

3946 def sorted_entries( 

3947 self, progress: ProgressFn | None = None 

3948 ) -> Iterator[PackIndexEntry]: 

3949 """Return entries in this pack, sorted by SHA. 

3950 

3951 Args: 

3952 progress: Progress function, called with current and total 

3953 object count 

3954 Returns: Iterator of tuples with (sha, offset, crc32) 

3955 """ 

3956 return iter( 

3957 self.data.sorted_entries( 

3958 progress=progress, resolve_ext_ref=self.resolve_ext_ref 

3959 ) 

3960 ) 

3961 

3962 def get_unpacked_object( 

3963 self, 

3964 sha: ObjectID | RawObjectID, 

3965 *, 

3966 include_comp: bool = False, 

3967 convert_ofs_delta: bool = True, 

3968 ) -> UnpackedObject: 

3969 """Get the unpacked object for a sha. 

3970 

3971 Args: 

3972 sha: SHA of object to fetch 

3973 include_comp: Whether to include compression data in UnpackedObject 

3974 convert_ofs_delta: Whether to convert offset deltas to ref deltas 

3975 """ 

3976 offset = self.index.object_offset(sha) 

3977 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp) 

3978 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta: 

3979 assert isinstance(unpacked.delta_base, int) 

3980 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base) 

3981 unpacked.pack_type_num = REF_DELTA 

3982 return unpacked 

3983 

3984 

3985def extend_pack( 

3986 f: BinaryIO, 

3987 object_ids: Set["RawObjectID"], 

3988 get_raw: Callable[["RawObjectID | ObjectID"], tuple[int, bytes]], 

3989 *, 

3990 compression_level: int = -1, 

3991 progress: Callable[[bytes], None] | None = None, 

3992) -> tuple[bytes, list[tuple["RawObjectID", int, int]]]: 

3993 """Extend a pack file with more objects. 

3994 

3995 The caller should make sure that object_ids does not contain any objects 

3996 that are already in the pack. 

3997 """ 

3998 # Update the header with the new number of objects. 

3999 f.seek(0) 

4000 _version, num_objects = read_pack_header(f.read) 

4001 

4002 if object_ids: 

4003 f.seek(0) 

4004 write_pack_header(f.write, num_objects + len(object_ids)) 

4005 

4006 # Must flush before reading (http://bugs.python.org/issue3207) 

4007 f.flush() 

4008 

4009 # Rescan the rest of the pack, computing the SHA with the new header. 

4010 new_sha = compute_file_sha(f, end_ofs=-20) 

4011 

4012 # Must reposition before writing (http://bugs.python.org/issue3207) 

4013 f.seek(0, os.SEEK_CUR) 

4014 

4015 extra_entries = [] 

4016 

4017 # Complete the pack. 

4018 for i, object_id in enumerate(object_ids): 

4019 if progress is not None: 

4020 progress( 

4021 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii") 

4022 ) 

4023 assert len(object_id) == 20 

4024 type_num, data = get_raw(object_id) 

4025 offset = f.tell() 

4026 crc32 = write_pack_object( 

4027 f.write, 

4028 type_num, 

4029 [data], # Convert bytes to list[bytes] 

4030 sha=new_sha, 

4031 compression_level=compression_level, 

4032 ) 

4033 extra_entries.append((object_id, offset, crc32)) 

4034 pack_sha = new_sha.digest() 

4035 f.write(pack_sha) 

4036 return pack_sha, extra_entries 

4037 

4038 
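A sketch of extend_pack(), assuming a thin pack on disk and an object store whose get_raw() can supply the missing external bases (missing_ids is a hypothetical set of raw 20-byte object ids):

    with open("pack-thin.pack", "r+b") as f:
        pack_sha, extra_entries = extend_pack(
            f, missing_ids, repo.object_store.get_raw
        )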

4039try: 

4040 from dulwich._pack import ( # type: ignore 

4041 apply_delta, 

4042 bisect_find_sha, 

4043 ) 

4044except ImportError: 

4045 pass 

4046 

4047# Try to import the Rust version of create_delta 

4048try: 

4049 from dulwich._pack import create_delta as _create_delta_rs 

4050except ImportError: 

4051 pass 

4052else: 

4053 # The Rust version returns bytes; wrap it to match the Python API, which yields chunks as an Iterator 

4054 def _create_delta_rs_wrapper(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]: 

4055 """Wrapper for Rust create_delta to match Python API.""" 

4056 yield _create_delta_rs(base_buf, target_buf) 

4057 

4058 create_delta = _create_delta_rs_wrapper
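Whichever implementation is picked up here, a delta produced by create_delta() round-trips through apply_delta(); a small sketch:

    base = b"the quick brown fox"
    target = b"the quick brown fox jumps over the lazy dog"
    delta = b"".join(create_delta(base, target))
    assert b"".join(apply_delta(base, delta)) == target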