Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1572 statements  

1# pack.py -- For dealing with packed git objects. 

2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net> 

3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk> 

4# 

5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 

6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

7# General Public License as published by the Free Software Foundation; version 2.0 

8# or (at your option) any later version. You can redistribute it and/or 

9# modify it under the terms of either of these two licenses. 

10# 

11# Unless required by applicable law or agreed to in writing, software 

12# distributed under the License is distributed on an "AS IS" BASIS, 

13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

14# See the License for the specific language governing permissions and 

15# limitations under the License. 

16# 

17# You should have received a copy of the licenses; if not, see 

18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

20# License, Version 2.0. 

21# 

22 

23"""Classes for dealing with packed git objects. 

24 

25A pack is a compact representation of a bunch of objects, stored 

26using deltas where possible. 

27 

28They have two parts, the pack file, which stores the data, and an index 

29that tells you where the data is. 

30 

31To find an object you look in all of the index files 'til you find a 

32match for the object name. You then use the pointer got from this as 

33a pointer in to the corresponding packfile. 

34""" 

35 

36import binascii 

37from collections import defaultdict, deque 

38from contextlib import suppress 

39from io import BytesIO, UnsupportedOperation 

40 

41try: 

42 from cdifflib import CSequenceMatcher as SequenceMatcher 

43except ModuleNotFoundError: 

44 from difflib import SequenceMatcher 

45 

46import os 

47import struct 

48import sys 

49import warnings 

50import zlib 

51from collections.abc import Iterable, Iterator, Sequence 

52from hashlib import sha1 

53from itertools import chain 

54from os import SEEK_CUR, SEEK_END 

55from struct import unpack_from 

56from typing import ( 

57 IO, 

58 TYPE_CHECKING, 

59 Any, 

60 BinaryIO, 

61 Callable, 

62 Generic, 

63 Optional, 

64 Protocol, 

65 TypeVar, 

66 Union, 

67) 

68 

69try: 

70 import mmap 

71except ImportError: 

72 has_mmap = False 

73else: 

74 has_mmap = True 

75 

76if TYPE_CHECKING: 

77 from .commit_graph import CommitGraph 

78 

79# For some reason the above try, except fails to set has_mmap = False for plan9 

80if sys.platform == "Plan9": 

81 has_mmap = False 

82 

83from . import replace_me 

84from .errors import ApplyDeltaError, ChecksumMismatch 

85from .file import GitFile, _GitFile 

86from .lru_cache import LRUSizeCache 

87from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex 

88 

# Pack type numbers for the two delta representations; non-delta objects
# use the regular Git object type numbers.
OFS_DELTA = 6
REF_DELTA = 7

DELTA_TYPES = (OFS_DELTA, REF_DELTA)


# Default delta window size used when creating packs.
DEFAULT_PACK_DELTA_WINDOW_SIZE = 10

# Keep pack files under 16Mb in memory, otherwise write them out to disk
PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024

# Default pack index version to use when none is specified
DEFAULT_PACK_INDEX_VERSION = 2


# Legacy representation of an unpacked object: either a plain list of
# chunks, or a (delta base, delta chunks) pair.
OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
# Callback resolving an external ref SHA to a (type number, object) pair.
ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
# Progress callback taking an int and a message string.
ProgressFn = Callable[[int, str], None]
# (object type number, optional SHA hint) used when packing.
PackHint = tuple[int, Optional[bytes]]

108 

109 

class UnresolvedDeltas(Exception):
    """Raised when delta objects could not be resolved.

    The binary SHAs of the unresolved deltas are available on the
    ``shas`` attribute.
    """

    def __init__(self, shas: list[bytes]) -> None:
        """Record the list of SHAs whose deltas could not be resolved."""
        self.shas = shas

115 

116 

class ObjectContainer(Protocol):
    """Protocol for a store that objects can be added to and retrieved from."""

    def add_object(self, obj: ShaFile) -> None:
        """Add a single object to this object store."""

    def add_objects(
        self,
        objects: Sequence[tuple[ShaFile, Optional[str]]],
        progress: Optional[Callable[[str], None]] = None,
    ) -> None:
        """Add a set of objects to this object store.

        Args:
          objects: Iterable over a list of (object, path) tuples
          progress: Optional callback invoked with progress messages
        """

    def __contains__(self, sha1: bytes) -> bool:
        """Check if a hex sha is present."""

    def __getitem__(self, sha1: bytes) -> ShaFile:
        """Retrieve an object."""

    def get_commit_graph(self) -> Optional["CommitGraph"]:
        """Get the commit graph for this object store.

        Returns:
          CommitGraph object if available, None otherwise
        """
        return None

145 

146 

class PackedObjectContainer(ObjectContainer):
    """ObjectContainer that can also serve objects in unresolved pack form."""

    def get_unpacked_object(
        self, sha1: bytes, *, include_comp: bool = False
    ) -> "UnpackedObject":
        """Get a raw unresolved object."""
        raise NotImplementedError(self.get_unpacked_object)

    def iterobjects_subset(
        self, shas: Iterable[bytes], *, allow_missing: bool = False
    ) -> Iterator[ShaFile]:
        """Iterate over the objects with the given SHAs."""
        raise NotImplementedError(self.iterobjects_subset)

    def iter_unpacked_subset(
        self,
        shas: set[bytes],
        include_comp: bool = False,
        allow_missing: bool = False,
        convert_ofs_delta: bool = True,
    ) -> Iterator["UnpackedObject"]:
        """Iterate over UnpackedObjects for the given SHAs."""
        raise NotImplementedError(self.iter_unpacked_subset)

167 

168 

class UnpackedObjectStream:
    """Abstract base class for a stream of unpacked objects."""

    def __iter__(self) -> Iterator["UnpackedObject"]:
        """Iterate over the UnpackedObjects in this stream."""
        raise NotImplementedError(self.__iter__)

    def __len__(self) -> int:
        """Return the number of objects in this stream."""
        raise NotImplementedError(self.__len__)

177 

178 

def take_msb_bytes(
    read: Callable[[int], bytes], crc32: Optional[int] = None
) -> tuple[list[int], Optional[int]]:
    """Read a run of bytes whose most significant bit marks continuation.

    Bytes are consumed one at a time; a byte with the MSB cleared
    terminates (and is included in) the run.

    Args:
      read: Read function returning the requested number of bytes.
      crc32: Optional running CRC32 to fold the consumed bytes into.
    Returns: Tuple of (byte values read, updated CRC32 or None).
    """
    values: list[int] = []
    while True:
        chunk = read(1)
        if crc32 is not None:
            crc32 = binascii.crc32(chunk, crc32)
        values.append(ord(chunk[:1]))
        if not values[-1] & 0x80:
            break
    return values, crc32

194 

195 

class PackFileDisappeared(Exception):
    """Raised when a pack file unexpectedly disappears.

    Carries the object (typically the pack or index) that noticed the
    disappearance on the ``obj`` attribute.
    """

    def __init__(self, obj: object) -> None:
        """Record the object reporting the vanished pack file."""
        self.obj = obj

201 

202 

class UnpackedObject:
    """Class encapsulating an object unpacked from a pack file.

    These objects should only be created from within unpack_object. Most
    members start out as empty and are filled in at various points by
    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.

    End users of this object should take care that the function they're getting
    this object from is guaranteed to set the members they need.
    """

    __slots__ = [
        "_sha",  # Cached binary SHA.
        "comp_chunks",  # Compressed object chunks.
        "crc32",  # CRC32.
        "decomp_chunks",  # Decompressed object chunks.
        "decomp_len",  # Decompressed length of this object.
        "delta_base",  # Delta base offset or SHA.
        "obj_chunks",  # Decompressed and delta-resolved chunks.
        "obj_type_num",  # Type of this object.
        "offset",  # Offset in its pack.
        "pack_type_num",  # Type of this object in the pack (may be a delta).
    ]

    obj_type_num: Optional[int]
    obj_chunks: Optional[list[bytes]]
    delta_base: Union[None, bytes, int]
    decomp_chunks: list[bytes]
    comp_chunks: Optional[list[bytes]]
    decomp_len: Optional[int]
    crc32: Optional[int]
    offset: Optional[int]
    pack_type_num: int
    _sha: Optional[bytes]

    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
    # methods of this object.
    def __init__(
        self,
        pack_type_num: int,
        *,
        delta_base: Union[None, bytes, int] = None,
        decomp_len: Optional[int] = None,
        crc32: Optional[int] = None,
        sha: Optional[bytes] = None,
        decomp_chunks: Optional[list[bytes]] = None,
        offset: Optional[int] = None,
    ) -> None:
        """Initialize an UnpackedObject.

        Args:
          pack_type_num: On-disk type of this object in the pack; may be a
            delta type (see DELTA_TYPES).
          delta_base: Delta base offset (OFS_DELTA) or SHA (REF_DELTA), or
            None for non-delta objects.
          decomp_len: Expected decompressed length; derived from
            decomp_chunks when they are provided without a length.
          crc32: CRC32 over the compressed representation, if known.
          sha: Pre-computed binary SHA, if known.
          decomp_chunks: Already-decompressed chunks, if available.
          offset: Offset of this object in its pack, if known.
        """
        self.offset = offset
        self._sha = sha
        self.pack_type_num = pack_type_num
        self.delta_base = delta_base
        self.comp_chunks = None
        self.decomp_chunks: list[bytes] = decomp_chunks or []
        if decomp_chunks is not None and decomp_len is None:
            self.decomp_len = sum(map(len, decomp_chunks))
        else:
            self.decomp_len = decomp_len
        self.crc32 = crc32

        if pack_type_num in DELTA_TYPES:
            # The real type and contents are unknown until the delta chain
            # has been resolved.
            self.obj_type_num = None
            self.obj_chunks = None
        else:
            self.obj_type_num = pack_type_num
            self.obj_chunks = self.decomp_chunks
            # NOTE: delta_base was already assigned above; the previous
            # duplicate assignment here was removed.

    def sha(self) -> bytes:
        """Return the binary SHA of this object, computing and caching it."""
        if self._sha is None:
            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
        return self._sha

    def sha_file(self) -> ShaFile:
        """Return a ShaFile from this object.

        Requires that the delta chain (if any) has been resolved so that
        obj_type_num and obj_chunks are set.
        """
        assert self.obj_type_num is not None and self.obj_chunks is not None
        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)

    # Only provided for backwards compatibility with code that expects either
    # chunks or a delta tuple.
    def _obj(self) -> OldUnpackedObject:
        """Return the decompressed chunks, or (delta base, delta chunks)."""
        if self.pack_type_num in DELTA_TYPES:
            assert isinstance(self.delta_base, (bytes, int))
            return (self.delta_base, self.decomp_chunks)
        else:
            return self.decomp_chunks

    def __eq__(self, other: object) -> bool:
        """Compare all slots (including cached values) for equality."""
        if not isinstance(other, UnpackedObject):
            return False
        for slot in self.__slots__:
            if getattr(self, slot) != getattr(other, slot):
                return False
        return True

    def __ne__(self, other: object) -> bool:
        """Check inequality with another UnpackedObject."""
        return not (self == other)

    def __repr__(self) -> str:
        """Return string representation of this UnpackedObject."""
        data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
        return "{}({})".format(self.__class__.__name__, ", ".join(data))

308 

309 

# Read-chunk size used when inflating zlib streams (see read_zlib_chunks).
_ZLIB_BUFSIZE = 65536  # 64KB buffer for better I/O performance

311 

312 

def read_zlib_chunks(
    read_some: Callable[[int], bytes],
    unpacked: UnpackedObject,
    include_comp: bool = False,
    buffer_size: int = _ZLIB_BUFSIZE,
) -> bytes:
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    Args:
      read_some: Read function that returns at least one byte, but may
        return less than the requested size.
      unpacked: An UnpackedObject to write result data to. If its crc32
        attr is not None, the CRC32 of the compressed bytes will be computed
        using this starting CRC32.
        After this function, will have the following attrs set:
        * comp_chunks (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32
      include_comp: If True, include compressed data in the result.
      buffer_size: Size of the read buffer.
    Returns: Leftover unused data from the decompression.

    Raises:
      zlib.error: if a decompression error occurred.
    """
    if unpacked.decomp_len is None or unpacked.decomp_len <= -1:
        raise ValueError("non-negative zlib data stream size expected")
    decompressor = zlib.decompressobj()

    compressed: list[bytes] = []
    total_inflated = 0
    crc = unpacked.crc32

    while True:
        data = read_some(buffer_size)
        if not data:
            raise zlib.error("EOF before end of zlib stream")
        compressed.append(data)
        inflated = decompressor.decompress(data)
        total_inflated += len(inflated)
        unpacked.decomp_chunks.append(inflated)
        trailing = decompressor.unused_data
        if trailing:
            # The zlib stream ended inside this chunk; only the consumed
            # prefix belongs to the compressed representation.
            consumed = data[: len(data) - len(trailing)]
            if crc is not None:
                crc = binascii.crc32(consumed, crc)
            if include_comp:
                compressed[-1] = consumed
            break
        if crc is not None:
            crc = binascii.crc32(data, crc)
    if crc is not None:
        crc &= 0xFFFFFFFF

    if total_inflated != unpacked.decomp_len:
        raise zlib.error("decompressed data does not match expected size")

    unpacked.crc32 = crc
    if include_comp:
        unpacked.comp_chunks = compressed
    return trailing

379 

380 

def iter_sha1(iter: Iterable[bytes]) -> bytes:
    """Compute the hex SHA1 digest over a sequence of byte strings.

    Args:
      iter: Iterable yielding byte strings to hash.
    Returns: 40-byte ASCII hex digest over the concatenated input.
    """
    digest = sha1()
    for chunk in iter:
        digest.update(chunk)
    return digest.hexdigest().encode("ascii")

392 

393 

def load_pack_index(path: Union[str, os.PathLike]) -> "PackIndex":
    """Open and parse the pack index file at *path*.

    Args:
      path: Filesystem path of the ``.idx`` file.
    Returns: A PackIndex of the version detected from the file contents.
    """
    with GitFile(path, "rb") as index_file:
        return load_pack_index_file(path, index_file)

403 

404 

def _load_file_contents(
    f: Union[IO[bytes], _GitFile], size: Optional[int] = None
) -> tuple[Union[bytes, Any], int]:
    """Load a file's contents, preferring mmap when the platform allows.

    Args:
      f: File-like object to load.
      size: Known size in bytes, or None to stat the descriptor.
    Returns: Tuple of (contents, size); contents is an mmap when mapping
      succeeded, otherwise plain bytes.
    """
    try:
        fd = f.fileno()
    except (UnsupportedOperation, AttributeError):
        # Objects without a real descriptor (e.g. BytesIO) can't be mapped.
        fd = None
    if fd is not None:
        if size is None:
            size = os.fstat(fd).st_size
        if has_mmap:
            try:
                return mmap.mmap(fd, size, access=mmap.ACCESS_READ), size
            except (OSError, ValueError):
                # Can't mmap - perhaps a socket or invalid file descriptor;
                # fall back to reading everything into memory.
                pass
    data = f.read()
    return data, len(data)

434 

435 

def load_pack_index_file(
    path: Union[str, os.PathLike], f: Union[IO[bytes], _GitFile]
) -> "PackIndex":
    """Parse a pack index from an open file-like object.

    Args:
      path: Path associated with the index file.
      f: Readable binary file-like object positioned at the start.
    Returns: A PackIndex of the version detected from the header.
    """
    contents, size = _load_file_contents(f)
    if contents[:4] != b"\377tOc":
        # Version 1 indexes carry no magic header; the file starts with
        # the fan-out table directly.
        return PackIndex1(path, file=f, contents=contents, size=size)
    (version,) = struct.unpack(b">L", contents[4:8])
    if version == 2:
        return PackIndex2(path, file=f, contents=contents, size=size)
    if version == 3:
        return PackIndex3(path, file=f, contents=contents, size=size)
    raise KeyError(f"Unknown pack index format {version}")

457 

458 

def bisect_find_sha(
    start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes]
) -> Optional[int]:
    """Binary-search a sorted run of SHAs for a given SHA.

    Args:
      start: Start index of range to search
      end: End index of range to search
      sha: Sha to find
      unpack_name: Callback to retrieve SHA by index
    Returns: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    lo, hi = start, end
    while lo <= hi:
        mid = (lo + hi) // 2
        candidate = unpack_name(mid)
        if candidate < sha:
            lo = mid + 1
        elif candidate > sha:
            hi = mid - 1
        else:
            return mid
    return None

482 

483 

# A single index entry: (binary object name, pack offset, CRC32 or None).
PackIndexEntry = tuple[bytes, int, Optional[int]]

485 

486 

class PackIndex:
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.
    """

    # Default to SHA-1 for backward compatibility
    hash_algorithm = 1
    hash_size = 20

    def __eq__(self, other: object) -> bool:
        """Check equality with another index by comparing entry names.

        Offsets and CRC32s are deliberately ignored so that two indexes
        over the same objects compare equal.
        """
        if not isinstance(other, PackIndex):
            return False

        from itertools import zip_longest

        # A plain zip() stops at the shorter iterator, which made an index
        # compare equal to any index it is a prefix of; pad with a sentinel
        # so differing entry counts are detected.
        sentinel = (None, None, None)
        for (name1, _, _), (name2, _, _) in zip_longest(
            self.iterentries(), other.iterentries(), fillvalue=sentinel
        ):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other: object) -> bool:
        """Check if this pack index is not equal to another."""
        return not self.__eq__(other)

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        raise NotImplementedError(self.__len__)

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over the hex SHAs in this pack."""
        return map(sha_to_hex, self._itersha())

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        raise NotImplementedError(self.iterentries)

    def get_pack_checksum(self) -> Optional[bytes]:
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest, or None if not available
        """
        raise NotImplementedError(self.get_pack_checksum)

    @replace_me(since="0.21.0", remove_in="0.23.0")
    def object_index(self, sha: bytes) -> int:
        """Deprecated alias for object_offset."""
        return self.object_offset(sha)

    def object_offset(self, sha: bytes) -> int:
        """Return the offset in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        raise NotImplementedError(self.object_offset)

    def object_sha1(self, index: int) -> bytes:
        """Return the SHA1 corresponding to the index in the pack file.

        Raises:
          KeyError: if no entry has the given offset.
        """
        # Linear scan; subclasses may have faster reverse lookups.
        for name, offset, _crc32 in self.iterentries():
            if offset == index:
                return name
        else:
            raise KeyError(index)

    def _object_offset(self, sha: bytes) -> int:
        """See object_offset.

        Args:
          sha: A *binary* SHA string. (20 characters long)_
        """
        raise NotImplementedError(self._object_offset)

    def objects_sha1(self) -> bytes:
        """Return the hex SHA1 over all the shas of all objects in this pack.

        Note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def _itersha(self) -> Iterator[bytes]:
        """Yield all the SHA1's of the objects in the index, sorted."""
        raise NotImplementedError(self._itersha)

    def close(self) -> None:
        """Close any open files."""

    def check(self) -> None:
        """Check the consistency of this pack index."""

582 

class MemoryPackIndex(PackIndex):
    """Pack index that is stored entirely in memory."""

    def __init__(
        self,
        entries: list[tuple[bytes, int, Optional[int]]],
        pack_checksum: Optional[bytes] = None,
    ) -> None:
        """Create a new MemoryPackIndex.

        Args:
          entries: Sequence of (name, offset, crc32) tuples, sorted by name.
          pack_checksum: Optional checksum of the corresponding pack.
        """
        # Forward and reverse lookup tables, both derived from entries.
        self._by_sha = {name: offset for name, offset, _ in entries}
        self._by_offset = {offset: name for name, offset, _ in entries}
        self._entries = entries
        self._pack_checksum = pack_checksum

    def get_pack_checksum(self) -> Optional[bytes]:
        """Return the SHA checksum stored for the corresponding packfile."""
        return self._pack_checksum

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        return len(self._entries)

    def object_offset(self, sha: bytes) -> int:
        """Return the offset for the given SHA.

        Args:
          sha: SHA to look up (binary or hex)
        Returns: Offset in the pack file
        """
        key = hex_to_sha(sha) if len(sha) == 40 else sha
        return self._by_sha[key]

    def object_sha1(self, offset: int) -> bytes:
        """Return the SHA1 for the object at the given offset."""
        return self._by_offset[offset]

    def _itersha(self) -> Iterator[bytes]:
        """Iterate over all SHA1s in the index."""
        return iter(self._by_sha)

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over all index entries."""
        return iter(self._entries)

    @classmethod
    def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex":
        """Create a MemoryPackIndex from a PackData object."""
        return MemoryPackIndex(
            list(pack_data.sorted_entries()), pack_data.get_stored_checksum()
        )

    @classmethod
    def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex":
        """Create a copy of another PackIndex in memory."""
        return cls(list(other_index.iterentries()), other_index.get_pack_checksum())

647 

648 

class FilePackIndex(PackIndex):
    """Pack index that is based on a file.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is
    present.
    """

    # 256 cumulative object counts, one per possible first byte of a SHA.
    _fan_out_table: list[int]

    def __init__(
        self,
        filename: Union[str, os.PathLike],
        file: Optional[BinaryIO] = None,
        contents: Optional[Union[bytes, "mmap.mmap"]] = None,
        size: Optional[int] = None,
    ) -> None:
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.

        Args:
          filename: Path to the index file.
          file: Optional already-open file object; opened from filename
            when not given.
          contents: Optional pre-loaded contents (bytes or mmap).
          size: Size of contents, if already known.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, "rb")
        else:
            self._file = file
        if contents is None:
            self._contents, self._size = _load_file_contents(self._file, size)
        else:
            self._contents = contents
            self._size = size if size is not None else len(contents)

    @property
    def path(self) -> str:
        """Return the path to this index file."""
        return os.fspath(self._filename)

    def __eq__(self, other: object) -> bool:
        """Check equality, short-circuiting on differing fan-out tables."""
        # Quick optimization: differing fan-out tables imply different
        # object sets, so the full comparison can be skipped.
        if (
            isinstance(other, FilePackIndex)
            and self._fan_out_table != other._fan_out_table
        ):
            return False

        return super().__eq__(other)

    def close(self) -> None:
        """Close the underlying file and any mmap."""
        self._file.close()
        # _contents may be an mmap (closeable) or plain bytes (no close()).
        close_fn = getattr(self._contents, "close", None)
        if close_fn is not None:
            close_fn()

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        # The last fan-out entry is the cumulative count for byte 0xff,
        # i.e. the total number of objects.
        return self._fan_out_table[-1]

    def _unpack_entry(self, i: int) -> PackIndexEntry:
        """Unpack the i-th entry in the index file.

        Returns: Tuple with object name (SHA), offset in pack file and CRC32
            checksum (if known).
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i) -> bytes:
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i) -> int:
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i) -> Optional[int]:
        """Unpack the crc32 checksum for the ith object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def _itersha(self) -> Iterator[bytes]:
        """Iterate over all SHA1s in the index, in stored (sorted) order."""
        for i in range(len(self)):
            yield self._unpack_name(i)

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset: int) -> list[int]:
        """Read the fan-out table from the index.

        The fan-out table contains 256 entries mapping first byte values
        to the number of objects with SHA1s less than or equal to that byte.

        Args:
          start_offset: Offset in the file where the fan-out table starts
        Returns: List of 256 integers
        """
        ret = []
        for i in range(0x100):
            fanout_entry = self._contents[
                start_offset + i * 4 : start_offset + (i + 1) * 4
            ]
            ret.append(struct.unpack(">L", fanout_entry)[0])
        return ret

    def check(self) -> None:
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self) -> bytes:
        """Calculate the SHA1 checksum over this pack index.

        Returns: This is a 20-byte binary digest
        """
        # Everything except the trailing 20-byte stored checksum is covered.
        return sha1(self._contents[:-20]).digest()

    def get_pack_checksum(self) -> bytes:
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest
        """
        # Stored immediately before the index's own trailing checksum.
        return bytes(self._contents[-40:-20])

    def get_stored_checksum(self) -> bytes:
        """Return the SHA1 checksum stored for this index.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-20:])

    def object_offset(self, sha: bytes) -> int:
        """Return the offset in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file.

        Raises:
          KeyError: if the object is not present in this index.
          PackFileDisappeared: if the backing contents went away while
            looking the object up.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        try:
            return self._object_offset(sha)
        except ValueError as exc:
            # Reading a closed mmap raises ValueError; translate that into
            # a PackFileDisappeared so callers can retry with other packs.
            closed = getattr(self._contents, "closed", None)
            if closed in (None, True):
                raise PackFileDisappeared(self) from exc
            raise

    def _object_offset(self, sha: bytes) -> int:
        """See object_offset.

        Args:
          sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        idx = ord(sha[:1])
        # The fan-out table gives the cumulative count up to and including
        # each first byte, so [fan_out[idx-1], fan_out[idx]) brackets the
        # bucket of names starting with this byte.
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx - 1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)

    def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
        """Iterate over all SHA1s with the given (binary) prefix."""
        start = ord(prefix[:1])
        if start == 0:
            start = 0
        else:
            start = self._fan_out_table[start - 1]
        end = ord(prefix[:1]) + 1
        if end == 0x100:
            end = len(self)
        else:
            # NOTE(review): this brackets one fan-out bucket further than
            # strictly needed; the startswith/break logic below still keeps
            # the results correct.
            end = self._fan_out_table[end]
        assert start <= end
        started = False
        for i in range(start, end):
            name: bytes = self._unpack_name(i)
            if name.startswith(prefix):
                yield name
                started = True
            elif started:
                # Names are sorted, so once matches stop there are no more.
                break

850 

851 

class PackIndex1(FilePackIndex):
    """Version 1 Pack Index file."""

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v1 index; the fan-out table sits at the very start."""
        super().__init__(filename, file, contents, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        """Return the (name, offset, None) triple for entry *i*.

        Each v1 entry is 24 bytes: a 4-byte offset followed by a 20-byte
        SHA, stored after the 1024-byte (0x400) fan-out table.
        """
        (offset, name) = unpack_from(">L20s", self._contents, 0x400 + i * 24)
        return (name, offset, None)

    def _unpack_name(self, i):
        """Return the binary SHA of entry *i* (stored after its offset)."""
        base = 0x400 + i * 24 + 4
        return self._contents[base : base + 20]

    def _unpack_offset(self, i):
        """Return the pack offset of entry *i*."""
        return unpack_from(">L", self._contents, 0x400 + i * 24)[0]

    def _unpack_crc32_checksum(self, i) -> None:
        """Return None: version 1 indexes carry no CRC32 checksums."""
        return None

881 

882 

class PackIndex2(FilePackIndex):
    """Version 2 Pack Index file."""

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v2 index and pre-compute the offsets of its tables."""
        super().__init__(filename, file, contents, size)
        if self._contents[:4] != b"\377tOc":
            raise AssertionError("Not a v2 pack index file")
        (self.version,) = unpack_from(b">L", self._contents, 4)
        if self.version != 2:
            raise AssertionError(f"Version was {self.version}")
        self._fan_out_table = self._read_fan_out_table(8)
        count = len(self)
        # Layout after the 8-byte header and fan-out table: name table,
        # CRC32 table, 4-byte offset table, then the 8-byte table for
        # offsets that do not fit in 31 bits.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * count
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * count
        self._pack_offset_largetable_offset = (
            self._pack_offset_table_offset + 4 * count
        )

    def _unpack_entry(self, i):
        """Return the (name, offset, crc32) triple for entry *i*."""
        return (
            self._unpack_name(i),
            self._unpack_offset(i),
            self._unpack_crc32_checksum(i),
        )

    def _unpack_name(self, i):
        """Return the binary SHA of entry *i* from the name table."""
        start = self._name_table_offset + i * 20
        return self._contents[start : start + 20]

    def _unpack_offset(self, i):
        """Return the pack offset of entry *i*.

        Offsets with the high bit set are indirections into the 64-bit
        large-offset table (for packs over 2GB).
        """
        pos = self._pack_offset_table_offset + i * 4
        value = unpack_from(">L", self._contents, pos)[0]
        if value & (2**31):
            pos = self._pack_offset_largetable_offset + (value & (2**31 - 1)) * 8
            value = unpack_from(">Q", self._contents, pos)[0]
        return value

    def _unpack_crc32_checksum(self, i):
        """Return the CRC32 checksum of entry *i*."""
        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]

931 

932 

class PackIndex3(FilePackIndex):
    """Version 3 Pack Index file.

    Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
    """

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v3 index, reading its hash algorithm and table layout."""
        super().__init__(filename, file, contents, size)
        if self._contents[:4] != b"\377tOc":
            raise AssertionError("Not a v3 pack index file")
        (self.version,) = unpack_from(b">L", self._contents, 4)
        if self.version != 3:
            raise AssertionError(f"Version was {self.version}")

        # Hash algorithm identifier: 1 = SHA-1, 2 = SHA-256.
        (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
        if self.hash_algorithm == 1:
            self.hash_size = 20  # SHA-1
        elif self.hash_algorithm == 2:
            self.hash_size = 32  # SHA-256
        else:
            raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")

        # Length of shortened object names.
        (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)

        # The 16-byte header (magic, version, hash algorithm, shortened OID
        # length) is followed by the fan-out table, then the name, CRC32,
        # offset and large-offset tables.
        self._fan_out_table = self._read_fan_out_table(16)
        count = len(self)
        self._name_table_offset = 16 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + self.hash_size * count
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * count
        self._pack_offset_largetable_offset = (
            self._pack_offset_table_offset + 4 * count
        )

    def _unpack_entry(self, i):
        """Return the (name, offset, crc32) triple for entry *i*."""
        return (
            self._unpack_name(i),
            self._unpack_offset(i),
            self._unpack_crc32_checksum(i),
        )

    def _unpack_name(self, i):
        """Return the binary object name of entry *i* (hash_size bytes)."""
        start = self._name_table_offset + i * self.hash_size
        return self._contents[start : start + self.hash_size]

    def _unpack_offset(self, i):
        """Return the pack offset of entry *i*.

        Offsets with the high bit set are indirections into the 64-bit
        large-offset table.
        """
        pos = self._pack_offset_table_offset + i * 4
        value = unpack_from(">L", self._contents, pos)[0]
        if value & (2**31):
            pos = self._pack_offset_largetable_offset + (value & (2**31 - 1)) * 8
            value = unpack_from(">Q", self._contents, pos)[0]
        return value

    def _unpack_crc32_checksum(self, i):
        """Return the CRC32 checksum of entry *i*."""
        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]

993 

994 

def read_pack_header(read) -> tuple[int, int]:
    """Read the header of a pack file.

    Args:
      read: Read function

    Returns:
      Tuple of (pack version, number of objects).

    Raises:
      AssertionError: if the data is too short to contain a pack header,
        the magic bytes are not ``PACK``, or the version is unsupported.
    """
    header = read(12)
    # A short read means the stream was empty or truncated mid-header;
    # previously only the fully-empty case was detected and a partial
    # header fell through to an opaque struct.error.
    if len(header) < 12:
        raise AssertionError("file too short to contain pack")
    if header[:4] != b"PACK":
        raise AssertionError(f"Invalid pack header {header!r}")
    (version,) = unpack_from(b">L", header, 4)
    # Only pack format versions 2 and 3 exist.
    if version not in (2, 3):
        raise AssertionError(f"Version was {version}")
    (num_objects,) = unpack_from(b">L", header, 8)
    return (version, num_objects)

1013 

1014 

def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
    """Get the total length of a sequence of chunks.

    Args:
      chunks: Either a single bytes object or an iterable of bytes
    Returns: Total length in bytes
    """
    # A bare bytes object is treated as a single chunk.
    if isinstance(chunks, bytes):
        return len(chunks)
    return sum(len(chunk) for chunk in chunks)

1026 

1027 

def unpack_object(
    read_all: Callable[[int], bytes],
    read_some: Optional[Callable[[int], bytes]] = None,
    compute_crc32=False,
    include_comp=False,
    zlib_bufsize=_ZLIB_BUFSIZE,
) -> tuple[UnpackedObject, bytes]:
    """Unpack a Git object.

    Args:
      read_all: Read function that blocks until the number of requested
        bytes are read.
      read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
      compute_crc32: If True, compute the CRC32 of the compressed data. If
        False, the returned CRC32 will be None.
      include_comp: If True, include compressed data in the result.
      zlib_bufsize: An optional buffer size for zlib operations.
    Returns: A tuple of (unpacked, unused), where unused is the unused data
        leftover from decompression, and unpacked in an UnpackedObject with
        the following attrs set:

        * obj_chunks (for non-delta types)
        * pack_type_num
        * delta_base (for delta types)
        * comp_chunks (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32 (if compute_crc32 is True)
    """
    if read_some is None:
        read_some = read_all
    # crc32 doubles as the "compute a checksum" flag: None disables it.
    if compute_crc32:
        crc32 = 0
    else:
        crc32 = None

    # Object header: a run of MSB-continued bytes. The first byte holds the
    # type number in bits 4-6 and the low 4 bits of the size; each following
    # byte contributes 7 more (higher) bits of the size.
    raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
    type_num = (raw[0] >> 4) & 0x07
    size = raw[0] & 0x0F
    for i, byte in enumerate(raw[1:]):
        size += (byte & 0x7F) << ((i * 7) + 4)

    delta_base: Union[int, bytes, None]
    raw_base = len(raw)
    if type_num == OFS_DELTA:
        # Delta base is encoded as a backwards offset from this object.
        raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
        raw_base += len(raw)
        # take_msb_bytes stops after a byte without the continuation bit, so
        # the last byte having its MSB set indicates a malformed encoding.
        if raw[-1] & 0x80:
            raise AssertionError
        delta_base_offset = raw[0] & 0x7F
        for byte in raw[1:]:
            # Git's offset encoding adds 1 before each shift so multi-byte
            # encodings have no redundant representations.
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += byte & 0x7F
        delta_base = delta_base_offset
    elif type_num == REF_DELTA:
        # Delta base is a raw 20-byte SHA-1 identifying the base object.
        delta_base_obj = read_all(20)
        if crc32 is not None:
            crc32 = binascii.crc32(delta_base_obj, crc32)
        delta_base = delta_base_obj
        raw_base += 20
    else:
        delta_base = None

    unpacked = UnpackedObject(
        type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
    )
    # Decompress the object body; any bytes read past the zlib stream are
    # returned so the caller can push them back.
    unused = read_zlib_chunks(
        read_some,
        unpacked,
        buffer_size=zlib_bufsize,
        include_comp=include_comp,
    )
    return unpacked, unused

1103 

1104 

def _compute_object_size(value):
    """Compute the size of an unresolved object for use with LRUSizeCache.

    Args:
      value: Tuple of (type_num, object_chunks)
    Returns: Size in bytes
    """
    type_num, obj = value
    if type_num in DELTA_TYPES:
        # For deltas only the delta payload (second element) is counted.
        return chunks_length(obj[1])
    return chunks_length(obj)

1116 

1117 

class PackStreamReader:
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
        """Initialize the reader.

        Args:
          read_all: Read function that blocks until the requested number of
            bytes are read.
          read_some: Read function that returns at least one byte; defaults
            to ``read_all`` when not given.
          zlib_bufsize: Buffer size for zlib decompression.
        """
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        else:
            self.read_some = read_some
        self.sha = sha1()
        self._offset = 0
        self._rbuf = BytesIO()
        # trailer is a deque to avoid memory allocation on small reads;
        # it holds the last 20 bytes seen, which may be the pack checksum.
        self._trailer: deque[bytes] = deque()
        self._zlib_bufsize = zlib_bufsize

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read).

        Args:
          read: The read callback to read from.
          size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        Returns: Bytes read
        """
        data = read(size)

        # maintain a trailer of the last 20 bytes we've read; anything pushed
        # out of the trailer is known not to be the checksum and is hashed.
        n = len(data)
        self._offset += n
        tn = len(self._trailer)
        if n >= 20:
            to_pop = tn
            to_add = 20
        else:
            to_pop = max(n + tn - 20, 0)
            to_add = n
        self.sha.update(
            bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
        )
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])
        return data

    def _buf_len(self):
        """Get the number of bytes in the read buffer."""
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        end = buf.tell()
        # Restore the original position so buffered data is not lost.
        buf.seek(start)
        return end - start

    @property
    def offset(self):
        """Return the current offset in the pack stream."""
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        if buf_len >= size:
            return self._rbuf.read(size)
        # Drain the buffer, then fetch the remainder from the wire.
        buf_data = self._rbuf.read()
        self._rbuf = BytesIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        if buf_len:
            data = self._rbuf.read(size)
            if size >= buf_len:
                # Buffer fully consumed; drop it.
                self._rbuf = BytesIO()
            return data
        return self._read(self.read_some, size)

    def __len__(self) -> int:
        # Number of objects in the pack; only valid once read_objects() has
        # parsed the pack header and set _num_objects.
        return self._num_objects

    def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
        """Read the objects in this pack file.

        Args:
          compute_crc32: If True, compute the CRC32 of the compressed
            data. If False, the returned CRC32 will be None.
        Returns: Iterator over UnpackedObjects with the following members set:
            offset
            obj_type_num
            obj_chunks (for non-delta types)
            delta_base (for delta types)
            decomp_chunks
            decomp_len
            crc32 (if compute_crc32 is True)

        Raises:
          ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
          zlib.error: if an error occurred during zlib decompression.
          IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)

        for _ in range(self._num_objects):
            offset = self.offset
            unpacked, unused = unpack_object(
                self.read,
                read_some=self.recv,
                compute_crc32=compute_crc32,
                zlib_bufsize=self._zlib_bufsize,
            )
            unpacked.offset = offset

            # prepend any unused data to current read buffer
            buf = BytesIO()
            buf.write(unused)
            buf.write(self._rbuf.read())
            buf.seek(0)
            self._rbuf = buf

            yield unpacked

        if self._buf_len() < 20:
            # If the read buffer is full, then the last read() got the whole
            # trailer off the wire. If not, it means there is still some of the
            # trailer to read. We need to read() all 20 bytes; N come from the
            # read buffer and (20 - N) come from the wire.
            self.read(20)

        pack_sha = bytearray(self._trailer)  # type: ignore
        if pack_sha != self.sha.digest():
            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())

1259 

1260 

class PackStreamCopier(PackStreamReader):
    """Class to verify a pack stream as it is being read.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate and written out to the given file-like object.
    """

    def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
        """Initialize the copier.

        Args:
          read_all: Read function that blocks until the number of
            requested bytes are read.
          read_some: Read function that returns at least one byte, but may
            not return the number of bytes requested.
          outfile: File-like object to write output through.
          delta_iter: Optional DeltaChainIterator to record deltas as we
            read them.
        """
        super().__init__(read_all, read_some=read_some)
        self.outfile = outfile
        self._delta_iter = delta_iter

    def _read(self, read, size):
        """Read via the parent class (updating the SHA) and tee to outfile.

        Args:
          read: Read callback function
          size: Number of bytes to read
        Returns: Data read
        """
        chunk = super()._read(read, size)
        self.outfile.write(chunk)
        return chunk

    def verify(self, progress=None) -> None:
        """Verify a pack stream and write it to the output file.

        See PackStreamReader.iterobjects for a list of exceptions this may
        throw.
        """
        count = 0  # default count of entries if read_objects() is empty
        for count, unpacked in enumerate(self.read_objects()):
            if self._delta_iter:
                self._delta_iter.record(unpacked)
            if progress is not None:
                progress(f"copying pack entries: {count}/{len(self)}\r".encode("ascii"))
        if progress is not None:
            progress(f"copied {count} pack entries\n".encode("ascii"))

1310 

1311 

def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks.

    Args:
      type: Numeric type of the object
      chunks: Object data as bytes or iterable of bytes
    Returns: SHA-1 digest (20 bytes)
    """
    # Seed the digest with the canonical "<type> <length>\0" header.
    digest = sha1(object_header(type, chunks_length(chunks)))
    pieces = [chunks] if isinstance(chunks, bytes) else chunks
    for piece in pieces:
        digest.update(piece)
    return digest.digest()

1328 

1329 

def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
    """Hash a portion of a file into a new SHA.

    Args:
      f: A file-like object to read from that supports seek().
      start_ofs: The offset in the file to start reading at.
      end_ofs: The offset in the file to end reading at, relative to the
        end of the file.
      buffer_size: A buffer size for reading.
    Returns: A new SHA object updated with data read from the file.
    Raises:
      AssertionError: if the requested range lies outside the file, or the
        file ends before the requested range is fully read.
    """
    sha = sha1()
    f.seek(0, SEEK_END)
    length = f.tell()
    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
        raise AssertionError(
            f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
        )
    todo = length + end_ofs - start_ofs
    f.seek(start_ofs)
    while todo:
        data = f.read(min(todo, buffer_size))
        if not data:
            # Previously an empty read left `todo` unchanged and looped
            # forever; fail loudly if the file is truncated under us.
            raise AssertionError(f"unexpected EOF with {todo} bytes left to hash")
        sha.update(data)
        todo -= len(data)
    return sha

1355 

1356 

class PackData:
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(
        self,
        filename: Union[str, os.PathLike],
        file=None,
        size=None,
        *,
        delta_window_size=None,
        window_memory=None,
        delta_cache_size=None,
        depth=None,
        threads=None,
        big_file_threshold=None,
    ) -> None:
        """Create a PackData object representing the pack in the given filename.

        The file must exist and stay readable until the object is disposed of.
        It must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        # Pack-construction tuning knobs (mirroring git's pack.* config
        # options); stored as-is for use by repacking code.
        self.delta_window_size = delta_window_size
        self.window_memory = window_memory
        self.delta_cache_size = delta_cache_size
        self.depth = depth
        self.threads = threads
        self.big_file_threshold = big_file_threshold

        if file is None:
            self._file = GitFile(self._filename, "rb")
        else:
            self._file = file
        # Validates the PACK magic/version and records the object count.
        (version, self._num_objects) = read_pack_header(self._file.read)

        # Use delta_cache_size config if available, otherwise default
        cache_size = delta_cache_size or (1024 * 1024 * 20)
        self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
            cache_size, compute_size=_compute_object_size
        )

    @property
    def filename(self):
        """Return the base name of the pack file."""
        return os.path.basename(self._filename)

    @property
    def path(self):
        """Return the full path of the pack file."""
        return self._filename

    @classmethod
    def from_file(cls, file, size=None):
        """Create a PackData from an already-open file object."""
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path: Union[str, os.PathLike]):
        """Create a PackData from a path to a pack file."""
        return cls(filename=path)

    def close(self) -> None:
        """Close the underlying pack file."""
        self._file.close()

    def __enter__(self):
        """Enter context-manager scope."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the pack file when leaving the context."""
        self.close()

    def __eq__(self, other):
        """Check equality based on pack checksum."""
        if isinstance(other, PackData):
            return self.get_stored_checksum() == other.get_stored_checksum()
        return False

    def _get_size(self):
        """Get the size of the pack file.

        Returns: Size in bytes
        Raises: AssertionError if file is too small to be a pack
        """
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
            raise AssertionError(errmsg)
        return self._size

    def __len__(self) -> int:
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        Returns: 20-byte binary SHA1 digest
        """
        # The last 20 bytes of the file are the stored checksum itself.
        return compute_file_sha(self._file, end_ofs=-20).digest()

    def iter_unpacked(self, *, include_comp: bool = False):
        """Iterate over unpacked objects in the pack.

        Args:
          include_comp: If True, include compressed object data
        Yields: UnpackedObject instances
        """
        self._file.seek(self._header_size)

        if self._num_objects is None:
            return

        for _ in range(self._num_objects):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=False, include_comp=include_comp
            )
            unpacked.offset = offset
            yield unpacked
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def iterentries(
        self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
    ):
        """Yield entries summarizing the contents of this pack.

        Args:
          progress: Progress function, called with current and total
            object count.
          resolve_ext_ref: Optional function to resolve delta bases stored
            outside this pack.
        Returns: iterator of tuples with (sha, offset, crc32)
        """
        num_objects = self._num_objects
        indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
        for i, result in enumerate(indexer):
            if progress is not None:
                progress(i, num_objects)
            yield result

    def sorted_entries(
        self,
        progress: Optional[ProgressFn] = None,
        resolve_ext_ref: Optional[ResolveExtRefFn] = None,
    ):
        """Return entries in this pack, sorted by SHA.

        Args:
          progress: Progress function, called with current and total
            object count
          resolve_ext_ref: Optional function to resolve delta bases stored
            outside this pack.
        Returns: Iterator of tuples with (sha, offset, crc32)
        """
        return sorted(
            self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
        )

    def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
        """Create a version 1 file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v1(f, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
        """Create a version 2 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v2(f, entries, self.calculate_checksum())

    def create_index_v3(
        self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
    ):
        """Create a version 3 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          resolve_ext_ref: Function to resolve external references
          hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v3(
                f, entries, self.calculate_checksum(), hash_algorithm
            )

    def create_index(
        self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
    ):
        """Create an index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          version: Index version (1, 2, or 3)
          resolve_ext_ref: Function to resolve external references
          hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
        Returns: Checksum of index file
        """
        if version == 1:
            return self.create_index_v1(
                filename, progress, resolve_ext_ref=resolve_ext_ref
            )
        elif version == 2:
            return self.create_index_v2(
                filename, progress, resolve_ext_ref=resolve_ext_ref
            )
        elif version == 3:
            return self.create_index_v3(
                filename,
                progress,
                resolve_ext_ref=resolve_ext_ref,
                hash_algorithm=hash_algorithm,
            )
        else:
            raise ValueError(f"unknown index format {version}")

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        self._file.seek(-20, SEEK_END)
        return self._file.read(20)

    def check(self) -> None:
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def get_unpacked_object_at(
        self, offset: int, *, include_comp: bool = False
    ) -> UnpackedObject:
        """Given offset in the packfile return a UnpackedObject."""
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
        unpacked.offset = offset
        return unpacked

    def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        try:
            return self._offset_cache[offset]
        except KeyError:
            pass
        # NOTE(review): the cache is consulted above but never populated in
        # this method — presumably entries are inserted elsewhere; confirm.
        unpacked = self.get_unpacked_object_at(offset, include_comp=False)
        return (unpacked.pack_type_num, unpacked._obj())

1653 

1654 

# Result type produced by DeltaChainIterator subclasses via _result().
T = TypeVar("T")

1656 

1657 

class DeltaChainIterator(Generic[T]):
    """Abstract iterator over pack data based on delta chains.

    Each object in the pack is guaranteed to be inflated exactly once,
    regardless of how many objects reference it as a delta base. As a result,
    memory usage is proportional to the length of the longest delta chain.

    Subclasses can override _result to define the result type of the iterator.
    By default, results are UnpackedObjects with the following members set:

    * offset
    * obj_type_num
    * obj_chunks
    * pack_type_num
    * delta_base (for delta types)
    * comp_chunks (if _include_comp is True)
    * decomp_chunks
    * decomp_len
    * crc32 (if _compute_crc32 is True)
    """

    _compute_crc32 = False
    _include_comp = False

    def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
        """Initialize the iterator.

        Args:
          file_obj: Pack data file object (may be set later via
            set_pack_data).
          resolve_ext_ref: Optional callback resolving a SHA outside this
            pack to a (type_num, chunks) pair.
        """
        self._file = file_obj
        self._resolve_ext_ref = resolve_ext_ref
        # Deltas waiting on a base identified by pack offset.
        self._pending_ofs: dict[int, list[int]] = defaultdict(list)
        # Deltas waiting on a base identified by SHA.
        self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
        # Non-delta objects: (offset, type_num) chain roots.
        self._full_ofs: list[tuple[int, int]] = []
        # SHAs that were resolved via resolve_ext_ref.
        self._ext_refs: list[bytes] = []

    @classmethod
    def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
        """Create an iterator that records every object in *pack_data*."""
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack_data)
        for unpacked in pack_data.iter_unpacked(include_comp=False):
            walker.record(unpacked)
        return walker

    @classmethod
    def for_pack_subset(
        cls,
        pack: "Pack",
        shas: Iterable[bytes],
        *,
        allow_missing: bool = False,
        resolve_ext_ref=None,
    ):
        """Create an iterator over only the given SHAs plus their delta bases.

        Args:
          pack: Pack to read from.
          shas: Object SHAs to include.
          allow_missing: If True, silently skip SHAs not present in the pack.
          resolve_ext_ref: Optional external base resolver.
        """
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack.data)
        todo = set()
        for sha in shas:
            assert isinstance(sha, bytes)
            try:
                off = pack.index.object_offset(sha)
            except KeyError:
                if not allow_missing:
                    raise
            else:
                todo.add(off)
        # Transitively pull in delta bases so every chain can be resolved.
        done = set()
        while todo:
            off = todo.pop()
            unpacked = pack.data.get_unpacked_object_at(off)
            walker.record(unpacked)
            done.add(off)
            base_ofs = None
            if unpacked.pack_type_num == OFS_DELTA:
                assert unpacked.offset is not None
                assert unpacked.delta_base is not None
                assert isinstance(unpacked.delta_base, int)
                base_ofs = unpacked.offset - unpacked.delta_base
            elif unpacked.pack_type_num == REF_DELTA:
                with suppress(KeyError):
                    assert isinstance(unpacked.delta_base, bytes)
                    # NOTE(review): uses object_index here but object_offset
                    # above — presumably equivalent lookups; confirm.
                    base_ofs = pack.index.object_index(unpacked.delta_base)
            if base_ofs is not None and base_ofs not in done:
                todo.add(base_ofs)
        return walker

    def record(self, unpacked: UnpackedObject) -> None:
        """File *unpacked* as either a chain root or a pending delta."""
        type_num = unpacked.pack_type_num
        offset = unpacked.offset
        assert offset is not None
        if type_num == OFS_DELTA:
            assert unpacked.delta_base is not None
            assert isinstance(unpacked.delta_base, int)
            # delta_base is a backwards displacement from this object.
            base_offset = offset - unpacked.delta_base
            self._pending_ofs[base_offset].append(offset)
        elif type_num == REF_DELTA:
            assert isinstance(unpacked.delta_base, bytes)
            self._pending_ref[unpacked.delta_base].append(offset)
        else:
            self._full_ofs.append((offset, type_num))

    def set_pack_data(self, pack_data: PackData) -> None:
        """Use *pack_data*'s underlying file for subsequent reads."""
        self._file = pack_data._file

    def _walk_all_chains(self):
        """Yield results for every chain, starting from non-delta roots."""
        for offset, type_num in self._full_ofs:
            yield from self._follow_chain(offset, type_num, None)
        yield from self._walk_ref_chains()
        assert not self._pending_ofs, repr(self._pending_ofs)

    def _ensure_no_pending(self) -> None:
        """Raise UnresolvedDeltas if any ref-deltas are still unresolved."""
        if self._pending_ref:
            raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])

    def _walk_ref_chains(self):
        """Resolve remaining ref-deltas via the external ref resolver."""
        if not self._resolve_ext_ref:
            self._ensure_no_pending()
            return

        for base_sha, pending in sorted(self._pending_ref.items()):
            # _follow_chain pops entries from _pending_ref as it resolves
            # them, so a snapshot entry may already be gone.
            if base_sha not in self._pending_ref:
                continue
            try:
                type_num, chunks = self._resolve_ext_ref(base_sha)
            except KeyError:
                # Not an external ref, but may depend on one. Either it will
                # get popped via a _follow_chain call, or we will raise an
                # error below.
                continue
            self._ext_refs.append(base_sha)
            self._pending_ref.pop(base_sha)
            for new_offset in pending:
                yield from self._follow_chain(new_offset, type_num, chunks)

        self._ensure_no_pending()

    def _result(self, unpacked: UnpackedObject) -> T:
        """Convert a resolved UnpackedObject into the iterator's result type."""
        raise NotImplementedError

    def _resolve_object(
        self, offset: int, obj_type_num: int, base_chunks: list[bytes]
    ) -> UnpackedObject:
        """Inflate the object at *offset*, applying a delta base if given."""
        self._file.seek(offset)
        unpacked, _ = unpack_object(
            self._file.read,
            include_comp=self._include_comp,
            compute_crc32=self._compute_crc32,
        )
        unpacked.offset = offset
        if base_chunks is None:
            assert unpacked.pack_type_num == obj_type_num
        else:
            assert unpacked.pack_type_num in DELTA_TYPES
            unpacked.obj_type_num = obj_type_num
            unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
        return unpacked

    def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: list[bytes]):
        # Unlike PackData.get_object_at, there is no need to cache offsets as
        # this approach by design inflates each object exactly once.
        todo = [(offset, obj_type_num, base_chunks)]
        while todo:
            (offset, obj_type_num, base_chunks) = todo.pop()
            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
            yield self._result(unpacked)

            assert unpacked.offset is not None
            # Any deltas waiting on this object (by offset or by SHA) can
            # now be resolved against its inflated chunks.
            unblocked = chain(
                self._pending_ofs.pop(unpacked.offset, []),
                self._pending_ref.pop(unpacked.sha(), []),
            )
            todo.extend(
                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)  # type: ignore
                for new_offset in unblocked
            )

    def __iter__(self) -> Iterator[T]:
        """Iterate over all results, inflating each object exactly once."""
        return self._walk_all_chains()

    def ext_refs(self):
        """Return the SHAs that were resolved via the external ref resolver."""
        return self._ext_refs

1834 

1835 

class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
    """Delta chain iterator yielding the resolved UnpackedObject instances."""

    def _result(self, unpacked):
        """Pass the resolved object through unchanged."""
        return unpacked

1841 

1842 

class PackIndexer(DeltaChainIterator[PackIndexEntry]):
    """Delta chain iterator yielding (sha, offset, crc32) index entries."""

    # CRC32s are required for writing pack index entries.
    _compute_crc32 = True

    def _result(self, unpacked):
        """Convert a resolved object into an index entry tuple."""
        return (unpacked.sha(), unpacked.offset, unpacked.crc32)

1850 

1851 

class PackInflater(DeltaChainIterator[ShaFile]):
    """Delta chain iterator yielding fully-inflated ShaFile objects."""

    def _result(self, unpacked):
        """Materialize the resolved object as a ShaFile."""
        return unpacked.sha_file()

1857 

1858 

class SHA1Reader(BinaryIO):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f) -> None:
        """Wrap *f*; every byte read through this object updates the digest."""
        self.f = f
        self.sha1 = sha1(b"")

    def read(self, size: int = -1) -> bytes:
        """Read from the wrapped file and fold the data into the digest."""
        chunk = self.f.read(size)
        self.sha1.update(chunk)
        return chunk

    def check_sha(self, allow_empty: bool = False) -> None:
        """Compare the computed digest against the stored trailing checksum.

        Args:
          allow_empty: Also accept an all-zero stored checksum.
        Raises:
          ChecksumMismatch: if the digests differ.
        """
        stored = self.f.read(20)
        if stored == self.sha1.digest():
            return
        # If git option index.skipHash is set the index will be empty
        if allow_empty and sha_to_hex(stored) == b"0000000000000000000000000000000000000000":
            return
        raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        """Close the wrapped file."""
        return self.f.close()

    def tell(self) -> int:
        """Return the wrapped file's current position."""
        return self.f.tell()

    # BinaryIO abstract methods
    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        return False

    def seekable(self) -> bool:
        return getattr(self.f, "seekable", lambda: False)()

    def seek(self, offset: int, whence: int = 0) -> int:
        return self.f.seek(offset, whence)

    def flush(self) -> None:
        if hasattr(self.f, "flush"):
            self.f.flush()

    def readline(self, size: int = -1) -> bytes:
        return self.f.readline(size)

    def readlines(self, hint: int = -1) -> list[bytes]:
        return self.f.readlines(hint)

    def writelines(self, lines) -> None:
        raise UnsupportedOperation("writelines")

    def write(self, data) -> int:
        raise UnsupportedOperation("write")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        chunk = self.readline()
        if chunk:
            return chunk
        raise StopIteration

    def fileno(self) -> int:
        return self.f.fileno()

    def isatty(self) -> bool:
        return getattr(self.f, "isatty", lambda: False)()

    def truncate(self, size: Optional[int] = None) -> int:
        raise UnsupportedOperation("truncate")

1938 

1939 

class SHA1Writer(BinaryIO):
    """File-like wrapper that hashes everything written through it."""

    def __init__(self, f) -> None:
        self.f = f
        self.length = 0  # bytes written so far, incl. any trailing SHA
        self.sha1 = sha1(b"")

    def write(self, data) -> int:
        """Write *data*, updating the running SHA1 and byte count."""
        self.sha1.update(data)
        self.f.write(data)
        self.length += len(data)
        return len(data)

    def write_sha(self):
        """Append the 20-byte SHA1 digest of everything written so far."""
        digest = self.sha1.digest()
        assert len(digest) == 20
        self.f.write(digest)
        self.length += len(digest)
        return digest

    def close(self):
        """Write the trailing SHA, close the file, and return the digest."""
        digest = self.write_sha()
        self.f.close()
        return digest

    def offset(self):
        """Return the number of bytes written so far."""
        return self.length

    def tell(self) -> int:
        return self.f.tell()

    # BinaryIO abstract methods
    def readable(self) -> bool:
        return False

    def writable(self) -> bool:
        return True

    def seekable(self) -> bool:
        return getattr(self.f, "seekable", lambda: False)()

    def seek(self, offset: int, whence: int = 0) -> int:
        return self.f.seek(offset, whence)

    def flush(self) -> None:
        if hasattr(self.f, "flush"):
            self.f.flush()

    def readline(self, size: int = -1) -> bytes:
        raise UnsupportedOperation("readline")

    def readlines(self, hint: int = -1) -> list[bytes]:
        raise UnsupportedOperation("readlines")

    def writelines(self, lines) -> None:
        for chunk in lines:
            self.write(chunk)

    def read(self, size: int = -1) -> bytes:
        raise UnsupportedOperation("read")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        raise UnsupportedOperation("__next__")

    def fileno(self) -> int:
        return self.f.fileno()

    def isatty(self) -> bool:
        return getattr(self.f, "isatty", lambda: False)()

    def truncate(self, size: Optional[int] = None) -> int:
        raise UnsupportedOperation("truncate")

2023 

def pack_object_header(type_num, delta_base, size):
    """Create a pack object header for the given object info.

    Args:
      type_num: Numeric type of the object.
      delta_base: Delta base offset or ref, or None for whole objects.
      size: Uncompressed object size.
    Returns: A header for a packed object (bytearray).
    """
    header = bytearray()
    c = (type_num << 4) | (size & 15)
    size >>= 4
    # Variable-length size: 7 bits per byte, high bit = "more follows".
    while size:
        header.append(c | 0x80)
        c = size & 0x7F
        size >>= 7
    header.append(c)
    if type_num == OFS_DELTA:
        # Encode the base offset in git's big-endian varint-with-bias form.
        encoded = [delta_base & 0x7F]
        delta_base >>= 7
        while delta_base:
            delta_base -= 1
            encoded.insert(0, 0x80 | (delta_base & 0x7F))
            delta_base >>= 7
        header.extend(encoded)
    elif type_num == REF_DELTA:
        assert len(delta_base) == 20
        header += delta_base
    return header

2054 

def pack_object_chunks(type, object, compression_level=-1):
    """Generate chunks for a pack object.

    Args:
      type: Numeric type of the object
      object: Object to write (chunks, bytes, or (base, chunks) for deltas)
      compression_level: the zlib compression level
    Returns: Chunks
    """
    if type in DELTA_TYPES:
        delta_base, object = object
    else:
        delta_base = None
    chunks = [object] if isinstance(object, bytes) else object
    # Header first, then the zlib-compressed payload.
    yield bytes(pack_object_header(type, delta_base, sum(map(len, chunks))))
    compressor = zlib.compressobj(level=compression_level)
    for chunk in chunks:
        yield compressor.compress(chunk)
    yield compressor.flush()

2076 

def write_pack_object(write, type, object, sha=None, compression_level=-1):
    """Write pack object to a file.

    Args:
      write: Write function to use
      type: Numeric type of the object
      object: Object to write
      sha: Optional hash object updated with every chunk written
      compression_level: the zlib compression level
    Returns: CRC32 checksum of the data written for this object
    """
    # Doc fix: the previous docstring claimed an (offset, crc32) tuple was
    # returned; only the crc32 is.
    crc32 = 0
    for chunk in pack_object_chunks(type, object, compression_level=compression_level):
        write(chunk)
        if sha is not None:
            sha.update(chunk)
        crc32 = binascii.crc32(chunk, crc32)
    return crc32 & 0xFFFFFFFF

2095 

def write_pack(
    filename,
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    deltify: Optional[bool] = None,
    delta_window_size: Optional[int] = None,
    compression_level: int = -1,
):
    """Write a new pack data file and its matching index.

    Args:
      filename: Path to the new pack file (without .pack extension)
      objects: Objects to write, optionally paired with a path hint
      deltify: Whether to deltify pack objects
      delta_window_size: Delta window size
      compression_level: the zlib compression level
    Returns: Tuple with checksum of pack file and index file
    """
    with GitFile(filename + ".pack", "wb") as pack_file:
        entries, data_sum = write_pack_objects(
            pack_file.write,
            objects,
            delta_window_size=delta_window_size,
            deltify=deltify,
            compression_level=compression_level,
        )
    # Index entries must be sorted by object name.
    index_entries = sorted((sha, off, crc) for (sha, (off, crc)) in entries.items())
    with GitFile(filename + ".idx", "wb") as idx_file:
        return data_sum, write_pack_index(idx_file, index_entries, data_sum)

2125 

def pack_header_chunks(num_objects):
    """Yield the chunks that make up a v2 pack header."""
    # 4-byte magic, 4-byte version (always 2), 4-byte object count.
    yield b"PACK"
    yield struct.pack(b">L", 2)
    yield struct.pack(b">L", num_objects)

2132 

def write_pack_header(write, num_objects) -> None:
    """Write a pack header for the given number of objects."""
    if hasattr(write, "write"):
        # Older callers passed the file object itself; accept but warn.
        write = write.write
        warnings.warn(
            "write_pack_header() now takes a write rather than file argument",
            DeprecationWarning,
            stacklevel=2,
        )
    for piece in pack_header_chunks(num_objects):
        write(piece)

2145 

def find_reusable_deltas(
    container: PackedObjectContainer,
    object_ids: set[bytes],
    *,
    other_haves: Optional[set[bytes]] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Yield REF_DELTA objects whose bases the receiver already has.

    Args:
      container: Container to scan for existing deltas.
      object_ids: Hex ids of the objects being packed.
      other_haves: Additional hex ids known to be available remotely.
      progress: Optional progress callback taking bytes.
    """
    if other_haves is None:
        other_haves = set()
    reused = 0
    for i, unpacked in enumerate(
        container.iter_unpacked_subset(
            object_ids, allow_missing=True, convert_ofs_delta=True
        )
    ):
        if progress is not None and i % 1000 == 0:
            progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
        if unpacked.pack_type_num != REF_DELTA:
            continue
        hexsha = sha_to_hex(unpacked.delta_base)  # type: ignore
        if hexsha in object_ids or hexsha in other_haves:
            yield unpacked
            reused += 1
    if progress is not None:
        progress((f"found {reused} deltas to reuse\n").encode())

2171 

def deltify_pack_objects(
    objects: Union[Iterator[bytes], Iterator[tuple[ShaFile, Optional[bytes]]]],
    *,
    window_size: Optional[int] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Generate deltas for pack objects.

    Args:
      objects: An iterable of (object, path) tuples to deltify.
      window_size: Window size; None for default
      progress: Optional progress callback
    Returns: Iterator over UnpackedObjects; delta_base is None for full
      text entries
    """

    def _with_hints():
        # Normalize both accepted shapes to (object, (type_num, path)).
        for entry in objects:
            if isinstance(entry, ShaFile):
                yield (entry, (entry.type_num, None))
            else:
                yield (entry[0], (entry[0].type_num, entry[1]))

    yield from deltas_from_sorted_objects(
        sort_objects_for_delta(_with_hints()),
        window_size=window_size,
        progress=progress,
    )

2200 

def sort_objects_for_delta(
    objects: Union[
        Iterator["ShaFile"], Iterator[tuple["ShaFile", Optional["PackHint"]]]
    ],
) -> Iterator["ShaFile"]:
    """Sort objects into a delta-friendly order.

    Objects are keyed by (type, path, descending size) — the "magic Linus
    heuristic" — so that likely delta bases end up next to the objects
    they compress well.

    Args:
      objects: Iterator of ShaFile objects, or of (object, hint) tuples
        where hint is an optional (type_num, path) pair.
    Returns: Iterator over the objects in sorted order.
    """
    magic = []
    for entry in objects:
        if isinstance(entry, tuple):
            obj, hint = entry
            if hint is None:
                type_num = None
                path = None
            else:
                (type_num, path) = hint
        else:
            obj = entry
            # Bug fix: bare ShaFile entries previously left type_num/path
            # unbound (NameError on the first entry) or silently reused the
            # hint from an earlier tuple entry.
            type_num = None
            path = None
        magic.append((type_num, path, -obj.raw_length(), obj))
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    magic.sort()
    return (x[3] for x in magic)

2221 

def deltas_from_sorted_objects(
    objects, window_size: Optional[int] = None, progress=None
):
    """Yield ``UnpackedObject``s, deltified against recent objects when smaller.

    Args:
      objects: Iterable of objects, pre-sorted so that good delta bases are
        close together (see ``sort_objects_for_delta``).
      window_size: How many recent objects to consider as delta bases;
        None for the default.
      progress: Optional progress callback taking bytes.
    """
    # TODO(jelmer): Use threads
    if window_size is None:
        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE

    # Sliding window of recent (sha, type_num, chunks) candidate bases.
    possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
    for i, o in enumerate(objects):
        if progress is not None and i % 1000 == 0:
            progress((f"generating deltas: {i}\r").encode())
        raw = o.as_raw_chunks()
        # Start with the full text as the winner; a delta must beat its size.
        winner = raw
        winner_len = sum(map(len, winner))
        winner_base = None
        for base_id, base_type_num, base in possible_bases:
            if base_type_num != o.type_num:
                continue
            delta_len = 0
            delta = []
            for chunk in create_delta(base, raw):
                delta_len += len(chunk)
                if delta_len >= winner_len:
                    # Delta is already at least as large as the current
                    # winner; abandon it early.
                    break
                delta.append(chunk)
            else:
                # Loop completed without break: this delta is the new winner.
                winner_base = base_id
                winner = delta
                winner_len = sum(map(len, winner))
        yield UnpackedObject(
            o.type_num,
            sha=o.sha().digest(),
            delta_base=winner_base,
            decomp_len=winner_len,
            decomp_chunks=winner,
        )
        # The current object becomes the freshest candidate base; evict
        # anything that has fallen out of the window.
        possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
        while len(possible_bases) > window_size:
            possible_bases.pop()

2262 

def pack_objects_to_data(
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    deltify: Optional[bool] = None,
    delta_window_size: Optional[int] = None,
    ofs_delta: bool = True,
    progress=None,
) -> tuple[int, Iterator[UnpackedObject]]:
    """Create pack data from objects.

    Args:
      objects: Pack objects, optionally paired with a path hint
      deltify: Whether to compute deltas; None currently means False
      delta_window_size: Delta window size; None for default
      ofs_delta: Accepted for interface compatibility (unused here)
      progress: Optional progress callback
    Returns: Tuple of (object count, iterator over UnpackedObjects)
    """
    count = len(objects)
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if deltify:
        deltified = deltify_pack_objects(
            iter(objects),  # type: ignore
            window_size=delta_window_size,
            progress=progress,
        )
        return (count, deltified)

    def _strip_paths():
        # Drop any path hints; each object becomes a full (non-delta) record.
        for entry in objects:
            obj = entry[0] if isinstance(entry, tuple) else entry
            yield full_unpacked_object(obj)

    return (count, _strip_paths())

2303 

def generate_unpacked_objects(
    container: PackedObjectContainer,
    object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    reuse_deltas: bool = True,
    ofs_delta: bool = True,
    other_haves: Optional[set[bytes]] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Create pack data from objects in a container.

    Returns: Iterator over UnpackedObjects
    """
    remaining = dict(object_ids)
    if reuse_deltas:
        # Existing deltas whose bases are available can be copied verbatim.
        for unpack in find_reusable_deltas(
            container, set(remaining), other_haves=other_haves, progress=progress
        ):
            del remaining[sha_to_hex(unpack.sha())]
            yield unpack
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if not deltify:
        for oid in remaining:
            yield full_unpacked_object(container[oid])
        return
    candidates = container.iterobjects_subset(remaining.keys(), allow_missing=False)
    yield from deltas_from_sorted_objects(
        sort_objects_for_delta((o, remaining[o.id]) for o in candidates),
        window_size=delta_window_size,
        progress=progress,
    )

2342 

def full_unpacked_object(o: ShaFile) -> UnpackedObject:
    """Wrap a ShaFile as a non-deltified ``UnpackedObject``."""
    return UnpackedObject(
        o.type_num,
        delta_base=None,
        crc32=None,
        decomp_chunks=o.as_raw_chunks(),
        sha=o.sha().digest(),
    )

2352 

def write_pack_from_container(
    write,
    container: PackedObjectContainer,
    object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    reuse_deltas: bool = True,
    compression_level: int = -1,
    other_haves: Optional[set[bytes]] = None,
):
    """Write a new pack data file sourced from an object container.

    Args:
      write: write function to use
      container: PackedObjectContainer
      object_ids: Sequence of (object id, pack hint) pairs to include
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      reuse_deltas: Whether to reuse existing deltas from the container
      compression_level: the zlib compression level to use
      other_haves: Other object ids known to be available to the receiver
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    record_count = len(object_ids)
    records = generate_unpacked_objects(
        container,
        object_ids,
        delta_window_size=delta_window_size,
        deltify=deltify,
        reuse_deltas=reuse_deltas,
        other_haves=other_haves,
    )
    return write_pack_data(
        write,
        records,
        num_records=record_count,
        compression_level=compression_level,
    )

2391 

def write_pack_objects(
    write,
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    compression_level: int = -1,
):
    """Write a new pack data file.

    Args:
      write: write function to use
      objects: Sequence of (object, path) tuples to write
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      compression_level: the zlib compression level to use
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Bug fix: delta_window_size was accepted and documented but never
    # forwarded, so callers could not actually tune the delta window.
    pack_contents_count, pack_contents = pack_objects_to_data(
        objects, deltify=deltify, delta_window_size=delta_window_size
    )

    return write_pack_data(
        write,
        pack_contents,
        num_records=pack_contents_count,
        compression_level=compression_level,
    )

2420 

class PackChunkGenerator:
    """Generate the byte chunks of a pack data file from UnpackedObjects.

    Iterating the instance yields the pack bytes; as a side effect,
    ``self.entries`` is filled with per-object (offset, crc32) pairs and
    ``sha1digest()`` reports the checksum of everything yielded so far.
    """

    def __init__(
        self,
        num_records=None,
        records=None,
        progress=None,
        compression_level=-1,
        reuse_compressed=True,
    ) -> None:
        self.cs = sha1(b"")
        # Maps object sha -> (offset, crc32) for objects already emitted.
        self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
        self._it = self._pack_data_chunks(
            num_records=num_records,
            records=records,
            progress=progress,
            compression_level=compression_level,
            reuse_compressed=reuse_compressed,
        )

    def sha1digest(self):
        """Return the SHA1 digest of all chunks yielded so far."""
        return self.cs.digest()

    def __iter__(self):
        return self._it

    def _pack_data_chunks(
        self,
        records: Iterator[UnpackedObject],
        *,
        num_records=None,
        progress=None,
        compression_level: int = -1,
        reuse_compressed: bool = True,
    ) -> Iterator[bytes]:
        """Iterate pack data file chunks.

        Args:
          records: Iterator over UnpackedObject
          num_records: Number of records (defaults to len(records) if not specified)
          progress: Function to report progress to
          compression_level: the zlib compression level
          reuse_compressed: Re-emit pre-compressed chunks when present
        Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
        """
        # Write the pack
        if num_records is None:
            num_records = len(records)  # type: ignore
        offset = 0
        for chunk in pack_header_chunks(num_records):
            yield chunk
            self.cs.update(chunk)
            offset += len(chunk)
        actual_num_records = 0
        for i, unpacked in enumerate(records):
            type_num = unpacked.pack_type_num
            if progress is not None and i % 1000 == 0:
                progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
            raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
            if unpacked.delta_base is not None:
                try:
                    base_offset, base_crc32 = self.entries[unpacked.delta_base]
                except KeyError:
                    # Base not (yet) written into this pack: fall back to a
                    # REF_DELTA addressed by the base's SHA.
                    type_num = REF_DELTA
                    assert isinstance(unpacked.delta_base, bytes)
                    raw = (unpacked.delta_base, unpacked.decomp_chunks)
                else:
                    # Base already written: use a relative OFS_DELTA.
                    type_num = OFS_DELTA
                    raw = (offset - base_offset, unpacked.decomp_chunks)
            else:
                raw = unpacked.decomp_chunks
            if unpacked.comp_chunks is not None and reuse_compressed:
                chunks = unpacked.comp_chunks
            else:
                chunks = pack_object_chunks(
                    type_num, raw, compression_level=compression_level
                )
            crc32 = 0
            object_size = 0
            for chunk in chunks:
                yield chunk
                crc32 = binascii.crc32(chunk, crc32)
                self.cs.update(chunk)
                object_size += len(chunk)
            actual_num_records += 1
            self.entries[unpacked.sha()] = (offset, crc32)
            offset += object_size
        if actual_num_records != num_records:
            # The header already promised num_records; a mismatch would
            # produce a corrupt pack.
            raise AssertionError(
                f"actual records written differs: {actual_num_records} != {num_records}"
            )

        # Trailing pack checksum.
        yield self.cs.digest()

2513 

def write_pack_data(
    write,
    records: Iterator[UnpackedObject],
    *,
    num_records=None,
    progress=None,
    compression_level=-1,
):
    """Write a new pack data file.

    Args:
      write: Write function to use
      records: Iterator over UnpackedObjects
      num_records: Number of records (defaults to len(records) if None)
      progress: Function to report progress to
      compression_level: the zlib compression level
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    generator = PackChunkGenerator(
        num_records=num_records,
        records=records,
        progress=progress,
        compression_level=compression_level,
    )
    for piece in generator:
        write(piece)
    return generator.entries, generator.sha1digest()

2542 

def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    Args:
      f: A file-like object to write to
      entries: Iterable of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the written index file
    """
    # Materialize first: entries is iterated twice below, so a one-shot
    # iterator would otherwise silently produce a corrupt index.
    entries = list(entries)
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for name, _offset, _entry_checksum in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table: cumulative object counts by first SHA byte.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]
    for name, offset, _entry_checksum in entries:
        if not (offset <= 0xFFFFFFFF):
            raise TypeError("pack format 1 only supports offsets < 2Gb")
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()

2569 

2570def _delta_encode_size(size) -> bytes: 

2571 ret = bytearray() 

2572 c = size & 0x7F 

2573 size >>= 7 

2574 while size: 

2575 ret.append(c | 0x80) 

2576 c = size & 0x7F 

2577 size >>= 7 

2578 ret.append(c) 

2579 return bytes(ret) 

2580 

2581 

# The length of delta compression copy operations in version 2 packs is limited
# to 64K. To copy more, we use several copy operations. Version 3 packs allow
# 24-bit lengths in copy operations, but we always make version 2 packs.
# (create_delta splits longer copy ranges across multiple opcodes.)
_MAX_COPY_LEN = 0xFFFF

2587 

2588def _encode_copy_operation(start, length): 

2589 scratch = bytearray([0x80]) 

2590 for i in range(4): 

2591 if start & 0xFF << i * 8: 

2592 scratch.append((start >> i * 8) & 0xFF) 

2593 scratch[0] |= 1 << i 

2594 for i in range(2): 

2595 if length & 0xFF << i * 8: 

2596 scratch.append((length >> i * 8) & 0xFF) 

2597 scratch[0] |= 1 << (4 + i) 

2598 return bytes(scratch) 

2599 

2600 

def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    Args:
      base_buf: Base buffer
      target_buf: Target buffer
    Yields: Delta chunks — two varint size headers followed by copy/insert
      opcodes.
    """
    if isinstance(base_buf, list):
        base_buf = b"".join(base_buf)
    if isinstance(target_buf, list):
        target_buf = b"".join(target_buf)
    assert isinstance(base_buf, bytes)
    assert isinstance(target_buf, bytes)
    # write delta header
    yield _delta_encode_size(len(base_buf))
    yield _delta_encode_size(len(target_buf))
    # write out delta opcodes
    seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        # if opcode == 'replace' or opcode == 'delete':
        #     pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            copy_start = i1
            copy_len = i2 - i1
            # A single v2 copy opcode tops out at _MAX_COPY_LEN (64K), so
            # longer ranges are split across several opcodes.
            while copy_len > 0:
                to_copy = min(copy_len, _MAX_COPY_LEN)
                yield _encode_copy_operation(copy_start, to_copy)
                copy_start += to_copy
                copy_len -= to_copy
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            # Insert opcodes carry at most 127 literal bytes each.
            s = j2 - j1
            o = j1
            while s > 127:
                yield bytes([127])
                yield memoryview(target_buf)[o : o + 127]
                s -= 127
                o += 127
            yield bytes([s])
            yield memoryview(target_buf)[o : o + s]

2646 

def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    Args:
      src_buf: Source buffer
      delta: Delta instructions
    Returns: List of chunks making up the reconstructed target buffer
    Raises:
      ApplyDeltaError: if the delta is malformed or inconsistent with
        src_buf.
    """
    if not isinstance(src_buf, bytes):
        src_buf = b"".join(src_buf)
    if not isinstance(delta, bytes):
        delta = b"".join(delta)
    out = []
    index = 0
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        # Decode a little-endian base-128 varint; returns (value, new index).
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index : index + 1])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index

    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    if src_size != len(src_buf):
        raise ApplyDeltaError(
            f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
        )
    while index < delta_length:
        cmd = ord(delta[index : index + 1])
        index += 1
        if cmd & 0x80:
            # Copy opcode: bits 0-3 select offset bytes, bits 4-6 size bytes.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index : index + 1])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            # Version 3 packs can contain copy sizes larger than 64K.
            for i in range(3):
                if cmd & (1 << (4 + i)):
                    x = ord(delta[index : index + 1])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                # A size of zero means the maximum v2 copy length (64K).
                cp_size = 0x10000
            if (
                cp_off + cp_size < cp_size
                or cp_off + cp_size > src_size
                or cp_size > dest_size
            ):
                # Out-of-range copy: stop here; the final size check below
                # will then raise.
                break
            out.append(src_buf[cp_off : cp_off + cp_size])
        elif cmd != 0:
            # Insert opcode: cmd itself is the count of literal bytes.
            out.append(delta[index : index + cmd])
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")

    if dest_size != chunks_length(out):
        raise ApplyDeltaError("dest size incorrect")

    return out

2720 

def write_pack_index_v2(
    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
) -> bytes:
    """Write a new pack index file.

    Args:
      f: File-like object to write to
      entries: Iterable of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the index file written
    """
    # Materialize first: the declared type is Iterable but entries is
    # iterated four times below, so a one-shot iterator would silently
    # produce a corrupt index (the v3 writer already does this).
    entries = list(entries)
    f = SHA1Writer(f)
    f.write(b"\377tOc")  # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
    for name, offset, entry_checksum in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    largetable: list[int] = []
    for i in range(0x100):
        f.write(struct.pack(b">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]
    for name, offset, entry_checksum in entries:
        f.write(name)
    for name, offset, entry_checksum in entries:
        f.write(struct.pack(b">L", entry_checksum))
    for name, offset, entry_checksum in entries:
        if offset < 2**31:
            f.write(struct.pack(b">L", offset))
        else:
            # Large offsets are stored as an index into the 8-byte table.
            f.write(struct.pack(b">L", 2**31 + len(largetable)))
            largetable.append(offset)
    for offset in largetable:
        f.write(struct.pack(b">Q", offset))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()

2760 

def write_pack_index_v3(
    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
) -> bytes:
    """Write a new pack index file in v3 format.

    Args:
      f: File-like object to write to
      entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
      pack_checksum: Checksum of the pack file.
      hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
    Returns: The SHA of the index file written
    Raises:
      NotImplementedError: for SHA-256 (not yet supported)
      ValueError: for unknown hash algorithms or malformed entry names
    """
    if hash_algorithm == 1:
        hash_size = 20  # SHA-1
        writer_cls = SHA1Writer
    elif hash_algorithm == 2:
        hash_size = 32  # SHA-256
        # TODO: Add SHA256Writer when SHA-256 support is implemented
        raise NotImplementedError("SHA-256 support not yet implemented")
    else:
        raise ValueError(f"Unknown hash algorithm {hash_algorithm}")

    # Convert entries to list to allow multiple iterations
    entries_list = list(entries)

    # Calculate shortest unambiguous prefix length for object names
    # For now, use full hash size (this could be optimized)
    shortened_oid_len = hash_size

    f = writer_cls(f)
    f.write(b"\377tOc")  # Magic!
    f.write(struct.pack(">L", 3))  # Version 3
    f.write(struct.pack(">L", hash_algorithm))  # Hash algorithm
    f.write(struct.pack(">L", shortened_oid_len))  # Shortened OID length

    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
    for name, offset, entry_checksum in entries_list:
        if len(name) != hash_size:
            raise ValueError(
                f"Object name has wrong length: expected {hash_size}, got {len(name)}"
            )
        fan_out_table[ord(name[:1])] += 1

    # Fan-out table (cumulative counts of objects by first SHA byte)
    largetable: list[int] = []
    for i in range(0x100):
        f.write(struct.pack(b">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]

    # Object names table
    for name, offset, entry_checksum in entries_list:
        f.write(name)

    # CRC32 checksums table
    for name, offset, entry_checksum in entries_list:
        f.write(struct.pack(b">L", entry_checksum))

    # Offset table (offsets >= 2**31 index into the large offset table)
    for name, offset, entry_checksum in entries_list:
        if offset < 2**31:
            f.write(struct.pack(b">L", offset))
        else:
            f.write(struct.pack(b">L", 2**31 + len(largetable)))
            largetable.append(offset)

    # Large offset table
    for offset in largetable:
        f.write(struct.pack(b">Q", offset))

    assert len(pack_checksum) == hash_size, (
        f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
    )
    f.write(pack_checksum)
    return f.write_sha()

2837 

def write_pack_index(
    index_filename, entries, pack_checksum, progress=None, version=None
):
    """Write a pack index file.

    Args:
      index_filename: Index filename.
      entries: List of (checksum, offset, crc32) tuples
      pack_checksum: Checksum of the pack file.
      progress: Progress function (not currently used)
      version: Pack index version to use (1, 2, or 3). If None, defaults
        to DEFAULT_PACK_INDEX_VERSION.

    Returns:
      SHA of the written index file
    """
    if version is None:
        version = DEFAULT_PACK_INDEX_VERSION
    # Dispatch table keyed by index format version.
    writers = {
        1: write_pack_index_v1,
        2: write_pack_index_v2,
        3: write_pack_index_v3,
    }
    if version not in writers:
        raise ValueError(f"Unsupported pack index version: {version}")
    return writers[version](index_filename, entries, pack_checksum)

2865 

class Pack:
    """A Git pack object.

    Wraps a pack data file (``.pack``) and its index (``.idx``), loading
    each lazily on first use via the ``_data_load`` / ``_idx_load``
    callables set up in the constructor (or injected by the alternate
    constructors below).
    """

    # Callables used to lazily construct the data / index objects; None
    # once the corresponding object has been supplied directly.
    _data_load: Optional[Callable[[], PackData]]
    _idx_load: Optional[Callable[[], PackIndex]]

    # Lazily-populated pack data and index; None until first access.
    _data: Optional[PackData]
    _idx: Optional[PackIndex]

    def __init__(
        self,
        basename,
        resolve_ext_ref: Optional[ResolveExtRefFn] = None,
        *,
        delta_window_size=None,
        window_memory=None,
        delta_cache_size=None,
        depth=None,
        threads=None,
        big_file_threshold=None,
    ) -> None:
        """Open a pack by basename (path without ``.pack``/``.idx`` suffix).

        Args:
          basename: Path prefix; ``.idx`` and ``.pack`` are appended to it.
          resolve_ext_ref: Optional callable used to resolve REF_DELTA
            bases that are not present in this pack.
          delta_window_size, window_memory, delta_cache_size, depth,
            threads, big_file_threshold: Tuning knobs forwarded verbatim
            to PackData when the data file is lazily opened.
        """
        self._basename = basename
        self._data = None
        self._idx = None
        self._idx_path = self._basename + ".idx"
        self._data_path = self._basename + ".pack"
        self.delta_window_size = delta_window_size
        self.window_memory = window_memory
        self.delta_cache_size = delta_cache_size
        self.depth = depth
        self.threads = threads
        self.big_file_threshold = big_file_threshold
        # Deferred constructors: nothing is opened until .data / .index
        # is first accessed.
        self._data_load = lambda: PackData(
            self._data_path,
            delta_window_size=delta_window_size,
            window_memory=window_memory,
            delta_cache_size=delta_cache_size,
            depth=depth,
            threads=threads,
            big_file_threshold=big_file_threshold,
        )
        self._idx_load = lambda: load_pack_index(self._idx_path)
        self.resolve_ext_ref = resolve_ext_ref

    @classmethod
    def from_lazy_objects(cls, data_fn, idx_fn):
        """Create a new pack object from callables to load pack data and
        index objects.
        """
        ret = cls("")
        ret._data_load = data_fn
        ret._idx_load = idx_fn
        return ret

    @classmethod
    def from_objects(cls, data, idx):
        """Create a new pack object from pack data and index objects."""
        ret = cls("")
        ret._data = data
        ret._data_load = None
        ret._idx = idx
        ret._idx_load = None
        # Both halves are present up front, so validate immediately.
        ret.check_length_and_checksum()
        return ret

    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()

    @property
    def data(self) -> PackData:
        """The pack data object being used (lazily loaded on first access)."""
        if self._data is None:
            assert self._data_load
            self._data = self._data_load()
            # Cross-check data against index as soon as both exist.
            self.check_length_and_checksum()
        return self._data

    @property
    def index(self) -> PackIndex:
        """The index being used (lazily loaded on first access).

        Note: This may be an in-memory index
        """
        if self._idx is None:
            assert self._idx_load
            self._idx = self._idx_load()
        return self._idx

    def close(self) -> None:
        """Close the data and index files, if they were ever opened."""
        if self._data is not None:
            self._data.close()
        if self._idx is not None:
            self._idx.close()

    def __enter__(self):
        """Support use as a context manager; returns self."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the pack on context exit."""
        self.close()

    def __eq__(self, other):
        """Two packs are equal when their indexes compare equal."""
        return isinstance(self, type(other)) and self.index == other.index

    def __len__(self) -> int:
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self._basename!r})"

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check_length_and_checksum(self) -> None:
        """Sanity check the length and checksum of the pack index and data."""
        assert len(self.index) == len(self.data), (
            f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
        )
        idx_stored_checksum = self.index.get_pack_checksum()
        data_stored_checksum = self.data.get_stored_checksum()
        # Some index formats may not store a pack checksum; only compare
        # when one is present.
        if (
            idx_stored_checksum is not None
            and idx_stored_checksum != data_stored_checksum
        ):
            raise ChecksumMismatch(
                sha_to_hex(idx_stored_checksum),
                sha_to_hex(data_stored_checksum),
            )

    def check(self) -> None:
        """Check the integrity of this pack.

        Raises:
          ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks

    def get_stored_checksum(self) -> bytes:
        """Return the checksum stored in the pack data file."""
        return self.data.get_stored_checksum()

    def pack_tuples(self):
        """Return (object, path) tuples for all objects; path is always None."""
        return [(o, None) for o in self.iterobjects()]

    def __contains__(self, sha1: bytes) -> bool:
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_offset(sha1)
            return True
        except KeyError:
            return False

    def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
        """Return (type_num, raw bytes) for an object, resolving deltas.

        Raises:
          KeyError: if the SHA is not in this pack's index.
        """
        offset = self.index.object_offset(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        type_num, chunks = self.resolve_object(offset, obj_type, obj)
        return type_num, b"".join(chunks)

    def __getitem__(self, sha1: bytes) -> ShaFile:
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp, sha=sha1)

    def iterobjects(self) -> Iterator[ShaFile]:
        """Iterate over the objects in this pack."""
        return iter(
            PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
        )

    def iterobjects_subset(
        self, shas: Iterable[ObjectID], *, allow_missing: bool = False
    ) -> Iterator[ShaFile]:
        """Iterate over only the objects whose ids are in *shas*."""
        return (
            uo
            for uo in PackInflater.for_pack_subset(
                self,
                shas,
                allow_missing=allow_missing,
                resolve_ext_ref=self.resolve_ext_ref,
            )
            if uo.id in shas
        )

    def iter_unpacked_subset(
        self,
        shas: Iterable[ObjectID],
        *,
        include_comp: bool = False,
        allow_missing: bool = False,
        convert_ofs_delta: bool = False,
    ) -> Iterator[UnpackedObject]:
        """Yield UnpackedObjects for *shas*, converting OFS_DELTA to REF_DELTA.

        OFS_DELTA entries whose base has not been seen yet are parked in
        ``ofs_pending`` (keyed by the base's offset) and re-emitted as
        REF_DELTAs once the base object is reached.

        Raises:
          UnresolvedDeltas: if not allow_missing and some requested shas
            were not yielded.
        """
        ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
        # Maps pack offset -> binary sha of the object stored there.
        ofs: dict[int, bytes] = {}
        todo = set(shas)
        for unpacked in self.iter_unpacked(include_comp=include_comp):
            sha = unpacked.sha()
            ofs[unpacked.offset] = sha
            hexsha = sha_to_hex(sha)
            if hexsha in todo:
                if unpacked.pack_type_num == OFS_DELTA:
                    assert isinstance(unpacked.delta_base, int)
                    base_offset = unpacked.offset - unpacked.delta_base
                    try:
                        unpacked.delta_base = ofs[base_offset]
                    except KeyError:
                        # Base not seen yet; defer until it is.
                        ofs_pending[base_offset].append(unpacked)
                        continue
                    else:
                        unpacked.pack_type_num = REF_DELTA
                yield unpacked
                todo.remove(hexsha)
            # Flush any deltas that were waiting for this object as base.
            for child in ofs_pending.pop(unpacked.offset, []):
                child.pack_type_num = REF_DELTA
                child.delta_base = sha
                yield child
        assert not ofs_pending
        if not allow_missing and todo:
            raise UnresolvedDeltas(list(todo))

    def iter_unpacked(self, include_comp=False):
        """Yield UnpackedObjects with sha and crc32 filled in from the index."""
        ofs_to_entries = {
            ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
        }
        for unpacked in self.data.iter_unpacked(include_comp=include_comp):
            (sha, crc32) = ofs_to_entries[unpacked.offset]
            unpacked._sha = sha
            unpacked.crc32 = crc32
            yield unpacked

    def keep(self, msg: Optional[bytes] = None) -> str:
        """Add a .keep file for the pack, preventing git from garbage collecting it.

        Args:
          msg: A message written inside the .keep file; can be used later
            to determine whether or not a .keep file is obsolete.
        Returns: The path of the .keep file, as a string.
        """
        keepfile_name = f"{self._basename}.keep"
        with GitFile(keepfile_name, "wb") as keepfile:
            if msg:
                keepfile.write(msg)
                keepfile.write(b"\n")
        return keepfile_name

    def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
        """Get the object for a ref SHA, only looking in this pack.

        Returns: (offset, type, obj); offset is None when the object was
          resolved through resolve_ext_ref rather than this pack.
        Raises:
          KeyError: if the sha is neither in this pack nor externally
            resolvable.
        """
        # TODO: cache these results
        try:
            offset = self.index.object_offset(sha)
        except KeyError:
            offset = None
        if offset:
            type, obj = self.data.get_object_at(offset)
        elif self.resolve_ext_ref:
            type, obj = self.resolve_ext_ref(sha)
        else:
            raise KeyError(sha)
        return offset, type, obj

    def resolve_object(
        self, offset: int, type: int, obj, get_ref=None
    ) -> tuple[int, Iterable[bytes]]:
        """Resolve an object, possibly resolving deltas when necessary.

        Returns: Tuple with object type and contents.
        """
        # Walk down the delta chain, building a stack of deltas to reach
        # the requested object.
        base_offset = offset
        base_type = type
        base_obj = obj
        delta_stack = []
        while base_type in DELTA_TYPES:
            prev_offset = base_offset
            if get_ref is None:
                get_ref = self.get_ref
            if base_type == OFS_DELTA:
                (delta_offset, delta) = base_obj
                # TODO: clean up asserts and replace with nicer error messages
                base_offset = base_offset - delta_offset
                base_type, base_obj = self.data.get_object_at(base_offset)
                assert isinstance(base_type, int)
            elif base_type == REF_DELTA:
                (basename, delta) = base_obj
                assert isinstance(basename, bytes) and len(basename) == 20
                base_offset, base_type, base_obj = get_ref(basename)
                assert isinstance(base_type, int)
                if base_offset == prev_offset:  # object is based on itself
                    raise UnresolvedDeltas([basename])
            delta_stack.append((prev_offset, base_type, delta))

        # Now grab the base object (mustn't be a delta) and apply the
        # deltas all the way up the stack.
        chunks = base_obj
        for prev_offset, _delta_type, delta in reversed(delta_stack):
            chunks = apply_delta(chunks, delta)
            # Cache each intermediate result so later lookups at this
            # offset skip re-applying the chain.
            if prev_offset is not None:
                self.data._offset_cache[prev_offset] = base_type, chunks
        return base_type, chunks

    def entries(
        self, progress: Optional[ProgressFn] = None
    ) -> Iterator[PackIndexEntry]:
        """Yield entries summarizing the contents of this pack.

        Args:
          progress: Progress function, called with current and total
            object count.
        Returns: iterator of tuples with (sha, offset, crc32)
        """
        return self.data.iterentries(
            progress=progress, resolve_ext_ref=self.resolve_ext_ref
        )

    def sorted_entries(
        self, progress: Optional[ProgressFn] = None
    ) -> Iterator[PackIndexEntry]:
        """Return entries in this pack, sorted by SHA.

        Args:
          progress: Progress function, called with current and total
            object count
        Returns: Iterator of tuples with (sha, offset, crc32)
        """
        return self.data.sorted_entries(
            progress=progress, resolve_ext_ref=self.resolve_ext_ref
        )

    def get_unpacked_object(
        self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
    ) -> UnpackedObject:
        """Get the unpacked object for a sha.

        Args:
          sha: SHA of object to fetch
          include_comp: Whether to include compression data in UnpackedObject
          convert_ofs_delta: When True, rewrite OFS_DELTA entries as
            REF_DELTA by looking up the base's sha in the index.
        """
        offset = self.index.object_offset(sha)
        unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
        if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
            assert isinstance(unpacked.delta_base, int)
            unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
            unpacked.pack_type_num = REF_DELTA
        return unpacked

3215 

3216 

def extend_pack(
    f: BinaryIO,
    object_ids: set[ObjectID],
    get_raw,
    *,
    compression_level=-1,
    progress=None,
) -> tuple[bytes, list]:
    """Extend a pack file with more objects.

    The caller should make sure that object_ids does not contain any objects
    that are already in the pack

    Args:
      f: Open, seekable pack file positioned anywhere; it is rewritten
        in place (header updated, objects appended, trailer rewritten).
      object_ids: Binary SHAs (20 bytes each) of objects to append.
      get_raw: Callable mapping a binary sha to (type_num, raw data).
      compression_level: zlib compression level passed to
        write_pack_object (-1 = zlib default).
      progress: Optional progress callback, called with a bytes message.

    Returns:
      Tuple of (new pack checksum, list of (object_id, offset, crc32)
      entries for the appended objects).
    """
    # Update the header with the new number of objects.
    f.seek(0)
    _version, num_objects = read_pack_header(f.read)

    if object_ids:
        f.seek(0)
        write_pack_header(f.write, num_objects + len(object_ids))

        # Must flush before reading (http://bugs.python.org/issue3207)
        f.flush()

    # Rescan the rest of the pack, computing the SHA with the new header.
    # end_ofs=-20 excludes the old trailing checksum, which is superseded.
    new_sha = compute_file_sha(f, end_ofs=-20)

    # Must reposition before writing (http://bugs.python.org/issue3207)
    f.seek(0, os.SEEK_CUR)

    extra_entries = []

    # Complete the pack.
    for i, object_id in enumerate(object_ids):
        if progress is not None:
            progress(
                (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
            )
        assert len(object_id) == 20
        type_num, data = get_raw(object_id)
        offset = f.tell()
        # write_pack_object also feeds the written bytes into new_sha so
        # the final checksum covers the appended objects.
        crc32 = write_pack_object(
            f.write,
            type_num,
            data,
            sha=new_sha,
            compression_level=compression_level,
        )
        extra_entries.append((object_id, offset, crc32))
    pack_sha = new_sha.digest()
    f.write(pack_sha)
    return pack_sha, extra_entries

3269 

3270 

# Prefer the C implementations of these hot helpers from the optional
# dulwich._pack extension module; if it is not built/installed, silently
# keep the pure-Python definitions already bound in this module.
try:
    from dulwich._pack import (  # type: ignore
        apply_delta,  # type: ignore
        bisect_find_sha,  # type: ignore
    )
except ImportError:
    pass