Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 27%

1543 statements  

1# pack.py -- For dealing with packed git objects. 

2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net> 

3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk> 

4# 

5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 

6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

7# General Public License as published by the Free Software Foundation; version 2.0 

8# or (at your option) any later version. You can redistribute it and/or 

9# modify it under the terms of either of these two licenses. 

10# 

11# Unless required by applicable law or agreed to in writing, software 

12# distributed under the License is distributed on an "AS IS" BASIS, 

13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

14# See the License for the specific language governing permissions and 

15# limitations under the License. 

16# 

17# You should have received a copy of the licenses; if not, see 

18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

20# License, Version 2.0. 

21# 

22 

23"""Classes for dealing with packed git objects. 

24 

25A pack is a compact representation of a bunch of objects, stored 

26using deltas where possible. 

27 

28They have two parts, the pack file, which stores the data, and an index 

29that tells you where the data is. 

30 

31To find an object you look in all of the index files until you find a 

32match for the object name. The offset found there is then used to read 

33the object from the corresponding pack file. 

34""" 
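A minimal sketch of that lookup flow (the pack-1234.idx / pack-1234.pack pair and the hex SHA below are hypothetical placeholders):

from dulwich.pack import PackData, load_pack_index

index = load_pack_index("pack-1234.idx")                  # hypothetical index path
data = PackData("pack-1234.pack")                         # matching pack file
sha = b"0123456789abcdef0123456789abcdef01234567"         # hex SHA of some object
offset = index.object_offset(sha)                         # KeyError if not present
type_num, chunks = data.get_object_at(offset)             # raw, possibly delta-encoded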

35 

36import binascii 

37from collections import defaultdict, deque 

38from contextlib import suppress 

39from io import BytesIO, UnsupportedOperation 

40 

41try: 

42 from cdifflib import CSequenceMatcher as SequenceMatcher 

43except ModuleNotFoundError: 

44 from difflib import SequenceMatcher 

45 

46import os 

47import struct 

48import sys 

49import warnings 

50import zlib 

51from collections.abc import Iterable, Iterator, Sequence 

52from hashlib import sha1 

53from itertools import chain 

54from os import SEEK_CUR, SEEK_END 

55from struct import unpack_from 

56from typing import ( 

57 BinaryIO, 

58 Callable, 

59 Generic, 

60 Optional, 

61 Protocol, 

62 TypeVar, 

63 Union, 

64) 

65 

66try: 

67 import mmap 

68except ImportError: 

69 has_mmap = False 

70else: 

71 has_mmap = True 

72 

73# For some reason the above try/except fails to set has_mmap = False for Plan 9 

74if sys.platform == "Plan9": 

75 has_mmap = False 

76 

77from . import replace_me 

78from .errors import ApplyDeltaError, ChecksumMismatch 

79from .file import GitFile 

80from .lru_cache import LRUSizeCache 

81from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex 

82 

83OFS_DELTA = 6 

84REF_DELTA = 7 

85 

86DELTA_TYPES = (OFS_DELTA, REF_DELTA) 

87 

88 

89DEFAULT_PACK_DELTA_WINDOW_SIZE = 10 

90 

91# Keep pack files under 16Mb in memory, otherwise write them out to disk 

92PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024 

93 

94# Default pack index version to use when none is specified 

95DEFAULT_PACK_INDEX_VERSION = 2 

96 

97 

98OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]] 

99ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]] 

100ProgressFn = Callable[[int, str], None] 

101PackHint = tuple[int, Optional[bytes]] 

102 

103 

104class UnresolvedDeltas(Exception): 

105 """Delta objects could not be resolved.""" 

106 

107 def __init__(self, shas) -> None: 

108 self.shas = shas 

109 

110 

111class ObjectContainer(Protocol): 

112 def add_object(self, obj: ShaFile) -> None: 

113 """Add a single object to this object store.""" 

114 

115 def add_objects( 

116 self, 

117 objects: Sequence[tuple[ShaFile, Optional[str]]], 

118 progress: Optional[Callable[[str], None]] = None, 

119 ) -> None: 

120 """Add a set of objects to this object store. 

121 

122 Args: 

123 objects: Iterable over a list of (object, path) tuples 

124 """ 

125 

126 def __contains__(self, sha1: bytes) -> bool: 

127 """Check if a hex sha is present.""" 

128 

129 def __getitem__(self, sha1: bytes) -> ShaFile: 

130 """Retrieve an object.""" 

131 

132 

133class PackedObjectContainer(ObjectContainer): 

134 def get_unpacked_object( 

135 self, sha1: bytes, *, include_comp: bool = False 

136 ) -> "UnpackedObject": 

137 """Get a raw unresolved object.""" 

138 raise NotImplementedError(self.get_unpacked_object) 

139 

140 def iterobjects_subset( 

141 self, shas: Iterable[bytes], *, allow_missing: bool = False 

142 ) -> Iterator[ShaFile]: 

143 raise NotImplementedError(self.iterobjects_subset) 

144 

145 def iter_unpacked_subset( 

146 self, 

147 shas: set[bytes], 

148 include_comp: bool = False, 

149 allow_missing: bool = False, 

150 convert_ofs_delta: bool = True, 

151 ) -> Iterator["UnpackedObject"]: 

152 raise NotImplementedError(self.iter_unpacked_subset) 

153 

154 

155class UnpackedObjectStream: 

156 def __iter__(self) -> Iterator["UnpackedObject"]: 

157 raise NotImplementedError(self.__iter__) 

158 

159 def __len__(self) -> int: 

160 raise NotImplementedError(self.__len__) 

161 

162 

163def take_msb_bytes( 

164 read: Callable[[int], bytes], crc32: Optional[int] = None 

165) -> tuple[list[int], Optional[int]]: 

166 """Read bytes marked with most significant bit. 

167 

168 Args: 

169 read: Read function 

170 """ 

171 ret: list[int] = [] 

172 while len(ret) == 0 or ret[-1] & 0x80: 

173 b = read(1) 

174 if crc32 is not None: 

175 crc32 = binascii.crc32(b, crc32) 

176 ret.append(ord(b[:1])) 

177 return ret, crc32 

178 
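For illustration, take_msb_bytes keeps reading one byte at a time while the most significant bit is set; a small sketch over an in-memory buffer:

from io import BytesIO

from dulwich.pack import take_msb_bytes

buf = BytesIO(bytes([0x91, 0x2E, 0xFF]))    # MSB set on the first byte only
ret, crc = take_msb_bytes(buf.read)
assert ret == [0x91, 0x2E]                  # stops after the first clear MSB
assert buf.read() == b"\xff"                # remaining data is left unread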

179 

180class PackFileDisappeared(Exception): 

181 def __init__(self, obj) -> None: 

182 self.obj = obj 

183 

184 

185class UnpackedObject: 

186 """Class encapsulating an object unpacked from a pack file. 

187 

188 These objects should only be created from within unpack_object. Most 

189 members start out as empty and are filled in at various points by 

190 read_zlib_chunks, unpack_object, DeltaChainIterator, etc. 

191 

192 End users of this object should take care that the function they're getting 

193 this object from is guaranteed to set the members they need. 

194 """ 

195 

196 __slots__ = [ 

197 "_sha", # Cached binary SHA. 

198 "comp_chunks", # Compressed object chunks. 

199 "crc32", # CRC32. 

200 "decomp_chunks", # Decompressed object chunks. 

201 "decomp_len", # Decompressed length of this object. 

202 "delta_base", # Delta base offset or SHA. 

203 "obj_chunks", # Decompressed and delta-resolved chunks. 

204 "obj_type_num", # Type of this object. 

205 "offset", # Offset in its pack. 

206 "pack_type_num", # Type of this object in the pack (may be a delta). 

207 ] 

208 

209 obj_type_num: Optional[int] 

210 obj_chunks: Optional[list[bytes]] 

211 delta_base: Union[None, bytes, int] 

212 decomp_chunks: list[bytes] 

213 comp_chunks: Optional[list[bytes]] 

214 

215 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be 

216 # methods of this object. 

217 def __init__( 

218 self, 

219 pack_type_num, 

220 *, 

221 delta_base=None, 

222 decomp_len=None, 

223 crc32=None, 

224 sha=None, 

225 decomp_chunks=None, 

226 offset=None, 

227 ) -> None: 

228 self.offset = offset 

229 self._sha = sha 

230 self.pack_type_num = pack_type_num 

231 self.delta_base = delta_base 

232 self.comp_chunks = None 

233 self.decomp_chunks: list[bytes] = decomp_chunks or [] 

234 if decomp_chunks is not None and decomp_len is None: 

235 self.decomp_len = sum(map(len, decomp_chunks)) 

236 else: 

237 self.decomp_len = decomp_len 

238 self.crc32 = crc32 

239 

240 if pack_type_num in DELTA_TYPES: 

241 self.obj_type_num = None 

242 self.obj_chunks = None 

243 else: 

244 self.obj_type_num = pack_type_num 

245 self.obj_chunks = self.decomp_chunks 

246 self.delta_base = delta_base 

247 

248 def sha(self): 

249 """Return the binary SHA of this object.""" 

250 if self._sha is None: 

251 self._sha = obj_sha(self.obj_type_num, self.obj_chunks) 

252 return self._sha 

253 

254 def sha_file(self): 

255 """Return a ShaFile from this object.""" 

256 assert self.obj_type_num is not None and self.obj_chunks is not None 

257 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks) 

258 

259 # Only provided for backwards compatibility with code that expects either 

260 # chunks or a delta tuple. 

261 def _obj(self) -> OldUnpackedObject: 

262 """Return the decompressed chunks, or (delta base, delta chunks).""" 

263 if self.pack_type_num in DELTA_TYPES: 

264 assert isinstance(self.delta_base, (bytes, int)) 

265 return (self.delta_base, self.decomp_chunks) 

266 else: 

267 return self.decomp_chunks 

268 

269 def __eq__(self, other): 

270 if not isinstance(other, UnpackedObject): 

271 return False 

272 for slot in self.__slots__: 

273 if getattr(self, slot) != getattr(other, slot): 

274 return False 

275 return True 

276 

277 def __ne__(self, other): 

278 return not (self == other) 

279 

280 def __repr__(self) -> str: 

281 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__] 

282 return "{}({})".format(self.__class__.__name__, ", ".join(data)) 

283 
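As a short sketch, a non-delta UnpackedObject built directly from chunks hashes exactly like the corresponding loose git object (type number 3 is a blob):

import hashlib

from dulwich.pack import UnpackedObject

chunks = [b"hello ", b"world"]
u = UnpackedObject(3, decomp_chunks=chunks)   # 3 = blob type number
assert u.obj_type_num == 3 and u.decomp_len == 11
assert u.sha() == hashlib.sha1(b"blob 11\x00hello world").digest()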

284 

285_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance 

286 

287 

288def read_zlib_chunks( 

289 read_some: Callable[[int], bytes], 

290 unpacked: UnpackedObject, 

291 include_comp: bool = False, 

292 buffer_size: int = _ZLIB_BUFSIZE, 

293) -> bytes: 

294 """Read zlib data from a buffer. 

295 

296 This function requires that the buffer have additional data following the 

297 compressed data, which is guaranteed to be the case for git pack files. 

298 

299 Args: 

300 read_some: Read function that returns at least one byte, but may 

301 return less than the requested size. 

302 unpacked: An UnpackedObject to write result data to. If its crc32 

303 attr is not None, the CRC32 of the compressed bytes will be computed 

304 using this starting CRC32. 

305 After this function, will have the following attrs set: 

306 * comp_chunks (if include_comp is True) 

307 * decomp_chunks 

308 * decomp_len 

309 * crc32 

310 include_comp: If True, include compressed data in the result. 

311 buffer_size: Size of the read buffer. 

312 Returns: Leftover unused data from the decompression. 

313 

314 Raises: 

315 zlib.error: if a decompression error occurred. 

316 """ 

317 if unpacked.decomp_len <= -1: 

318 raise ValueError("non-negative zlib data stream size expected") 

319 decomp_obj = zlib.decompressobj() 

320 

321 comp_chunks = [] 

322 decomp_chunks = unpacked.decomp_chunks 

323 decomp_len = 0 

324 crc32 = unpacked.crc32 

325 

326 while True: 

327 add = read_some(buffer_size) 

328 if not add: 

329 raise zlib.error("EOF before end of zlib stream") 

330 comp_chunks.append(add) 

331 decomp = decomp_obj.decompress(add) 

332 decomp_len += len(decomp) 

333 decomp_chunks.append(decomp) 

334 unused = decomp_obj.unused_data 

335 if unused: 

336 left = len(unused) 

337 if crc32 is not None: 

338 crc32 = binascii.crc32(add[:-left], crc32) 

339 if include_comp: 

340 comp_chunks[-1] = add[:-left] 

341 break 

342 elif crc32 is not None: 

343 crc32 = binascii.crc32(add, crc32) 

344 if crc32 is not None: 

345 crc32 &= 0xFFFFFFFF 

346 

347 if decomp_len != unpacked.decomp_len: 

348 raise zlib.error("decompressed data does not match expected size") 

349 

350 unpacked.crc32 = crc32 

351 if include_comp: 

352 unpacked.comp_chunks = comp_chunks 

353 return unused 

354 
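A sketch of the calling convention: the read function must be able to return data past the end of the zlib stream, and whatever follows the stream comes back as leftover:

import zlib
from io import BytesIO

from dulwich.pack import UnpackedObject, read_zlib_chunks

payload = b"some object data"
stream = BytesIO(zlib.compress(payload) + b"rest of the pack")
unpacked = UnpackedObject(3, decomp_len=len(payload))   # 3 = blob
leftover = read_zlib_chunks(stream.read, unpacked)
assert b"".join(unpacked.decomp_chunks) == payload
assert leftover == b"rest of the pack"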

355 

356def iter_sha1(iter): 

357 """Return the hexdigest of the SHA1 over a set of names. 

358 

359 Args: 

360 iter: Iterator over string objects 

361 Returns: 40-byte hex sha1 digest 

362 """ 

363 sha = sha1() 

364 for name in iter: 

365 sha.update(name) 

366 return sha.hexdigest().encode("ascii") 

367 
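For example, the checksum used to name a pack is just the SHA1 over the concatenated binary object names:

import hashlib

from dulwich.pack import iter_sha1

names = [b"\x01" * 20, b"\x02" * 20]
assert iter_sha1(names) == hashlib.sha1(b"".join(names)).hexdigest().encode("ascii")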

368 

369def load_pack_index(path: Union[str, os.PathLike]): 

370 """Load an index file by path. 

371 

372 Args: 

373 path: Path to the index file 

374 Returns: A PackIndex loaded from the given path 

375 """ 

376 with GitFile(path, "rb") as f: 

377 return load_pack_index_file(path, f) 

378 

379 

380def _load_file_contents(f, size=None): 

381 try: 

382 fd = f.fileno() 

383 except (UnsupportedOperation, AttributeError): 

384 fd = None 

385 # Attempt to use mmap if possible 

386 if fd is not None: 

387 if size is None: 

388 size = os.fstat(fd).st_size 

389 if has_mmap: 

390 try: 

391 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ) 

392 except OSError: 

393 # Perhaps a socket? 

394 pass 

395 else: 

396 return contents, size 

397 contents = f.read() 

398 size = len(contents) 

399 return contents, size 

400 

401 

402def load_pack_index_file(path: Union[str, os.PathLike], f): 

403 """Load an index file from a file-like object. 

404 

405 Args: 

406 path: Path for the index file 

407 f: File-like object 

408 Returns: A PackIndex loaded from the given file 

409 """ 

410 contents, size = _load_file_contents(f) 

411 if contents[:4] == b"\377tOc": 

412 version = struct.unpack(b">L", contents[4:8])[0] 

413 if version == 2: 

414 return PackIndex2(path, file=f, contents=contents, size=size) 

415 elif version == 3: 

416 return PackIndex3(path, file=f, contents=contents, size=size) 

417 else: 

418 raise KeyError(f"Unknown pack index format {version}") 

419 else: 

420 return PackIndex1(path, file=f, contents=contents, size=size) 

421 

422 

423def bisect_find_sha(start, end, sha, unpack_name): 

424 """Find a SHA in a data blob with sorted SHAs. 

425 

426 Args: 

427 start: Start index of range to search 

428 end: End index of range to search 

429 sha: Sha to find 

430 unpack_name: Callback to retrieve SHA by index 

431 Returns: Index of the SHA, or None if it wasn't found 

432 """ 

433 assert start <= end 

434 while start <= end: 

435 i = (start + end) // 2 

436 file_sha = unpack_name(i) 

437 if file_sha < sha: 

438 start = i + 1 

439 elif file_sha > sha: 

440 end = i - 1 

441 else: 

442 return i 

443 return None 

444 
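A small sketch of the calling convention, with start and end treated as an inclusive index range and a callback returning the i-th sorted SHA:

import hashlib

from dulwich.pack import bisect_find_sha

shas = sorted(hashlib.sha1(bytes([i])).digest() for i in range(16))
assert bisect_find_sha(0, len(shas) - 1, shas[5], lambda i: shas[i]) == 5
assert bisect_find_sha(0, len(shas) - 1, b"\xff" * 20, lambda i: shas[i]) is None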

445 

446PackIndexEntry = tuple[bytes, int, Optional[int]] 

447 

448 

449class PackIndex: 

450 """An index into a packfile. 

451 

452 Given the SHA of an object, a pack index can tell you where that object 

453 is located in the corresponding pack file, if it is present. 

454 """ 

455 

456 # Default to SHA-1 for backward compatibility 

457 hash_algorithm = 1 

458 hash_size = 20 

459 

460 def __eq__(self, other): 

461 if not isinstance(other, PackIndex): 

462 return False 

463 

464 for (name1, _, _), (name2, _, _) in zip( 

465 self.iterentries(), other.iterentries() 

466 ): 

467 if name1 != name2: 

468 return False 

469 return True 

470 

471 def __ne__(self, other): 

472 return not self.__eq__(other) 

473 

474 def __len__(self) -> int: 

475 """Return the number of entries in this pack index.""" 

476 raise NotImplementedError(self.__len__) 

477 

478 def __iter__(self) -> Iterator[bytes]: 

479 """Iterate over the SHAs in this pack.""" 

480 return map(sha_to_hex, self._itersha()) 

481 

482 def iterentries(self) -> Iterator[PackIndexEntry]: 

483 """Iterate over the entries in this pack index. 

484 

485 Returns: iterator over tuples with object name, offset in packfile and 

486 crc32 checksum. 

487 """ 

488 raise NotImplementedError(self.iterentries) 

489 

490 def get_pack_checksum(self) -> bytes: 

491 """Return the SHA1 checksum stored for the corresponding packfile. 

492 

493 Returns: 20-byte binary digest 

494 """ 

495 raise NotImplementedError(self.get_pack_checksum) 

496 

497 @replace_me(since="0.21.0", remove_in="0.23.0") 

498 def object_index(self, sha: bytes) -> int: 

499 return self.object_offset(sha) 

500 

501 def object_offset(self, sha: bytes) -> int: 

502 """Return the offset into the corresponding packfile for the object. 

503 

504 Given the name of an object it will return the offset that object 

505 lives at within the corresponding pack file. If the pack file does not 

506 contain the object, a KeyError is raised. 

507 """ 

508 raise NotImplementedError(self.object_offset) 

509 

510 def object_sha1(self, index: int) -> bytes: 

511 """Return the SHA1 corresponding to the index in the pack file.""" 

512 for name, offset, crc32 in self.iterentries(): 

513 if offset == index: 

514 return name 

515 else: 

516 raise KeyError(index) 

517 

518 def _object_offset(self, sha: bytes) -> int: 

519 """See object_offset. 

520 

521 Args: 

522 sha: A *binary* SHA string (20 bytes long). 

523 """ 

524 raise NotImplementedError(self._object_offset) 

525 

526 def objects_sha1(self) -> bytes: 

527 """Return the hex SHA1 over all the shas of all objects in this pack. 

528 

529 Note: This is used for the filename of the pack. 

530 """ 

531 return iter_sha1(self._itersha()) 

532 

533 def _itersha(self) -> Iterator[bytes]: 

534 """Yield all the SHA1's of the objects in the index, sorted.""" 

535 raise NotImplementedError(self._itersha) 

536 

537 def close(self) -> None: 

538 pass 

539 

540 def check(self) -> None: 

541 pass 

542 

543 

544class MemoryPackIndex(PackIndex): 

545 """Pack index that is stored entirely in memory.""" 

546 

547 def __init__(self, entries, pack_checksum=None) -> None: 

548 """Create a new MemoryPackIndex. 

549 

550 Args: 

551 entries: Sequence of name, idx, crc32 (sorted) 

552 pack_checksum: Optional pack checksum 

553 """ 

554 self._by_sha = {} 

555 self._by_offset = {} 

556 for name, offset, crc32 in entries: 

557 self._by_sha[name] = offset 

558 self._by_offset[offset] = name 

559 self._entries = entries 

560 self._pack_checksum = pack_checksum 

561 

562 def get_pack_checksum(self): 

563 return self._pack_checksum 

564 

565 def __len__(self) -> int: 

566 return len(self._entries) 

567 

568 def object_offset(self, sha): 

569 if len(sha) == 40: 

570 sha = hex_to_sha(sha) 

571 return self._by_sha[sha] 

572 

573 def object_sha1(self, offset): 

574 return self._by_offset[offset] 

575 

576 def _itersha(self): 

577 return iter(self._by_sha) 

578 

579 def iterentries(self): 

580 return iter(self._entries) 

581 

582 @classmethod 

583 def for_pack(cls, pack): 

584 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum()) 

585 

586 @classmethod 

587 def clone(cls, other_index): 

588 return cls(other_index.iterentries(), other_index.get_pack_checksum()) 

589 
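A brief sketch of building an in-memory index from (binary SHA, offset, crc32) entries sorted by SHA:

import hashlib

from dulwich.pack import MemoryPackIndex

entries = sorted((hashlib.sha1(b"obj%d" % i).digest(), 12 + 50 * i, 0) for i in range(3))
idx = MemoryPackIndex(entries)
sha, offset, _ = entries[0]
assert len(idx) == 3
assert idx.object_offset(sha) == offset
assert idx.object_sha1(offset) == sha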

590 

591class FilePackIndex(PackIndex): 

592 """Pack index that is based on a file. 

593 

594 To do a lookup it maps the file and reads the 256-entry fan-out table, 

595 which is indexed by the first byte of the SHA. Each four-byte entry 

596 holds the end of the group of objects whose SHAs share that starting 

597 byte; the entry for the previous byte gives the start of the group. 

598 The SHAs within a group are sorted, so the start and end offsets can 

599 be used to bisect the range and determine whether the object is 

600 present. 

601 """ 

602 

603 _fan_out_table: list[int] 

604 

605 def __init__(self, filename, file=None, contents=None, size=None) -> None: 

606 """Create a pack index object. 

607 

608 Provide it with the name of the index file to consider, and it will map 

609 it whenever required. 

610 """ 

611 self._filename = filename 

612 # Take the size now, so it can be checked each time we map the file to 

613 # ensure that it hasn't changed. 

614 if file is None: 

615 self._file = GitFile(filename, "rb") 

616 else: 

617 self._file = file 

618 if contents is None: 

619 self._contents, self._size = _load_file_contents(self._file, size) 

620 else: 

621 self._contents, self._size = (contents, size) 

622 

623 @property 

624 def path(self) -> str: 

625 return self._filename 

626 

627 def __eq__(self, other): 

628 # Quick optimization: 

629 if ( 

630 isinstance(other, FilePackIndex) 

631 and self._fan_out_table != other._fan_out_table 

632 ): 

633 return False 

634 

635 return super().__eq__(other) 

636 

637 def close(self) -> None: 

638 self._file.close() 

639 if getattr(self._contents, "close", None) is not None: 

640 self._contents.close() 

641 

642 def __len__(self) -> int: 

643 """Return the number of entries in this pack index.""" 

644 return self._fan_out_table[-1] 

645 

646 def _unpack_entry(self, i: int) -> PackIndexEntry: 

647 """Unpack the i-th entry in the index file. 

648 

649 Returns: Tuple with object name (SHA), offset in pack file and CRC32 

650 checksum (if known). 

651 """ 

652 raise NotImplementedError(self._unpack_entry) 

653 

654 def _unpack_name(self, i) -> bytes: 

655 """Unpack the i-th name from the index file.""" 

656 raise NotImplementedError(self._unpack_name) 

657 

658 def _unpack_offset(self, i) -> int: 

659 """Unpack the i-th object offset from the index file.""" 

660 raise NotImplementedError(self._unpack_offset) 

661 

662 def _unpack_crc32_checksum(self, i) -> Optional[int]: 

663 """Unpack the crc32 checksum for the ith object from the index file.""" 

664 raise NotImplementedError(self._unpack_crc32_checksum) 

665 

666 def _itersha(self) -> Iterator[bytes]: 

667 for i in range(len(self)): 

668 yield self._unpack_name(i) 

669 

670 def iterentries(self) -> Iterator[PackIndexEntry]: 

671 """Iterate over the entries in this pack index. 

672 

673 Returns: iterator over tuples with object name, offset in packfile and 

674 crc32 checksum. 

675 """ 

676 for i in range(len(self)): 

677 yield self._unpack_entry(i) 

678 

679 def _read_fan_out_table(self, start_offset: int): 

680 ret = [] 

681 for i in range(0x100): 

682 fanout_entry = self._contents[ 

683 start_offset + i * 4 : start_offset + (i + 1) * 4 

684 ] 

685 ret.append(struct.unpack(">L", fanout_entry)[0]) 

686 return ret 

687 

688 def check(self) -> None: 

689 """Check that the stored checksum matches the actual checksum.""" 

690 actual = self.calculate_checksum() 

691 stored = self.get_stored_checksum() 

692 if actual != stored: 

693 raise ChecksumMismatch(stored, actual) 

694 

695 def calculate_checksum(self) -> bytes: 

696 """Calculate the SHA1 checksum over this pack index. 

697 

698 Returns: This is a 20-byte binary digest 

699 """ 

700 return sha1(self._contents[:-20]).digest() 

701 

702 def get_pack_checksum(self) -> bytes: 

703 """Return the SHA1 checksum stored for the corresponding packfile. 

704 

705 Returns: 20-byte binary digest 

706 """ 

707 return bytes(self._contents[-40:-20]) 

708 

709 def get_stored_checksum(self) -> bytes: 

710 """Return the SHA1 checksum stored for this index. 

711 

712 Returns: 20-byte binary digest 

713 """ 

714 return bytes(self._contents[-20:]) 

715 

716 def object_offset(self, sha: bytes) -> int: 

717 """Return the offset into the corresponding packfile for the object. 

718 

719 Given the name of an object it will return the offset that object 

720 lives at within the corresponding pack file. If the pack file does not 

721 contain the object, a KeyError is raised. 

722 """ 

723 if len(sha) == 40: 

724 sha = hex_to_sha(sha) 

725 try: 

726 return self._object_offset(sha) 

727 except ValueError as exc: 

728 closed = getattr(self._contents, "closed", None) 

729 if closed in (None, True): 

730 raise PackFileDisappeared(self) from exc 

731 raise 

732 

733 def _object_offset(self, sha: bytes) -> int: 

734 """See object_offset. 

735 

736 Args: 

737 sha: A *binary* SHA string (20 bytes long). 

738 """ 

739 assert len(sha) == 20 

740 idx = ord(sha[:1]) 

741 if idx == 0: 

742 start = 0 

743 else: 

744 start = self._fan_out_table[idx - 1] 

745 end = self._fan_out_table[idx] 

746 i = bisect_find_sha(start, end, sha, self._unpack_name) 

747 if i is None: 

748 raise KeyError(sha) 

749 return self._unpack_offset(i) 

750 

751 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]: 

752 """Iterate over all SHA1s with the given prefix.""" 

753 start = ord(prefix[:1]) 

754 if start == 0: 

755 start = 0 

756 else: 

757 start = self._fan_out_table[start - 1] 

758 end = ord(prefix[:1]) + 1 

759 if end == 0x100: 

760 end = len(self) 

761 else: 

762 end = self._fan_out_table[end] 

763 assert start <= end 

764 started = False 

765 for i in range(start, end): 

766 name: bytes = self._unpack_name(i) 

767 if name.startswith(prefix): 

768 yield name 

769 started = True 

770 elif started: 

771 break 

772 

773 

774class PackIndex1(FilePackIndex): 

775 """Version 1 Pack Index file.""" 

776 

777 def __init__( 

778 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None 

779 ) -> None: 

780 super().__init__(filename, file, contents, size) 

781 self.version = 1 

782 self._fan_out_table = self._read_fan_out_table(0) 

783 

784 def _unpack_entry(self, i): 

785 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24)) 

786 return (name, offset, None) 

787 

788 def _unpack_name(self, i): 

789 offset = (0x100 * 4) + (i * 24) + 4 

790 return self._contents[offset : offset + 20] 

791 

792 def _unpack_offset(self, i): 

793 offset = (0x100 * 4) + (i * 24) 

794 return unpack_from(">L", self._contents, offset)[0] 

795 

796 def _unpack_crc32_checksum(self, i) -> None: 

797 # Not stored in v1 index files 

798 return None 

799 

800 

801class PackIndex2(FilePackIndex): 

802 """Version 2 Pack Index file.""" 

803 

804 def __init__( 

805 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None 

806 ) -> None: 

807 super().__init__(filename, file, contents, size) 

808 if self._contents[:4] != b"\377tOc": 

809 raise AssertionError("Not a v2 pack index file") 

810 (self.version,) = unpack_from(b">L", self._contents, 4) 

811 if self.version != 2: 

812 raise AssertionError(f"Version was {self.version}") 

813 self._fan_out_table = self._read_fan_out_table(8) 

814 self._name_table_offset = 8 + 0x100 * 4 

815 self._crc32_table_offset = self._name_table_offset + 20 * len(self) 

816 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self) 

817 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len( 

818 self 

819 ) 

820 

821 def _unpack_entry(self, i): 

822 return ( 

823 self._unpack_name(i), 

824 self._unpack_offset(i), 

825 self._unpack_crc32_checksum(i), 

826 ) 

827 

828 def _unpack_name(self, i): 

829 offset = self._name_table_offset + i * 20 

830 return self._contents[offset : offset + 20] 

831 

832 def _unpack_offset(self, i): 

833 offset = self._pack_offset_table_offset + i * 4 

834 offset = unpack_from(">L", self._contents, offset)[0] 

835 if offset & (2**31): 

836 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8 

837 offset = unpack_from(">Q", self._contents, offset)[0] 

838 return offset 

839 

840 def _unpack_crc32_checksum(self, i): 

841 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0] 

842 
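To make the table layout above concrete, a hedged arithmetic sketch for a hypothetical v2 index holding 1000 entries:

n = 1000
name_table = 8 + 0x100 * 4            # 8-byte header + 256-entry fan-out = 1032
crc32_table = name_table + 20 * n     # 21032
offset_table = crc32_table + 4 * n    # 25032
large_offsets = offset_table + 4 * n  # 29032; only consulted for offsets >= 2**31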

843 

844class PackIndex3(FilePackIndex): 

845 """Version 3 Pack Index file. 

846 

847 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes). 

848 """ 

849 

850 def __init__( 

851 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None 

852 ) -> None: 

853 super().__init__(filename, file, contents, size) 

854 if self._contents[:4] != b"\377tOc": 

855 raise AssertionError("Not a v3 pack index file") 

856 (self.version,) = unpack_from(b">L", self._contents, 4) 

857 if self.version != 3: 

858 raise AssertionError(f"Version was {self.version}") 

859 

860 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256) 

861 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8) 

862 if self.hash_algorithm == 1: 

863 self.hash_size = 20 # SHA-1 

864 elif self.hash_algorithm == 2: 

865 self.hash_size = 32 # SHA-256 

866 else: 

867 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}") 

868 

869 # Read length of shortened object names 

870 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12) 

871 

872 # Calculate offsets based on variable hash size 

873 self._fan_out_table = self._read_fan_out_table( 

874 16 

875 ) # After header (4 + 4 + 4 + 4) 

876 self._name_table_offset = 16 + 0x100 * 4 

877 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self) 

878 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self) 

879 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len( 

880 self 

881 ) 

882 

883 def _unpack_entry(self, i): 

884 return ( 

885 self._unpack_name(i), 

886 self._unpack_offset(i), 

887 self._unpack_crc32_checksum(i), 

888 ) 

889 

890 def _unpack_name(self, i): 

891 offset = self._name_table_offset + i * self.hash_size 

892 return self._contents[offset : offset + self.hash_size] 

893 

894 def _unpack_offset(self, i): 

895 offset = self._pack_offset_table_offset + i * 4 

896 offset = unpack_from(">L", self._contents, offset)[0] 

897 if offset & (2**31): 

898 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8 

899 offset = unpack_from(">Q", self._contents, offset)[0] 

900 return offset 

901 

902 def _unpack_crc32_checksum(self, i): 

903 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0] 

904 

905 

906def read_pack_header(read) -> tuple[int, int]: 

907 """Read the header of a pack file. 

908 

909 Args: 

910 read: Read function 

911 Returns: Tuple of (pack version, number of objects). 

912 Raises AssertionError if the header is missing or malformed. 

913 """ 

914 header = read(12) 

915 if not header: 

916 raise AssertionError("file too short to contain pack") 

917 if header[:4] != b"PACK": 

918 raise AssertionError(f"Invalid pack header {header!r}") 

919 (version,) = unpack_from(b">L", header, 4) 

920 if version not in (2, 3): 

921 raise AssertionError(f"Version was {version}") 

922 (num_objects,) = unpack_from(b">L", header, 8) 

923 return (version, num_objects) 

924 
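For example, a well-formed 12-byte header parses into its version and object count:

import struct
from io import BytesIO

from dulwich.pack import read_pack_header

header = b"PACK" + struct.pack(">LL", 2, 3)     # version 2, 3 objects
assert read_pack_header(BytesIO(header).read) == (2, 3)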

925 

926def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int: 

927 if isinstance(chunks, bytes): 

928 return len(chunks) 

929 else: 

930 return sum(map(len, chunks)) 

931 

932 

933def unpack_object( 

934 read_all: Callable[[int], bytes], 

935 read_some: Optional[Callable[[int], bytes]] = None, 

936 compute_crc32=False, 

937 include_comp=False, 

938 zlib_bufsize=_ZLIB_BUFSIZE, 

939) -> tuple[UnpackedObject, bytes]: 

940 """Unpack a Git object. 

941 

942 Args: 

943 read_all: Read function that blocks until the number of requested 

944 bytes are read. 

945 read_some: Read function that returns at least one byte, but may not 

946 return the number of bytes requested. 

947 compute_crc32: If True, compute the CRC32 of the compressed data. If 

948 False, the returned CRC32 will be None. 

949 include_comp: If True, include compressed data in the result. 

950 zlib_bufsize: An optional buffer size for zlib operations. 

951 Returns: A tuple of (unpacked, unused), where unused is the unused data 

952 leftover from decompression, and unpacked is an UnpackedObject with 

953 the following attrs set: 

954 

955 * obj_chunks (for non-delta types) 

956 * pack_type_num 

957 * delta_base (for delta types) 

958 * comp_chunks (if include_comp is True) 

959 * decomp_chunks 

960 * decomp_len 

961 * crc32 (if compute_crc32 is True) 

962 """ 

963 if read_some is None: 

964 read_some = read_all 

965 if compute_crc32: 

966 crc32 = 0 

967 else: 

968 crc32 = None 

969 

970 raw, crc32 = take_msb_bytes(read_all, crc32=crc32) 

971 type_num = (raw[0] >> 4) & 0x07 

972 size = raw[0] & 0x0F 

973 for i, byte in enumerate(raw[1:]): 

974 size += (byte & 0x7F) << ((i * 7) + 4) 

975 

976 delta_base: Union[int, bytes, None] 

977 raw_base = len(raw) 

978 if type_num == OFS_DELTA: 

979 raw, crc32 = take_msb_bytes(read_all, crc32=crc32) 

980 raw_base += len(raw) 

981 if raw[-1] & 0x80: 

982 raise AssertionError 

983 delta_base_offset = raw[0] & 0x7F 

984 for byte in raw[1:]: 

985 delta_base_offset += 1 

986 delta_base_offset <<= 7 

987 delta_base_offset += byte & 0x7F 

988 delta_base = delta_base_offset 

989 elif type_num == REF_DELTA: 

990 delta_base_obj = read_all(20) 

991 if crc32 is not None: 

992 crc32 = binascii.crc32(delta_base_obj, crc32) 

993 delta_base = delta_base_obj 

994 raw_base += 20 

995 else: 

996 delta_base = None 

997 

998 unpacked = UnpackedObject( 

999 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32 

1000 ) 

1001 unused = read_zlib_chunks( 

1002 read_some, 

1003 unpacked, 

1004 buffer_size=zlib_bufsize, 

1005 include_comp=include_comp, 

1006 ) 

1007 return unpacked, unused 

1008 
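A minimal sketch of unpacking a single non-delta entry: a one-byte type/size header (type 3 = blob, size 7 fits in the four size bits), the zlib stream, then whatever follows in the pack:

import zlib
from io import BytesIO

from dulwich.pack import unpack_object

payload = b"hi pack"                              # 7 bytes
entry = bytes([(3 << 4) | len(payload)])          # MSB clear: one-byte header
buf = BytesIO(entry + zlib.compress(payload) + b"next entry")
unpacked, unused = unpack_object(buf.read)
assert unpacked.pack_type_num == 3 and unpacked.delta_base is None
assert b"".join(unpacked.decomp_chunks) == payload
assert unused == b"next entry"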

1009 

1010def _compute_object_size(value): 

1011 """Compute the size of an unresolved object for use with LRUSizeCache.""" 

1012 (num, obj) = value 

1013 if num in DELTA_TYPES: 

1014 return chunks_length(obj[1]) 

1015 return chunks_length(obj) 

1016 

1017 

1018class PackStreamReader: 

1019 """Class to read a pack stream. 

1020 

1021 The pack is read from a ReceivableProtocol using read() or recv() as 

1022 appropriate. 

1023 """ 

1024 

1025 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None: 

1026 self.read_all = read_all 

1027 if read_some is None: 

1028 self.read_some = read_all 

1029 else: 

1030 self.read_some = read_some 

1031 self.sha = sha1() 

1032 self._offset = 0 

1033 self._rbuf = BytesIO() 

1034 # trailer is a deque to avoid memory allocation on small reads 

1035 self._trailer: deque[bytes] = deque() 

1036 self._zlib_bufsize = zlib_bufsize 

1037 

1038 def _read(self, read, size): 

1039 """Read up to size bytes using the given callback. 

1040 

1041 As a side effect, update the verifier's hash (excluding the last 20 

1042 bytes read). 

1043 

1044 Args: 

1045 read: The read callback to read from. 

1046 size: The maximum number of bytes to read; the particular 

1047 behavior is callback-specific. 

1048 """ 

1049 data = read(size) 

1050 

1051 # maintain a trailer of the last 20 bytes we've read 

1052 n = len(data) 

1053 self._offset += n 

1054 tn = len(self._trailer) 

1055 if n >= 20: 

1056 to_pop = tn 

1057 to_add = 20 

1058 else: 

1059 to_pop = max(n + tn - 20, 0) 

1060 to_add = n 

1061 self.sha.update( 

1062 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])) 

1063 ) 

1064 self._trailer.extend(data[-to_add:]) 

1065 

1066 # hash everything but the trailer 

1067 self.sha.update(data[:-to_add]) 

1068 return data 

1069 

1070 def _buf_len(self): 

1071 buf = self._rbuf 

1072 start = buf.tell() 

1073 buf.seek(0, SEEK_END) 

1074 end = buf.tell() 

1075 buf.seek(start) 

1076 return end - start 

1077 

1078 @property 

1079 def offset(self): 

1080 return self._offset - self._buf_len() 

1081 

1082 def read(self, size): 

1083 """Read, blocking until size bytes are read.""" 

1084 buf_len = self._buf_len() 

1085 if buf_len >= size: 

1086 return self._rbuf.read(size) 

1087 buf_data = self._rbuf.read() 

1088 self._rbuf = BytesIO() 

1089 return buf_data + self._read(self.read_all, size - buf_len) 

1090 

1091 def recv(self, size): 

1092 """Read up to size bytes, blocking until one byte is read.""" 

1093 buf_len = self._buf_len() 

1094 if buf_len: 

1095 data = self._rbuf.read(size) 

1096 if size >= buf_len: 

1097 self._rbuf = BytesIO() 

1098 return data 

1099 return self._read(self.read_some, size) 

1100 

1101 def __len__(self) -> int: 

1102 return self._num_objects 

1103 

1104 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]: 

1105 """Read the objects in this pack file. 

1106 

1107 Args: 

1108 compute_crc32: If True, compute the CRC32 of the compressed 

1109 data. If False, the returned CRC32 will be None. 

1110 Returns: Iterator over UnpackedObjects with the following members set: 

1111 offset 

1112 obj_type_num 

1113 obj_chunks (for non-delta types) 

1114 delta_base (for delta types) 

1115 decomp_chunks 

1116 decomp_len 

1117 crc32 (if compute_crc32 is True) 

1118 

1119 Raises: 

1120 ChecksumMismatch: if the checksum of the pack contents does not 

1121 match the checksum in the pack trailer. 

1122 zlib.error: if an error occurred during zlib decompression. 

1123 IOError: if an error occurred writing to the output file. 

1124 """ 

1125 pack_version, self._num_objects = read_pack_header(self.read) 

1126 

1127 for i in range(self._num_objects): 

1128 offset = self.offset 

1129 unpacked, unused = unpack_object( 

1130 self.read, 

1131 read_some=self.recv, 

1132 compute_crc32=compute_crc32, 

1133 zlib_bufsize=self._zlib_bufsize, 

1134 ) 

1135 unpacked.offset = offset 

1136 

1137 # prepend any unused data to current read buffer 

1138 buf = BytesIO() 

1139 buf.write(unused) 

1140 buf.write(self._rbuf.read()) 

1141 buf.seek(0) 

1142 self._rbuf = buf 

1143 

1144 yield unpacked 

1145 

1146 if self._buf_len() < 20: 

1147 # If the read buffer is full, then the last read() got the whole 

1148 # trailer off the wire. If not, it means there is still some of the 

1149 # trailer to read. We need to read() all 20 bytes; N come from the 

1150 # read buffer and (20 - N) come from the wire. 

1151 self.read(20) 

1152 

1153 pack_sha = bytearray(self._trailer) # type: ignore 

1154 if pack_sha != self.sha.digest(): 

1155 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest()) 

1156 

1157 

1158class PackStreamCopier(PackStreamReader): 

1159 """Class to verify a pack stream as it is being read. 

1160 

1161 The pack is read from a ReceivableProtocol using read() or recv() as 

1162 appropriate and written out to the given file-like object. 

1163 """ 

1164 

1165 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None: 

1166 """Initialize the copier. 

1167 

1168 Args: 

1169 read_all: Read function that blocks until the number of 

1170 requested bytes are read. 

1171 read_some: Read function that returns at least one byte, but may 

1172 not return the number of bytes requested. 

1173 outfile: File-like object to write output through. 

1174 delta_iter: Optional DeltaChainIterator to record deltas as we 

1175 read them. 

1176 """ 

1177 super().__init__(read_all, read_some=read_some) 

1178 self.outfile = outfile 

1179 self._delta_iter = delta_iter 

1180 

1181 def _read(self, read, size): 

1182 """Read data from the read callback and write it to the file.""" 

1183 data = super()._read(read, size) 

1184 self.outfile.write(data) 

1185 return data 

1186 

1187 def verify(self, progress=None) -> None: 

1188 """Verify a pack stream and write it to the output file. 

1189 

1190 See PackStreamReader.read_objects for a list of exceptions this may 

1191 throw. 

1192 """ 

1193 i = 0 # default count of entries if read_objects() is empty 

1194 for i, unpacked in enumerate(self.read_objects()): 

1195 if self._delta_iter: 

1196 self._delta_iter.record(unpacked) 

1197 if progress is not None: 

1198 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii")) 

1199 if progress is not None: 

1200 progress(f"copied {i} pack entries\n".encode("ascii")) 

1201 

1202 

1203def obj_sha(type, chunks): 

1204 """Compute the SHA for a numeric type and object chunks.""" 

1205 sha = sha1() 

1206 sha.update(object_header(type, chunks_length(chunks))) 

1207 if isinstance(chunks, bytes): 

1208 sha.update(chunks) 

1209 else: 

1210 for chunk in chunks: 

1211 sha.update(chunk) 

1212 return sha.digest() 

1213 

1214 

1215def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16): 

1216 """Hash a portion of a file into a new SHA. 

1217 

1218 Args: 

1219 f: A file-like object to read from that supports seek(). 

1220 start_ofs: The offset in the file to start reading at. 

1221 end_ofs: The offset in the file to end reading at, relative to the 

1222 end of the file. 

1223 buffer_size: A buffer size for reading. 

1224 Returns: A new SHA object updated with data read from the file. 

1225 """ 

1226 sha = sha1() 

1227 f.seek(0, SEEK_END) 

1228 length = f.tell() 

1229 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length: 

1230 raise AssertionError( 

1231 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}" 

1232 ) 

1233 todo = length + end_ofs - start_ofs 

1234 f.seek(start_ofs) 

1235 while todo: 

1236 data = f.read(min(todo, buffer_size)) 

1237 sha.update(data) 

1238 todo -= len(data) 

1239 return sha 

1240 
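As a sketch, this is how a pack trailer can be checked: hash everything except the final 20 bytes and compare against them:

import hashlib
from io import BytesIO

from dulwich.pack import compute_file_sha

body = b"everything except the trailing checksum"
f = BytesIO(body + hashlib.sha1(body).digest())
assert compute_file_sha(f, end_ofs=-20).digest() == hashlib.sha1(body).digest()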

1241 

1242class PackData: 

1243 """The data contained in a packfile. 

1244 

1245 Pack files can be accessed both sequentially for exploding a pack, and 

1246 directly with the help of an index to retrieve a specific object. 

1247 

1248 The objects within are either complete or a delta against another. 

1249 

1250 Each object entry starts with a variable-length header. If the MSB of a 

1251 byte is set, the subsequent byte is still part of the header. 

1252 In the first byte the next three bits are the type, which tells you the 

1253 type of object and whether it is a delta; the low four bits are the 

1254 lowest bits of the size. In each subsequent byte the low 7 bits are the 

1255 next more-significant bits of the size, so the last header byte holds the most significant bits. 

1256 

1257 For the complete objects the data is stored as zlib deflated data. 

1258 The size in the header is the uncompressed object size, so to uncompress 

1259 you need to just keep feeding data to zlib until you get an object back, 

1260 or it errors on bad data. This is done here by just giving the complete 

1261 buffer from the start of the deflated object on. This is bad, but until I 

1262 get mmap sorted out it will have to do. 

1263 

1264 Currently there are no integrity checks done. Also no attempt is made to 

1265 try and detect the delta case, or a request for an object at the wrong 

1266 position. It will all just throw a zlib or KeyError. 

1267 """ 

1268 

1269 def __init__(self, filename: Union[str, os.PathLike], file=None, size=None) -> None: 

1270 """Create a PackData object representing the pack in the given filename. 

1271 

1272 The file must exist and stay readable until the object is disposed of. 

1273 It must also stay the same size. It will be mapped whenever needed. 

1274 

1275 Currently there is a restriction on the size of the pack as the python 

1276 mmap implementation is flawed. 

1277 """ 

1278 self._filename = filename 

1279 self._size = size 

1280 self._header_size = 12 

1281 if file is None: 

1282 self._file = GitFile(self._filename, "rb") 

1283 else: 

1284 self._file = file 

1285 (version, self._num_objects) = read_pack_header(self._file.read) 

1286 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]]( 

1287 1024 * 1024 * 20, compute_size=_compute_object_size 

1288 ) 

1289 

1290 @property 

1291 def filename(self): 

1292 return os.path.basename(self._filename) 

1293 

1294 @property 

1295 def path(self): 

1296 return self._filename 

1297 

1298 @classmethod 

1299 def from_file(cls, file, size=None): 

1300 return cls(str(file), file=file, size=size) 

1301 

1302 @classmethod 

1303 def from_path(cls, path: Union[str, os.PathLike]): 

1304 return cls(filename=path) 

1305 

1306 def close(self) -> None: 

1307 self._file.close() 

1308 

1309 def __enter__(self): 

1310 return self 

1311 

1312 def __exit__(self, exc_type, exc_val, exc_tb): 

1313 self.close() 

1314 

1315 def __eq__(self, other): 

1316 if isinstance(other, PackData): 

1317 return self.get_stored_checksum() == other.get_stored_checksum() 

1318 return False 

1319 

1320 def _get_size(self): 

1321 if self._size is not None: 

1322 return self._size 

1323 self._size = os.path.getsize(self._filename) 

1324 if self._size < self._header_size: 

1325 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})" 

1326 raise AssertionError(errmsg) 

1327 return self._size 

1328 

1329 def __len__(self) -> int: 

1330 """Returns the number of objects in this pack.""" 

1331 return self._num_objects 

1332 

1333 def calculate_checksum(self): 

1334 """Calculate the checksum for this pack. 

1335 

1336 Returns: 20-byte binary SHA1 digest 

1337 """ 

1338 return compute_file_sha(self._file, end_ofs=-20).digest() 

1339 

1340 def iter_unpacked(self, *, include_comp: bool = False): 

1341 self._file.seek(self._header_size) 

1342 

1343 if self._num_objects is None: 

1344 return 

1345 

1346 for _ in range(self._num_objects): 

1347 offset = self._file.tell() 

1348 unpacked, unused = unpack_object( 

1349 self._file.read, compute_crc32=False, include_comp=include_comp 

1350 ) 

1351 unpacked.offset = offset 

1352 yield unpacked 

1353 # Back up over unused data. 

1354 self._file.seek(-len(unused), SEEK_CUR) 

1355 

1356 def iterentries( 

1357 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None 

1358 ): 

1359 """Yield entries summarizing the contents of this pack. 

1360 

1361 Args: 

1362 progress: Progress function, called with current and total 

1363 object count. 

1364 Returns: iterator of tuples with (sha, offset, crc32) 

1365 """ 

1366 num_objects = self._num_objects 

1367 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref) 

1368 for i, result in enumerate(indexer): 

1369 if progress is not None: 

1370 progress(i, num_objects) 

1371 yield result 

1372 

1373 def sorted_entries( 

1374 self, 

1375 progress: Optional[ProgressFn] = None, 

1376 resolve_ext_ref: Optional[ResolveExtRefFn] = None, 

1377 ): 

1378 """Return entries in this pack, sorted by SHA. 

1379 

1380 Args: 

1381 progress: Progress function, called with current and total 

1382 object count 

1383 Returns: Iterator of tuples with (sha, offset, crc32) 

1384 """ 

1385 return sorted( 

1386 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref) 

1387 ) 

1388 

1389 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None): 

1390 """Create a version 1 file for this data file. 

1391 

1392 Args: 

1393 filename: Index filename. 

1394 progress: Progress report function 

1395 Returns: Checksum of index file 

1396 """ 

1397 entries = self.sorted_entries( 

1398 progress=progress, resolve_ext_ref=resolve_ext_ref 

1399 ) 

1400 with GitFile(filename, "wb") as f: 

1401 return write_pack_index_v1(f, entries, self.calculate_checksum()) 

1402 

1403 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None): 

1404 """Create a version 2 index file for this data file. 

1405 

1406 Args: 

1407 filename: Index filename. 

1408 progress: Progress report function 

1409 Returns: Checksum of index file 

1410 """ 

1411 entries = self.sorted_entries( 

1412 progress=progress, resolve_ext_ref=resolve_ext_ref 

1413 ) 

1414 with GitFile(filename, "wb") as f: 

1415 return write_pack_index_v2(f, entries, self.calculate_checksum()) 

1416 

1417 def create_index_v3( 

1418 self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1 

1419 ): 

1420 """Create a version 3 index file for this data file. 

1421 

1422 Args: 

1423 filename: Index filename. 

1424 progress: Progress report function 

1425 resolve_ext_ref: Function to resolve external references 

1426 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256) 

1427 Returns: Checksum of index file 

1428 """ 

1429 entries = self.sorted_entries( 

1430 progress=progress, resolve_ext_ref=resolve_ext_ref 

1431 ) 

1432 with GitFile(filename, "wb") as f: 

1433 return write_pack_index_v3( 

1434 f, entries, self.calculate_checksum(), hash_algorithm 

1435 ) 

1436 

1437 def create_index( 

1438 self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1 

1439 ): 

1440 """Create an index file for this data file. 

1441 

1442 Args: 

1443 filename: Index filename. 

1444 progress: Progress report function 

1445 version: Index version (1, 2, or 3) 

1446 resolve_ext_ref: Function to resolve external references 

1447 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256) 

1448 Returns: Checksum of index file 

1449 """ 

1450 if version == 1: 

1451 return self.create_index_v1( 

1452 filename, progress, resolve_ext_ref=resolve_ext_ref 

1453 ) 

1454 elif version == 2: 

1455 return self.create_index_v2( 

1456 filename, progress, resolve_ext_ref=resolve_ext_ref 

1457 ) 

1458 elif version == 3: 

1459 return self.create_index_v3( 

1460 filename, 

1461 progress, 

1462 resolve_ext_ref=resolve_ext_ref, 

1463 hash_algorithm=hash_algorithm, 

1464 ) 

1465 else: 

1466 raise ValueError(f"unknown index format {version}") 

1467 

1468 def get_stored_checksum(self): 

1469 """Return the expected checksum stored in this pack.""" 

1470 self._file.seek(-20, SEEK_END) 

1471 return self._file.read(20) 

1472 

1473 def check(self) -> None: 

1474 """Check the consistency of this pack.""" 

1475 actual = self.calculate_checksum() 

1476 stored = self.get_stored_checksum() 

1477 if actual != stored: 

1478 raise ChecksumMismatch(stored, actual) 

1479 

1480 def get_unpacked_object_at( 

1481 self, offset: int, *, include_comp: bool = False 

1482 ) -> UnpackedObject: 

1483 """Given offset in the packfile return a UnpackedObject.""" 

1484 assert offset >= self._header_size 

1485 self._file.seek(offset) 

1486 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp) 

1487 unpacked.offset = offset 

1488 return unpacked 

1489 

1490 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]: 

1491 """Given an offset in to the packfile return the object that is there. 

1492 

1493 Using the associated index the location of an object can be looked up, 

1494 and then the packfile can be asked directly for that object using this 

1495 function. 

1496 """ 

1497 try: 

1498 return self._offset_cache[offset] 

1499 except KeyError: 

1500 pass 

1501 unpacked = self.get_unpacked_object_at(offset, include_comp=False) 

1502 return (unpacked.pack_type_num, unpacked._obj()) 

1503 

1504 

1505T = TypeVar("T") 

1506 

1507 

1508class DeltaChainIterator(Generic[T]): 

1509 """Abstract iterator over pack data based on delta chains. 

1510 

1511 Each object in the pack is guaranteed to be inflated exactly once, 

1512 regardless of how many objects reference it as a delta base. As a result, 

1513 memory usage is proportional to the length of the longest delta chain. 

1514 

1515 Subclasses can override _result to define the result type of the iterator. 

1516 By default, results are UnpackedObjects with the following members set: 

1517 

1518 * offset 

1519 * obj_type_num 

1520 * obj_chunks 

1521 * pack_type_num 

1522 * delta_base (for delta types) 

1523 * comp_chunks (if _include_comp is True) 

1524 * decomp_chunks 

1525 * decomp_len 

1526 * crc32 (if _compute_crc32 is True) 

1527 """ 

1528 

1529 _compute_crc32 = False 

1530 _include_comp = False 

1531 

1532 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None: 

1533 self._file = file_obj 

1534 self._resolve_ext_ref = resolve_ext_ref 

1535 self._pending_ofs: dict[int, list[int]] = defaultdict(list) 

1536 self._pending_ref: dict[bytes, list[int]] = defaultdict(list) 

1537 self._full_ofs: list[tuple[int, int]] = [] 

1538 self._ext_refs: list[bytes] = [] 

1539 

1540 @classmethod 

1541 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None): 

1542 walker = cls(None, resolve_ext_ref=resolve_ext_ref) 

1543 walker.set_pack_data(pack_data) 

1544 for unpacked in pack_data.iter_unpacked(include_comp=False): 

1545 walker.record(unpacked) 

1546 return walker 

1547 

1548 @classmethod 

1549 def for_pack_subset( 

1550 cls, 

1551 pack: "Pack", 

1552 shas: Iterable[bytes], 

1553 *, 

1554 allow_missing: bool = False, 

1555 resolve_ext_ref=None, 

1556 ): 

1557 walker = cls(None, resolve_ext_ref=resolve_ext_ref) 

1558 walker.set_pack_data(pack.data) 

1559 todo = set() 

1560 for sha in shas: 

1561 assert isinstance(sha, bytes) 

1562 try: 

1563 off = pack.index.object_offset(sha) 

1564 except KeyError: 

1565 if not allow_missing: 

1566 raise 

1567 else: 

1568 todo.add(off) 

1569 done = set() 

1570 while todo: 

1571 off = todo.pop() 

1572 unpacked = pack.data.get_unpacked_object_at(off) 

1573 walker.record(unpacked) 

1574 done.add(off) 

1575 base_ofs = None 

1576 if unpacked.pack_type_num == OFS_DELTA: 

1577 base_ofs = unpacked.offset - unpacked.delta_base 

1578 elif unpacked.pack_type_num == REF_DELTA: 

1579 with suppress(KeyError): 

1580 assert isinstance(unpacked.delta_base, bytes) 

1581 base_ofs = pack.index.object_index(unpacked.delta_base) 

1582 if base_ofs is not None and base_ofs not in done: 

1583 todo.add(base_ofs) 

1584 return walker 

1585 

1586 def record(self, unpacked: UnpackedObject) -> None: 

1587 type_num = unpacked.pack_type_num 

1588 offset = unpacked.offset 

1589 if type_num == OFS_DELTA: 

1590 base_offset = offset - unpacked.delta_base 

1591 self._pending_ofs[base_offset].append(offset) 

1592 elif type_num == REF_DELTA: 

1593 assert isinstance(unpacked.delta_base, bytes) 

1594 self._pending_ref[unpacked.delta_base].append(offset) 

1595 else: 

1596 self._full_ofs.append((offset, type_num)) 

1597 

1598 def set_pack_data(self, pack_data: PackData) -> None: 

1599 self._file = pack_data._file 

1600 

1601 def _walk_all_chains(self): 

1602 for offset, type_num in self._full_ofs: 

1603 yield from self._follow_chain(offset, type_num, None) 

1604 yield from self._walk_ref_chains() 

1605 assert not self._pending_ofs, repr(self._pending_ofs) 

1606 

1607 def _ensure_no_pending(self) -> None: 

1608 if self._pending_ref: 

1609 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref]) 

1610 

1611 def _walk_ref_chains(self): 

1612 if not self._resolve_ext_ref: 

1613 self._ensure_no_pending() 

1614 return 

1615 

1616 for base_sha, pending in sorted(self._pending_ref.items()): 

1617 if base_sha not in self._pending_ref: 

1618 continue 

1619 try: 

1620 type_num, chunks = self._resolve_ext_ref(base_sha) 

1621 except KeyError: 

1622 # Not an external ref, but may depend on one. Either it will 

1623 # get popped via a _follow_chain call, or we will raise an 

1624 # error below. 

1625 continue 

1626 self._ext_refs.append(base_sha) 

1627 self._pending_ref.pop(base_sha) 

1628 for new_offset in pending: 

1629 yield from self._follow_chain(new_offset, type_num, chunks) 

1630 

1631 self._ensure_no_pending() 

1632 

1633 def _result(self, unpacked: UnpackedObject) -> T: 

1634 raise NotImplementedError 

1635 

1636 def _resolve_object( 

1637 self, offset: int, obj_type_num: int, base_chunks: list[bytes] 

1638 ) -> UnpackedObject: 

1639 self._file.seek(offset) 

1640 unpacked, _ = unpack_object( 

1641 self._file.read, 

1642 include_comp=self._include_comp, 

1643 compute_crc32=self._compute_crc32, 

1644 ) 

1645 unpacked.offset = offset 

1646 if base_chunks is None: 

1647 assert unpacked.pack_type_num == obj_type_num 

1648 else: 

1649 assert unpacked.pack_type_num in DELTA_TYPES 

1650 unpacked.obj_type_num = obj_type_num 

1651 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks) 

1652 return unpacked 

1653 

1654 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: list[bytes]): 

1655 # Unlike PackData.get_object_at, there is no need to cache offsets as 

1656 # this approach by design inflates each object exactly once. 

1657 todo = [(offset, obj_type_num, base_chunks)] 

1658 while todo: 

1659 (offset, obj_type_num, base_chunks) = todo.pop() 

1660 unpacked = self._resolve_object(offset, obj_type_num, base_chunks) 

1661 yield self._result(unpacked) 

1662 

1663 unblocked = chain( 

1664 self._pending_ofs.pop(unpacked.offset, []), 

1665 self._pending_ref.pop(unpacked.sha(), []), 

1666 ) 

1667 todo.extend( 

1668 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore 

1669 for new_offset in unblocked 

1670 ) 

1671 

1672 def __iter__(self) -> Iterator[T]: 

1673 return self._walk_all_chains() 

1674 

1675 def ext_refs(self): 

1676 return self._ext_refs 

1677 

1678 

1679class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]): 

1680 """Delta chain iterator that yield unpacked objects.""" 

1681 

1682 def _result(self, unpacked): 

1683 return unpacked 

1684 

1685 

1686class PackIndexer(DeltaChainIterator[PackIndexEntry]): 

1687 """Delta chain iterator that yields index entries.""" 

1688 

1689 _compute_crc32 = True 

1690 

1691 def _result(self, unpacked): 

1692 return unpacked.sha(), unpacked.offset, unpacked.crc32 

1693 

1694 

1695class PackInflater(DeltaChainIterator[ShaFile]): 

1696 """Delta chain iterator that yields ShaFile objects.""" 

1697 

1698 def _result(self, unpacked): 

1699 return unpacked.sha_file() 

1700 

1701 

1702class SHA1Reader(BinaryIO): 

1703 """Wrapper for file-like object that remembers the SHA1 of its data.""" 

1704 

1705 def __init__(self, f) -> None: 

1706 self.f = f 

1707 self.sha1 = sha1(b"") 

1708 

1709 def read(self, size: int = -1) -> bytes: 

1710 data = self.f.read(size) 

1711 self.sha1.update(data) 

1712 return data 

1713 

1714 def check_sha(self, allow_empty: bool = False) -> None: 

1715 stored = self.f.read(20) 

1716 # If git option index.skipHash is set the index will be empty 

1717 if stored != self.sha1.digest() and ( 

1718 not allow_empty 

1719 or sha_to_hex(stored) != b"0000000000000000000000000000000000000000" 

1720 ): 

1721 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored)) 

1722 

1723 def close(self): 

1724 return self.f.close() 

1725 

1726 def tell(self) -> int: 

1727 return self.f.tell() 

1728 

1729 # BinaryIO abstract methods 

1730 def readable(self) -> bool: 

1731 return True 

1732 

1733 def writable(self) -> bool: 

1734 return False 

1735 

1736 def seekable(self) -> bool: 

1737 return getattr(self.f, "seekable", lambda: False)() 

1738 

1739 def seek(self, offset: int, whence: int = 0) -> int: 

1740 return self.f.seek(offset, whence) 

1741 

1742 def flush(self) -> None: 

1743 if hasattr(self.f, "flush"): 

1744 self.f.flush() 

1745 

1746 def readline(self, size: int = -1) -> bytes: 

1747 return self.f.readline(size) 

1748 

1749 def readlines(self, hint: int = -1) -> list[bytes]: 

1750 return self.f.readlines(hint) 

1751 

1752 def writelines(self, lines) -> None: 

1753 raise UnsupportedOperation("writelines") 

1754 

1755 def write(self, data) -> int: 

1756 raise UnsupportedOperation("write") 

1757 

1758 def __enter__(self): 

1759 return self 

1760 

1761 def __exit__(self, type, value, traceback): 

1762 self.close() 

1763 

1764 def __iter__(self): 

1765 return self 

1766 

1767 def __next__(self) -> bytes: 

1768 line = self.readline() 

1769 if not line: 

1770 raise StopIteration 

1771 return line 

1772 

1773 def fileno(self) -> int: 

1774 return self.f.fileno() 

1775 

1776 def isatty(self) -> bool: 

1777 return getattr(self.f, "isatty", lambda: False)() 

1778 

1779 def truncate(self, size: Optional[int] = None) -> int: 

1780 raise UnsupportedOperation("truncate") 

1781 

1782 

1783class SHA1Writer(BinaryIO): 

1784 """Wrapper for file-like object that remembers the SHA1 of its data.""" 

1785 

1786 def __init__(self, f) -> None: 

1787 self.f = f 

1788 self.length = 0 

1789 self.sha1 = sha1(b"") 

1790 

1791 def write(self, data) -> int: 

1792 self.sha1.update(data) 

1793 self.f.write(data) 

1794 self.length += len(data) 

1795 return len(data) 

1796 

1797 def write_sha(self): 

1798 sha = self.sha1.digest() 

1799 assert len(sha) == 20 

1800 self.f.write(sha) 

1801 self.length += len(sha) 

1802 return sha 

1803 

1804 def close(self): 

1805 sha = self.write_sha() 

1806 self.f.close() 

1807 return sha 

1808 

1809 def offset(self): 

1810 return self.length 

1811 

1812 def tell(self) -> int: 

1813 return self.f.tell() 

1814 

1815 # BinaryIO abstract methods 

1816 def readable(self) -> bool: 

1817 return False 

1818 

1819 def writable(self) -> bool: 

1820 return True 

1821 

1822 def seekable(self) -> bool: 

1823 return getattr(self.f, "seekable", lambda: False)() 

1824 

1825 def seek(self, offset: int, whence: int = 0) -> int: 

1826 return self.f.seek(offset, whence) 

1827 

1828 def flush(self) -> None: 

1829 if hasattr(self.f, "flush"): 

1830 self.f.flush() 

1831 

1832 def readline(self, size: int = -1) -> bytes: 

1833 raise UnsupportedOperation("readline") 

1834 

1835 def readlines(self, hint: int = -1) -> list[bytes]: 

1836 raise UnsupportedOperation("readlines") 

1837 

1838 def writelines(self, lines) -> None: 

1839 for line in lines: 

1840 self.write(line) 

1841 

1842 def read(self, size: int = -1) -> bytes: 

1843 raise UnsupportedOperation("read") 

1844 

1845 def __enter__(self): 

1846 return self 

1847 

1848 def __exit__(self, type, value, traceback): 

1849 self.close() 

1850 

1851 def __iter__(self): 

1852 return self 

1853 

1854 def __next__(self) -> bytes: 

1855 raise UnsupportedOperation("__next__") 

1856 

1857 def fileno(self) -> int: 

1858 return self.f.fileno() 

1859 

1860 def isatty(self) -> bool: 

1861 return getattr(self.f, "isatty", lambda: False)() 

1862 

1863 def truncate(self, size: Optional[int] = None) -> int: 

1864 raise UnsupportedOperation("truncate") 
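# A minimal usage sketch for the writer above (illustrative only; assumes an
# in-memory buffer, not part of the module):
#
#     >>> from io import BytesIO
#     >>> buf = BytesIO()
#     >>> w = SHA1Writer(buf)
#     >>> w.write(b"PACK")
#     4
#     >>> trailer = w.write_sha()   # appends the 20-byte SHA-1 of all data written
#     >>> len(buf.getvalue())
#     24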

1865 

1866 

1867def pack_object_header(type_num, delta_base, size): 

1868 """Create a pack object header for the given object info. 

1869 

1870 Args: 

1871 type_num: Numeric type of the object. 

1872 delta_base: Delta base offset or ref, or None for whole objects. 

1873 size: Uncompressed object size. 

1874 Returns: A header for a packed object. 

1875 """ 

1876 header = [] 

1877 c = (type_num << 4) | (size & 15) 

1878 size >>= 4 

1879 while size: 

1880 header.append(c | 0x80) 

1881 c = size & 0x7F 

1882 size >>= 7 

1883 header.append(c) 

1884 if type_num == OFS_DELTA: 

1885 ret = [delta_base & 0x7F] 

1886 delta_base >>= 7 

1887 while delta_base: 

1888 delta_base -= 1 

1889 ret.insert(0, 0x80 | (delta_base & 0x7F)) 

1890 delta_base >>= 7 

1891 header.extend(ret) 

1892 elif type_num == REF_DELTA: 

1893 assert len(delta_base) == 20 

1894 header += delta_base 

1895 return bytearray(header) 
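# Illustrative sketch of the variable-length header encoding above: for a
# non-delta object of type 3 (blob) with size 1000, the low four size bits go
# into the first byte together with the type, and the remaining bits follow
# in seven-bit groups with the continuation bit set:
#
#     >>> bytes(pack_object_header(3, None, 1000))
#     b'\xb8>'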

1896 

1897 

1898def pack_object_chunks(type, object, compression_level=-1): 

1899 """Generate chunks for a pack object. 

1900 

1901 Args: 

1902 type: Numeric type of the object 

1903 object: Object to write 

1904 compression_level: the zlib compression level 

1905 Returns: Chunks 

1906 """ 

1907 if type in DELTA_TYPES: 

1908 delta_base, object = object 

1909 else: 

1910 delta_base = None 

1911 if isinstance(object, bytes): 

1912 object = [object] 

1913 yield bytes(pack_object_header(type, delta_base, sum(map(len, object)))) 

1914 compressor = zlib.compressobj(level=compression_level) 

1915 for data in object: 

1916 yield compressor.compress(data) 

1917 yield compressor.flush() 

1918 

1919 

1920def write_pack_object(write, type, object, sha=None, compression_level=-1): 

1921 """Write pack object to a file. 

1922 

1923 Args: 

1924 write: Write function to use 

1925 type: Numeric type of the object 

1926 object: Object to write 

1927 compression_level: the zlib compression level 

 1928 Returns: CRC32 checksum of the written object data 

1929 """ 

1930 crc32 = 0 

1931 for chunk in pack_object_chunks(type, object, compression_level=compression_level): 

1932 write(chunk) 

1933 if sha is not None: 

1934 sha.update(chunk) 

1935 crc32 = binascii.crc32(chunk, crc32) 

1936 return crc32 & 0xFFFFFFFF 

1937 

1938 

1939def write_pack( 

1940 filename, 

1941 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]], 

1942 *, 

1943 deltify: Optional[bool] = None, 

1944 delta_window_size: Optional[int] = None, 

1945 compression_level: int = -1, 

1946): 

1947 """Write a new pack data file. 

1948 

1949 Args: 

 1950 filename: Path to the new pack file (without .pack extension) 
 objects: Sequence of ShaFile objects, or (object, path) tuples, to write 

1951 delta_window_size: Delta window size 

1952 deltify: Whether to deltify pack objects 

1953 compression_level: the zlib compression level 

1954 Returns: Tuple with checksum of pack file and index file 

1955 """ 

1956 with GitFile(filename + ".pack", "wb") as f: 

1957 entries, data_sum = write_pack_objects( 

1958 f.write, 

1959 objects, 

1960 delta_window_size=delta_window_size, 

1961 deltify=deltify, 

1962 compression_level=compression_level, 

1963 ) 

1964 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()]) 

1965 with GitFile(filename + ".idx", "wb") as f: 

1966 return data_sum, write_pack_index(f, entries, data_sum) 
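# A hedged usage sketch for write_pack (the path below is hypothetical; the
# call writes both "<filename>.pack" and "<filename>.idx"):
#
#     >>> from dulwich.objects import Blob
#     >>> blob = Blob.from_string(b"hello world")
#     >>> data_sum, idx_sum = write_pack("/tmp/demo-pack", [(blob, None)])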

1967 

1968 

1969def pack_header_chunks(num_objects): 

1970 """Yield chunks for a pack header.""" 

1971 yield b"PACK" # Pack header 

1972 yield struct.pack(b">L", 2) # Pack version 

1973 yield struct.pack(b">L", num_objects) # Number of objects in pack 
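# A quick sanity sketch of the fixed 12-byte header produced above (the
# values assume a pack containing three objects):
#
#     >>> b"".join(pack_header_chunks(3))
#     b'PACK\x00\x00\x00\x02\x00\x00\x00\x03'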

1974 

1975 

1976def write_pack_header(write, num_objects) -> None: 

1977 """Write a pack header for the given number of objects.""" 

1978 if hasattr(write, "write"): 

1979 write = write.write 

1980 warnings.warn( 

1981 "write_pack_header() now takes a write rather than file argument", 

1982 DeprecationWarning, 

1983 stacklevel=2, 

1984 ) 

1985 for chunk in pack_header_chunks(num_objects): 

1986 write(chunk) 

1987 

1988 

1989def find_reusable_deltas( 

1990 container: PackedObjectContainer, 

1991 object_ids: set[bytes], 

1992 *, 

1993 other_haves: Optional[set[bytes]] = None, 

1994 progress=None, 

1995) -> Iterator[UnpackedObject]: 

1996 if other_haves is None: 

1997 other_haves = set() 

1998 reused = 0 

1999 for i, unpacked in enumerate( 

2000 container.iter_unpacked_subset( 

2001 object_ids, allow_missing=True, convert_ofs_delta=True 

2002 ) 

2003 ): 

2004 if progress is not None and i % 1000 == 0: 

2005 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode()) 

2006 if unpacked.pack_type_num == REF_DELTA: 

2007 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore 

2008 if hexsha in object_ids or hexsha in other_haves: 

2009 yield unpacked 

2010 reused += 1 

2011 if progress is not None: 

2012 progress((f"found {reused} deltas to reuse\n").encode()) 

2013 

2014 

2015def deltify_pack_objects( 

 2016 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[bytes]]]], 

2017 *, 

2018 window_size: Optional[int] = None, 

2019 progress=None, 

2020) -> Iterator[UnpackedObject]: 

2021 """Generate deltas for pack objects. 

2022 

2023 Args: 

 2024 objects: An iterable of ShaFile objects or (object, path) tuples to deltify. 

2025 window_size: Window size; None for default 

 2026 Returns: Iterator over UnpackedObject entries; 
 
 2027 delta_base is None for full-text entries 

2028 """ 

2029 

2030 def objects_with_hints(): 

2031 for e in objects: 

2032 if isinstance(e, ShaFile): 

2033 yield (e, (e.type_num, None)) 

2034 else: 

2035 yield (e[0], (e[0].type_num, e[1])) 

2036 

2037 yield from deltas_from_sorted_objects( 

2038 sort_objects_for_delta(objects_with_hints()), 

2039 window_size=window_size, 

2040 progress=progress, 

2041 ) 

2042 

2043 

2044def sort_objects_for_delta( 

2045 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]], 

2046) -> Iterator[ShaFile]: 

2047 magic = [] 

2048 for entry in objects: 

2049 if isinstance(entry, tuple): 

2050 obj, hint = entry 

2051 if hint is None: 

2052 type_num = None 

2053 path = None 

2054 else: 

2055 (type_num, path) = hint 

2056 else: 

 2057 obj, type_num, path = entry, None, None 

2058 magic.append((type_num, path, -obj.raw_length(), obj)) 

2059 # Build a list of objects ordered by the magic Linus heuristic 

2060 # This helps us find good objects to diff against us 

2061 magic.sort() 

2062 return (x[3] for x in magic) 

2063 

2064 

2065def deltas_from_sorted_objects( 

2066 objects, window_size: Optional[int] = None, progress=None 

2067): 

2068 # TODO(jelmer): Use threads 

2069 if window_size is None: 

2070 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE 

2071 

2072 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque() 

2073 for i, o in enumerate(objects): 

2074 if progress is not None and i % 1000 == 0: 

2075 progress((f"generating deltas: {i}\r").encode()) 

2076 raw = o.as_raw_chunks() 

2077 winner = raw 

2078 winner_len = sum(map(len, winner)) 

2079 winner_base = None 

2080 for base_id, base_type_num, base in possible_bases: 

2081 if base_type_num != o.type_num: 

2082 continue 

2083 delta_len = 0 

2084 delta = [] 

2085 for chunk in create_delta(base, raw): 

2086 delta_len += len(chunk) 

2087 if delta_len >= winner_len: 

2088 break 

2089 delta.append(chunk) 

2090 else: 

2091 winner_base = base_id 

2092 winner = delta 

2093 winner_len = sum(map(len, winner)) 

2094 yield UnpackedObject( 

2095 o.type_num, 

2096 sha=o.sha().digest(), 

2097 delta_base=winner_base, 

2098 decomp_len=winner_len, 

2099 decomp_chunks=winner, 

2100 ) 

2101 possible_bases.appendleft((o.sha().digest(), o.type_num, raw)) 

2102 while len(possible_bases) > window_size: 

2103 possible_bases.pop() 

2104 

2105 

2106def pack_objects_to_data( 

2107 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]], 

2108 *, 

2109 deltify: Optional[bool] = None, 

2110 delta_window_size: Optional[int] = None, 

2111 ofs_delta: bool = True, 

2112 progress=None, 

2113) -> tuple[int, Iterator[UnpackedObject]]: 

2114 """Create pack data from objects. 

2115 

2116 Args: 

2117 objects: Pack objects 

 2118 Returns: Tuple with the number of objects and an iterator over UnpackedObject entries 

2119 """ 

 2120 # TODO(jelmer): support deltifying 

2121 count = len(objects) 

2122 if deltify is None: 

2123 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too 

2124 # slow at the moment. 

2125 deltify = False 

2126 if deltify: 

2127 return ( 

2128 count, 

2129 deltify_pack_objects( 

2130 iter(objects), # type: ignore 

2131 window_size=delta_window_size, 

2132 progress=progress, 

2133 ), 

2134 ) 

2135 else: 

2136 

2137 def iter_without_path(): 

2138 for o in objects: 

2139 if isinstance(o, tuple): 

2140 yield full_unpacked_object(o[0]) 

2141 else: 

2142 yield full_unpacked_object(o) 

2143 

2144 return (count, iter_without_path()) 
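# Sketch: the (count, iterator) pair returned here feeds write_pack_data()
# directly (illustrative; assumes `objects` is a list of ShaFile instances and
# `buf` is a writable binary file object, both hypothetical here):
#
#     >>> count, records = pack_objects_to_data(objects)
#     >>> entries, checksum = write_pack_data(buf.write, records, num_records=count)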

2145 

2146 

2147def generate_unpacked_objects( 

2148 container: PackedObjectContainer, 

2149 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]], 

2150 delta_window_size: Optional[int] = None, 

2151 deltify: Optional[bool] = None, 

2152 reuse_deltas: bool = True, 

2153 ofs_delta: bool = True, 

2154 other_haves: Optional[set[bytes]] = None, 

2155 progress=None, 

2156) -> Iterator[UnpackedObject]: 

2157 """Create pack data from objects. 

2158 

 2159 Returns: Iterator over UnpackedObject entries 

2160 """ 

2161 todo = dict(object_ids) 

2162 if reuse_deltas: 

2163 for unpack in find_reusable_deltas( 

2164 container, set(todo), other_haves=other_haves, progress=progress 

2165 ): 

2166 del todo[sha_to_hex(unpack.sha())] 

2167 yield unpack 

2168 if deltify is None: 

2169 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too 

2170 # slow at the moment. 

2171 deltify = False 

2172 if deltify: 

2173 objects_to_delta = container.iterobjects_subset( 

2174 todo.keys(), allow_missing=False 

2175 ) 

2176 yield from deltas_from_sorted_objects( 

2177 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta), 

2178 window_size=delta_window_size, 

2179 progress=progress, 

2180 ) 

2181 else: 

2182 for oid in todo: 

2183 yield full_unpacked_object(container[oid]) 

2184 

2185 

2186def full_unpacked_object(o: ShaFile) -> UnpackedObject: 

2187 return UnpackedObject( 

2188 o.type_num, 

2189 delta_base=None, 

2190 crc32=None, 

2191 decomp_chunks=o.as_raw_chunks(), 

2192 sha=o.sha().digest(), 

2193 ) 

2194 

2195 

2196def write_pack_from_container( 

2197 write, 

2198 container: PackedObjectContainer, 

2199 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]], 

2200 delta_window_size: Optional[int] = None, 

2201 deltify: Optional[bool] = None, 

2202 reuse_deltas: bool = True, 

2203 compression_level: int = -1, 

2204 other_haves: Optional[set[bytes]] = None, 

2205): 

2206 """Write a new pack data file. 

2207 

2208 Args: 

2209 write: write function to use 

2210 container: PackedObjectContainer 

2211 delta_window_size: Sliding window size for searching for deltas; 

2212 Set to None for default window size. 

2213 deltify: Whether to deltify objects 

2214 compression_level: the zlib compression level to use 

2215 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2216 """ 

2217 pack_contents_count = len(object_ids) 

2218 pack_contents = generate_unpacked_objects( 

2219 container, 

2220 object_ids, 

2221 delta_window_size=delta_window_size, 

2222 deltify=deltify, 

2223 reuse_deltas=reuse_deltas, 

2224 other_haves=other_haves, 

2225 ) 

2226 

2227 return write_pack_data( 

2228 write, 

2229 pack_contents, 

2230 num_records=pack_contents_count, 

2231 compression_level=compression_level, 

2232 ) 

2233 

2234 

2235def write_pack_objects( 

2236 write, 

2237 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]], 

2238 *, 

2239 delta_window_size: Optional[int] = None, 

2240 deltify: Optional[bool] = None, 

2241 compression_level: int = -1, 

2242): 

2243 """Write a new pack data file. 

2244 

2245 Args: 

2246 write: write function to use 

 2247 objects: Sequence of ShaFile objects, or (object, path) tuples, to write 

2248 delta_window_size: Sliding window size for searching for deltas; 

2249 Set to None for default window size. 

2250 deltify: Whether to deltify objects 

2251 compression_level: the zlib compression level to use 

2252 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2253 """ 

2254 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify) 

2255 

2256 return write_pack_data( 

2257 write, 

2258 pack_contents, 

2259 num_records=pack_contents_count, 

2260 compression_level=compression_level, 

2261 ) 

2262 

2263 

2264class PackChunkGenerator: 

2265 def __init__( 

2266 self, 

2267 num_records=None, 

2268 records=None, 

2269 progress=None, 

2270 compression_level=-1, 

2271 reuse_compressed=True, 

2272 ) -> None: 

2273 self.cs = sha1(b"") 

2274 self.entries: dict[Union[int, bytes], tuple[int, int]] = {} 

2275 self._it = self._pack_data_chunks( 

2276 num_records=num_records, 

2277 records=records, 

2278 progress=progress, 

2279 compression_level=compression_level, 

2280 reuse_compressed=reuse_compressed, 

2281 ) 

2282 

2283 def sha1digest(self): 

2284 return self.cs.digest() 

2285 

2286 def __iter__(self): 

2287 return self._it 

2288 

2289 def _pack_data_chunks( 

2290 self, 

2291 records: Iterator[UnpackedObject], 

2292 *, 

2293 num_records=None, 

2294 progress=None, 

2295 compression_level: int = -1, 

2296 reuse_compressed: bool = True, 

2297 ) -> Iterator[bytes]: 

2298 """Iterate pack data file chunks. 

2299 

2300 Args: 

2301 records: Iterator over UnpackedObject 

2302 num_records: Number of records (defaults to len(records) if not specified) 

2303 progress: Function to report progress to 

2304 compression_level: the zlib compression level 

 2305 Yields: Chunks of pack data; object entries are recorded in self.entries 

2306 """ 

2307 # Write the pack 

2308 if num_records is None: 

2309 num_records = len(records) # type: ignore 

2310 offset = 0 

2311 for chunk in pack_header_chunks(num_records): 

2312 yield chunk 

2313 self.cs.update(chunk) 

2314 offset += len(chunk) 

2315 actual_num_records = 0 

2316 for i, unpacked in enumerate(records): 

2317 type_num = unpacked.pack_type_num 

2318 if progress is not None and i % 1000 == 0: 

2319 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii")) 

2320 raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]] 

2321 if unpacked.delta_base is not None: 

2322 try: 

2323 base_offset, base_crc32 = self.entries[unpacked.delta_base] 

2324 except KeyError: 

2325 type_num = REF_DELTA 

2326 assert isinstance(unpacked.delta_base, bytes) 

2327 raw = (unpacked.delta_base, unpacked.decomp_chunks) 

2328 else: 

2329 type_num = OFS_DELTA 

2330 raw = (offset - base_offset, unpacked.decomp_chunks) 

2331 else: 

2332 raw = unpacked.decomp_chunks 

2333 if unpacked.comp_chunks is not None and reuse_compressed: 

2334 chunks = unpacked.comp_chunks 

2335 else: 

2336 chunks = pack_object_chunks( 

2337 type_num, raw, compression_level=compression_level 

2338 ) 

2339 crc32 = 0 

2340 object_size = 0 

2341 for chunk in chunks: 

2342 yield chunk 

2343 crc32 = binascii.crc32(chunk, crc32) 

2344 self.cs.update(chunk) 

2345 object_size += len(chunk) 

2346 actual_num_records += 1 

2347 self.entries[unpacked.sha()] = (offset, crc32) 

2348 offset += object_size 

2349 if actual_num_records != num_records: 

2350 raise AssertionError( 

2351 f"actual records written differs: {actual_num_records} != {num_records}" 

2352 ) 

2353 

2354 yield self.cs.digest() 

2355 

2356 

2357def write_pack_data( 

2358 write, 

2359 records: Iterator[UnpackedObject], 

2360 *, 

2361 num_records=None, 

2362 progress=None, 

2363 compression_level=-1, 

2364): 

2365 """Write a new pack data file. 

2366 

2367 Args: 

2368 write: Write function to use 

2369 num_records: Number of records (defaults to len(records) if None) 

 2370 records: Iterator over UnpackedObject instances 

2371 progress: Function to report progress to 

2372 compression_level: the zlib compression level 

2373 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum 

2374 """ 

2375 chunk_generator = PackChunkGenerator( 

2376 num_records=num_records, 

2377 records=records, 

2378 progress=progress, 

2379 compression_level=compression_level, 

2380 ) 

2381 for chunk in chunk_generator: 

2382 write(chunk) 

2383 return chunk_generator.entries, chunk_generator.sha1digest() 

2384 

2385 

2386def write_pack_index_v1(f, entries, pack_checksum): 

2387 """Write a new pack index file. 

2388 

2389 Args: 

2390 f: A file-like object to write to 

2391 entries: List of tuples with object name (sha), offset_in_pack, 

2392 and crc32_checksum. 

2393 pack_checksum: Checksum of the pack file. 

2394 Returns: The SHA of the written index file 

2395 """ 

2396 f = SHA1Writer(f) 

2397 fan_out_table = defaultdict(lambda: 0) 

2398 for name, offset, entry_checksum in entries: 

2399 fan_out_table[ord(name[:1])] += 1 

2400 # Fan-out table 

2401 for i in range(0x100): 

2402 f.write(struct.pack(">L", fan_out_table[i])) 

2403 fan_out_table[i + 1] += fan_out_table[i] 

2404 for name, offset, entry_checksum in entries: 

2405 if not (offset <= 0xFFFFFFFF): 

2406 raise TypeError("pack format 1 only supports offsets < 2Gb") 

2407 f.write(struct.pack(">L20s", offset, name)) 

2408 assert len(pack_checksum) == 20 

2409 f.write(pack_checksum) 

2410 return f.write_sha() 

2411 

2412 

2413def _delta_encode_size(size) -> bytes: 

2414 ret = bytearray() 

2415 c = size & 0x7F 

2416 size >>= 7 

2417 while size: 

2418 ret.append(c | 0x80) 

2419 c = size & 0x7F 

2420 size >>= 7 

2421 ret.append(c) 

2422 return bytes(ret) 
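# The size header uses little-endian seven-bit groups with a continuation bit,
# e.g. 1000 = 0b111_1101000 is emitted as (0x68 | 0x80) followed by 0x07:
#
#     >>> _delta_encode_size(1000)
#     b'\xe8\x07'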

2423 

2424 

2425# The length of delta compression copy operations in version 2 packs is limited 

2426# to 64K. To copy more, we use several copy operations. Version 3 packs allow 

2427# 24-bit lengths in copy operations, but we always make version 2 packs. 

2428_MAX_COPY_LEN = 0xFFFF 

2429 

2430 

2431def _encode_copy_operation(start, length): 

2432 scratch = bytearray([0x80]) 

2433 for i in range(4): 

2434 if start & 0xFF << i * 8: 

2435 scratch.append((start >> i * 8) & 0xFF) 

2436 scratch[0] |= 1 << i 

2437 for i in range(2): 

2438 if length & 0xFF << i * 8: 

2439 scratch.append((length >> i * 8) & 0xFF) 

2440 scratch[0] |= 1 << (4 + i) 

2441 return bytes(scratch) 

2442 

2443 

2444def create_delta(base_buf, target_buf): 

2445 """Use python difflib to work out how to transform base_buf to target_buf. 

2446 

2447 Args: 

2448 base_buf: Base buffer 

2449 target_buf: Target buffer 

2450 """ 

2451 if isinstance(base_buf, list): 

2452 base_buf = b"".join(base_buf) 

2453 if isinstance(target_buf, list): 

2454 target_buf = b"".join(target_buf) 

2455 assert isinstance(base_buf, bytes) 

2456 assert isinstance(target_buf, bytes) 

2457 # write delta header 

2458 yield _delta_encode_size(len(base_buf)) 

2459 yield _delta_encode_size(len(target_buf)) 

2460 # write out delta opcodes 

2461 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf) 

2462 for opcode, i1, i2, j1, j2 in seq.get_opcodes(): 

2463 # Git patch opcodes don't care about deletes! 

2464 # if opcode == 'replace' or opcode == 'delete': 

2465 # pass 

2466 if opcode == "equal": 

2467 # If they are equal, unpacker will use data from base_buf 

2468 # Write out an opcode that says what range to use 

2469 copy_start = i1 

2470 copy_len = i2 - i1 

2471 while copy_len > 0: 

2472 to_copy = min(copy_len, _MAX_COPY_LEN) 

2473 yield _encode_copy_operation(copy_start, to_copy) 

2474 copy_start += to_copy 

2475 copy_len -= to_copy 

2476 if opcode == "replace" or opcode == "insert": 

2477 # If we are replacing a range or adding one, then we just 

2478 # output it to the stream (prefixed by its size) 

2479 s = j2 - j1 

2480 o = j1 

2481 while s > 127: 

2482 yield bytes([127]) 

2483 yield memoryview(target_buf)[o : o + 127] 

2484 s -= 127 

2485 o += 127 

2486 yield bytes([s]) 

2487 yield memoryview(target_buf)[o : o + s] 

2488 

2489 

2490def apply_delta(src_buf, delta): 

2491 """Based on the similar function in git's patch-delta.c. 

2492 

2493 Args: 

2494 src_buf: Source buffer 

2495 delta: Delta instructions 

2496 """ 

2497 if not isinstance(src_buf, bytes): 

2498 src_buf = b"".join(src_buf) 

2499 if not isinstance(delta, bytes): 

2500 delta = b"".join(delta) 

2501 out = [] 

2502 index = 0 

2503 delta_length = len(delta) 

2504 

2505 def get_delta_header_size(delta, index): 

2506 size = 0 

2507 i = 0 

2508 while delta: 

2509 cmd = ord(delta[index : index + 1]) 

2510 index += 1 

2511 size |= (cmd & ~0x80) << i 

2512 i += 7 

2513 if not cmd & 0x80: 

2514 break 

2515 return size, index 

2516 

2517 src_size, index = get_delta_header_size(delta, index) 

2518 dest_size, index = get_delta_header_size(delta, index) 

2519 if src_size != len(src_buf): 

2520 raise ApplyDeltaError( 

2521 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}" 

2522 ) 

2523 while index < delta_length: 

2524 cmd = ord(delta[index : index + 1]) 

2525 index += 1 

2526 if cmd & 0x80: 

2527 cp_off = 0 

2528 for i in range(4): 

2529 if cmd & (1 << i): 

2530 x = ord(delta[index : index + 1]) 

2531 index += 1 

2532 cp_off |= x << (i * 8) 

2533 cp_size = 0 

2534 # Version 3 packs can contain copy sizes larger than 64K. 

2535 for i in range(3): 

2536 if cmd & (1 << (4 + i)): 

2537 x = ord(delta[index : index + 1]) 

2538 index += 1 

2539 cp_size |= x << (i * 8) 

2540 if cp_size == 0: 

2541 cp_size = 0x10000 

2542 if ( 

2543 cp_off + cp_size < cp_size 

2544 or cp_off + cp_size > src_size 

2545 or cp_size > dest_size 

2546 ): 

2547 break 

2548 out.append(src_buf[cp_off : cp_off + cp_size]) 

2549 elif cmd != 0: 

2550 out.append(delta[index : index + cmd]) 

2551 index += cmd 

2552 else: 

2553 raise ApplyDeltaError("Invalid opcode 0") 

2554 

2555 if index != delta_length: 

2556 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}") 

2557 

2558 if dest_size != chunks_length(out): 

2559 raise ApplyDeltaError("dest size incorrect") 

2560 

2561 return out 
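# A small round-trip sketch tying create_delta and apply_delta together
# (illustrative only):
#
#     >>> base = b"the quick brown fox"
#     >>> target = b"the quick brown fox jumps"
#     >>> delta = b"".join(create_delta(base, target))
#     >>> b"".join(apply_delta(base, delta)) == target
#     True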

2562 

2563 

2564def write_pack_index_v2( 

2565 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes 

2566) -> bytes: 

2567 """Write a new pack index file. 

2568 

2569 Args: 

2570 f: File-like object to write to 

2571 entries: List of tuples with object name (sha), offset_in_pack, and 

2572 crc32_checksum. 

2573 pack_checksum: Checksum of the pack file. 

2574 Returns: The SHA of the index file written 

2575 """ 

2576 f = SHA1Writer(f) 

2577 f.write(b"\377tOc") # Magic! 

2578 f.write(struct.pack(">L", 2)) 

2579 fan_out_table: dict[int, int] = defaultdict(lambda: 0) 

2580 for name, offset, entry_checksum in entries: 

2581 fan_out_table[ord(name[:1])] += 1 

2582 # Fan-out table 

2583 largetable: list[int] = [] 

2584 for i in range(0x100): 

2585 f.write(struct.pack(b">L", fan_out_table[i])) 

2586 fan_out_table[i + 1] += fan_out_table[i] 

2587 for name, offset, entry_checksum in entries: 

2588 f.write(name) 

2589 for name, offset, entry_checksum in entries: 

2590 f.write(struct.pack(b">L", entry_checksum)) 

2591 for name, offset, entry_checksum in entries: 

2592 if offset < 2**31: 

2593 f.write(struct.pack(b">L", offset)) 

2594 else: 

2595 f.write(struct.pack(b">L", 2**31 + len(largetable))) 

2596 largetable.append(offset) 

2597 for offset in largetable: 

2598 f.write(struct.pack(b">Q", offset)) 

2599 assert len(pack_checksum) == 20 

2600 f.write(pack_checksum) 

2601 return f.write_sha() 

2602 

2603 

2604def write_pack_index_v3( 

2605 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1 

2606) -> bytes: 

2607 """Write a new pack index file in v3 format. 

2608 

2609 Args: 

2610 f: File-like object to write to 

2611 entries: List of tuples with object name (sha), offset_in_pack, and 

2612 crc32_checksum. 

2613 pack_checksum: Checksum of the pack file. 

2614 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256) 

2615 Returns: The SHA of the index file written 

2616 """ 

2617 if hash_algorithm == 1: 

2618 hash_size = 20 # SHA-1 

2619 writer_cls = SHA1Writer 

2620 elif hash_algorithm == 2: 

2621 hash_size = 32 # SHA-256 

2622 # TODO: Add SHA256Writer when SHA-256 support is implemented 

2623 raise NotImplementedError("SHA-256 support not yet implemented") 

2624 else: 

2625 raise ValueError(f"Unknown hash algorithm {hash_algorithm}") 

2626 

2627 # Convert entries to list to allow multiple iterations 

2628 entries_list = list(entries) 

2629 

2630 # Calculate shortest unambiguous prefix length for object names 

2631 # For now, use full hash size (this could be optimized) 

2632 shortened_oid_len = hash_size 

2633 

2634 f = writer_cls(f) 

2635 f.write(b"\377tOc") # Magic! 

2636 f.write(struct.pack(">L", 3)) # Version 3 

2637 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm 

2638 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length 

2639 

2640 fan_out_table: dict[int, int] = defaultdict(lambda: 0) 

2641 for name, offset, entry_checksum in entries_list: 

2642 if len(name) != hash_size: 

2643 raise ValueError( 

2644 f"Object name has wrong length: expected {hash_size}, got {len(name)}" 

2645 ) 

2646 fan_out_table[ord(name[:1])] += 1 

2647 

2648 # Fan-out table 

2649 largetable: list[int] = [] 

2650 for i in range(0x100): 

2651 f.write(struct.pack(b">L", fan_out_table[i])) 

2652 fan_out_table[i + 1] += fan_out_table[i] 

2653 

2654 # Object names table 

2655 for name, offset, entry_checksum in entries_list: 

2656 f.write(name) 

2657 

2658 # CRC32 checksums table 

2659 for name, offset, entry_checksum in entries_list: 

2660 f.write(struct.pack(b">L", entry_checksum)) 

2661 

2662 # Offset table 

2663 for name, offset, entry_checksum in entries_list: 

2664 if offset < 2**31: 

2665 f.write(struct.pack(b">L", offset)) 

2666 else: 

2667 f.write(struct.pack(b">L", 2**31 + len(largetable))) 

2668 largetable.append(offset) 

2669 

2670 # Large offset table 

2671 for offset in largetable: 

2672 f.write(struct.pack(b">Q", offset)) 

2673 

2674 assert len(pack_checksum) == hash_size, ( 

2675 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}" 

2676 ) 

2677 f.write(pack_checksum) 

2678 return f.write_sha() 

2679 

2680 

2681def write_pack_index( 

2682 index_filename, entries, pack_checksum, progress=None, version=None 

2683): 

2684 """Write a pack index file. 

2685 

2686 Args: 

 2687 index_filename: File-like object to write the index to. 

2688 entries: List of (checksum, offset, crc32) tuples 

2689 pack_checksum: Checksum of the pack file. 

2690 progress: Progress function (not currently used) 

2691 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION. 

2692 

2693 Returns: 

2694 SHA of the written index file 

2695 """ 

2696 if version is None: 

2697 version = DEFAULT_PACK_INDEX_VERSION 

2698 

2699 if version == 1: 

2700 return write_pack_index_v1(index_filename, entries, pack_checksum) 

2701 elif version == 2: 

2702 return write_pack_index_v2(index_filename, entries, pack_checksum) 

2703 elif version == 3: 

2704 return write_pack_index_v3(index_filename, entries, pack_checksum) 

2705 else: 

2706 raise ValueError(f"Unsupported pack index version: {version}") 
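# Usage sketch (assumes `entries` is a sorted list of (sha, offset, crc32)
# tuples and `pack_checksum` is the 20-byte pack trailer; the path below is
# hypothetical):
#
#     >>> with GitFile("/tmp/demo-pack.idx", "wb") as f:
#     ...     idx_sha = write_pack_index(f, entries, pack_checksum, version=2)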

2707 

2708 

2709class Pack: 

2710 """A Git pack object.""" 

2711 

2712 _data_load: Optional[Callable[[], PackData]] 

2713 _idx_load: Optional[Callable[[], PackIndex]] 

2714 

2715 _data: Optional[PackData] 

2716 _idx: Optional[PackIndex] 

2717 

2718 def __init__( 

2719 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None 

2720 ) -> None: 

2721 self._basename = basename 

2722 self._data = None 

2723 self._idx = None 

2724 self._idx_path = self._basename + ".idx" 

2725 self._data_path = self._basename + ".pack" 

2726 self._data_load = lambda: PackData(self._data_path) 

2727 self._idx_load = lambda: load_pack_index(self._idx_path) 

2728 self.resolve_ext_ref = resolve_ext_ref 

2729 

2730 @classmethod 

2731 def from_lazy_objects(cls, data_fn, idx_fn): 

2732 """Create a new pack object from callables to load pack data and 

2733 index objects. 

2734 """ 

2735 ret = cls("") 

2736 ret._data_load = data_fn 

2737 ret._idx_load = idx_fn 

2738 return ret 

2739 

2740 @classmethod 

2741 def from_objects(cls, data, idx): 

2742 """Create a new pack object from pack data and index objects.""" 

2743 ret = cls("") 

2744 ret._data = data 

2745 ret._data_load = None 

2746 ret._idx = idx 

2747 ret._idx_load = None 

2748 ret.check_length_and_checksum() 

2749 return ret 

2750 

2751 def name(self): 

2752 """The SHA over the SHAs of the objects in this pack.""" 

2753 return self.index.objects_sha1() 

2754 

2755 @property 

2756 def data(self) -> PackData: 

2757 """The pack data object being used.""" 

2758 if self._data is None: 

2759 assert self._data_load 

2760 self._data = self._data_load() 

2761 self.check_length_and_checksum() 

2762 return self._data 

2763 

2764 @property 

2765 def index(self) -> PackIndex: 

2766 """The index being used. 

2767 

2768 Note: This may be an in-memory index 

2769 """ 

2770 if self._idx is None: 

2771 assert self._idx_load 

2772 self._idx = self._idx_load() 

2773 return self._idx 

2774 

2775 def close(self) -> None: 

2776 if self._data is not None: 

2777 self._data.close() 

2778 if self._idx is not None: 

2779 self._idx.close() 

2780 

2781 def __enter__(self): 

2782 return self 

2783 

2784 def __exit__(self, exc_type, exc_val, exc_tb): 

2785 self.close() 

2786 

2787 def __eq__(self, other): 

2788 return isinstance(self, type(other)) and self.index == other.index 

2789 

2790 def __len__(self) -> int: 

2791 """Number of entries in this pack.""" 

2792 return len(self.index) 

2793 

2794 def __repr__(self) -> str: 

2795 return f"{self.__class__.__name__}({self._basename!r})" 

2796 

2797 def __iter__(self): 

2798 """Iterate over all the sha1s of the objects in this pack.""" 

2799 return iter(self.index) 

2800 

2801 def check_length_and_checksum(self) -> None: 

2802 """Sanity check the length and checksum of the pack index and data.""" 

2803 assert len(self.index) == len(self.data), ( 

2804 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)" 

2805 ) 

2806 idx_stored_checksum = self.index.get_pack_checksum() 

2807 data_stored_checksum = self.data.get_stored_checksum() 

2808 if idx_stored_checksum != data_stored_checksum: 

2809 raise ChecksumMismatch( 

2810 sha_to_hex(idx_stored_checksum), 

2811 sha_to_hex(data_stored_checksum), 

2812 ) 

2813 

2814 def check(self) -> None: 

2815 """Check the integrity of this pack. 

2816 

2817 Raises: 

2818 ChecksumMismatch: if a checksum for the index or data is wrong 

2819 """ 

2820 self.index.check() 

2821 self.data.check() 

2822 for obj in self.iterobjects(): 

2823 obj.check() 

2824 # TODO: object connectivity checks 

2825 

2826 def get_stored_checksum(self) -> bytes: 

2827 return self.data.get_stored_checksum() 

2828 

2829 def pack_tuples(self): 

2830 return [(o, None) for o in self.iterobjects()] 

2831 

2832 def __contains__(self, sha1: bytes) -> bool: 

2833 """Check whether this pack contains a particular SHA1.""" 

2834 try: 

2835 self.index.object_offset(sha1) 

2836 return True 

2837 except KeyError: 

2838 return False 

2839 

2840 def get_raw(self, sha1: bytes) -> tuple[int, bytes]: 

2841 offset = self.index.object_offset(sha1) 

2842 obj_type, obj = self.data.get_object_at(offset) 

2843 type_num, chunks = self.resolve_object(offset, obj_type, obj) 

2844 return type_num, b"".join(chunks) 

2845 

2846 def __getitem__(self, sha1: bytes) -> ShaFile: 

2847 """Retrieve the specified SHA1.""" 

2848 type, uncomp = self.get_raw(sha1) 

2849 return ShaFile.from_raw_string(type, uncomp, sha=sha1) 

2850 

2851 def iterobjects(self) -> Iterator[ShaFile]: 

2852 """Iterate over the objects in this pack.""" 

2853 return iter( 

2854 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref) 

2855 ) 

2856 

2857 def iterobjects_subset( 

2858 self, shas: Iterable[ObjectID], *, allow_missing: bool = False 

2859 ) -> Iterator[ShaFile]: 

2860 return ( 

2861 uo 

2862 for uo in PackInflater.for_pack_subset( 

2863 self, 

2864 shas, 

2865 allow_missing=allow_missing, 

2866 resolve_ext_ref=self.resolve_ext_ref, 

2867 ) 

2868 if uo.id in shas 

2869 ) 

2870 

2871 def iter_unpacked_subset( 

2872 self, 

2873 shas: Iterable[ObjectID], 

2874 *, 

2875 include_comp: bool = False, 

2876 allow_missing: bool = False, 

2877 convert_ofs_delta: bool = False, 

2878 ) -> Iterator[UnpackedObject]: 

2879 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list) 

2880 ofs: dict[bytes, int] = {} 

2881 todo = set(shas) 

2882 for unpacked in self.iter_unpacked(include_comp=include_comp): 

2883 sha = unpacked.sha() 

2884 ofs[unpacked.offset] = sha 

2885 hexsha = sha_to_hex(sha) 

2886 if hexsha in todo: 

2887 if unpacked.pack_type_num == OFS_DELTA: 

2888 assert isinstance(unpacked.delta_base, int) 

2889 base_offset = unpacked.offset - unpacked.delta_base 

2890 try: 

2891 unpacked.delta_base = ofs[base_offset] 

2892 except KeyError: 

2893 ofs_pending[base_offset].append(unpacked) 

2894 continue 

2895 else: 

2896 unpacked.pack_type_num = REF_DELTA 

2897 yield unpacked 

2898 todo.remove(hexsha) 

2899 for child in ofs_pending.pop(unpacked.offset, []): 

2900 child.pack_type_num = REF_DELTA 

2901 child.delta_base = sha 

2902 yield child 

2903 assert not ofs_pending 

2904 if not allow_missing and todo: 

2905 raise UnresolvedDeltas(todo) 

2906 

2907 def iter_unpacked(self, include_comp=False): 

2908 ofs_to_entries = { 

2909 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries() 

2910 } 

2911 for unpacked in self.data.iter_unpacked(include_comp=include_comp): 

2912 (sha, crc32) = ofs_to_entries[unpacked.offset] 

2913 unpacked._sha = sha 

2914 unpacked.crc32 = crc32 

2915 yield unpacked 

2916 

2917 def keep(self, msg: Optional[bytes] = None) -> str: 

2918 """Add a .keep file for the pack, preventing git from garbage collecting it. 

2919 

2920 Args: 

2921 msg: A message written inside the .keep file; can be used later 

2922 to determine whether or not a .keep file is obsolete. 

2923 Returns: The path of the .keep file, as a string. 

2924 """ 

2925 keepfile_name = f"{self._basename}.keep" 

2926 with GitFile(keepfile_name, "wb") as keepfile: 

2927 if msg: 

2928 keepfile.write(msg) 

2929 keepfile.write(b"\n") 

2930 return keepfile_name 

2931 

2932 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]: 

2933 """Get the object for a ref SHA, only looking in this pack.""" 

2934 # TODO: cache these results 

2935 try: 

2936 offset = self.index.object_offset(sha) 

2937 except KeyError: 

2938 offset = None 

2939 if offset: 

2940 type, obj = self.data.get_object_at(offset) 

2941 elif self.resolve_ext_ref: 

2942 type, obj = self.resolve_ext_ref(sha) 

2943 else: 

2944 raise KeyError(sha) 

2945 return offset, type, obj 

2946 

2947 def resolve_object( 

2948 self, offset: int, type: int, obj, get_ref=None 

2949 ) -> tuple[int, Iterable[bytes]]: 

2950 """Resolve an object, possibly resolving deltas when necessary. 

2951 

2952 Returns: Tuple with object type and contents. 

2953 """ 

2954 # Walk down the delta chain, building a stack of deltas to reach 

2955 # the requested object. 

2956 base_offset = offset 

2957 base_type = type 

2958 base_obj = obj 

2959 delta_stack = [] 

2960 while base_type in DELTA_TYPES: 

2961 prev_offset = base_offset 

2962 if get_ref is None: 

2963 get_ref = self.get_ref 

2964 if base_type == OFS_DELTA: 

2965 (delta_offset, delta) = base_obj 

2966 # TODO: clean up asserts and replace with nicer error messages 

2967 base_offset = base_offset - delta_offset 

2968 base_type, base_obj = self.data.get_object_at(base_offset) 

2969 assert isinstance(base_type, int) 

2970 elif base_type == REF_DELTA: 

2971 (basename, delta) = base_obj 

2972 assert isinstance(basename, bytes) and len(basename) == 20 

2973 base_offset, base_type, base_obj = get_ref(basename) 

2974 assert isinstance(base_type, int) 

2975 if base_offset == prev_offset: # object is based on itself 

2976 raise UnresolvedDeltas(sha_to_hex(basename)) 

2977 delta_stack.append((prev_offset, base_type, delta)) 

2978 

2979 # Now grab the base object (mustn't be a delta) and apply the 

2980 # deltas all the way up the stack. 

2981 chunks = base_obj 

2982 for prev_offset, delta_type, delta in reversed(delta_stack): 

2983 chunks = apply_delta(chunks, delta) 

2984 # TODO(dborowitz): This can result in poor performance if 

2985 # large base objects are separated from deltas in the pack. 

2986 # We should reorganize so that we apply deltas to all 

2987 # objects in a chain one after the other to optimize cache 

2988 # performance. 

2989 if prev_offset is not None: 

2990 self.data._offset_cache[prev_offset] = base_type, chunks 

2991 return base_type, chunks 

2992 

2993 def entries( 

2994 self, progress: Optional[ProgressFn] = None 

2995 ) -> Iterator[PackIndexEntry]: 

2996 """Yield entries summarizing the contents of this pack. 

2997 

2998 Args: 

2999 progress: Progress function, called with current and total 

3000 object count. 

3001 Returns: iterator of tuples with (sha, offset, crc32) 

3002 """ 

3003 return self.data.iterentries( 

3004 progress=progress, resolve_ext_ref=self.resolve_ext_ref 

3005 ) 

3006 

3007 def sorted_entries( 

3008 self, progress: Optional[ProgressFn] = None 

3009 ) -> Iterator[PackIndexEntry]: 

3010 """Return entries in this pack, sorted by SHA. 

3011 

3012 Args: 

3013 progress: Progress function, called with current and total 

3014 object count 

3015 Returns: Iterator of tuples with (sha, offset, crc32) 

3016 """ 

3017 return self.data.sorted_entries( 

3018 progress=progress, resolve_ext_ref=self.resolve_ext_ref 

3019 ) 

3020 

3021 def get_unpacked_object( 

3022 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True 

3023 ) -> UnpackedObject: 

3024 """Get the unpacked object for a sha. 

3025 

3026 Args: 

3027 sha: SHA of object to fetch 

3028 include_comp: Whether to include compression data in UnpackedObject 

3029 """ 

3030 offset = self.index.object_offset(sha) 

3031 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp) 

3032 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta: 

3033 assert isinstance(unpacked.delta_base, int) 

3034 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base) 

3035 unpacked.pack_type_num = REF_DELTA 

3036 return unpacked 
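# A brief Pack usage sketch (the basename below is hypothetical; it refers to
# on-disk "<basename>.pack" and "<basename>.idx" files):
#
#     >>> pack = Pack("/path/to/pack-1234abcd")
#     >>> len(pack)                 # number of objects in the pack
#     >>> for sha in pack:          # object ids from the index
#     ...     obj = pack[sha]
#     >>> pack.close()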

3037 

3038 

3039def extend_pack( 

3040 f: BinaryIO, 

3041 object_ids: set[ObjectID], 

3042 get_raw, 

3043 *, 

3044 compression_level=-1, 

3045 progress=None, 

3046) -> tuple[bytes, list]: 

3047 """Extend a pack file with more objects. 

3048 

3049 The caller should make sure that object_ids does not contain any objects 

 3050 that are already in the pack. 

3051 """ 

3052 # Update the header with the new number of objects. 

3053 f.seek(0) 

3054 _version, num_objects = read_pack_header(f.read) 

3055 

3056 if object_ids: 

3057 f.seek(0) 

3058 write_pack_header(f.write, num_objects + len(object_ids)) 

3059 

3060 # Must flush before reading (http://bugs.python.org/issue3207) 

3061 f.flush() 

3062 

3063 # Rescan the rest of the pack, computing the SHA with the new header. 

3064 new_sha = compute_file_sha(f, end_ofs=-20) 

3065 

3066 # Must reposition before writing (http://bugs.python.org/issue3207) 

3067 f.seek(0, os.SEEK_CUR) 

3068 

3069 extra_entries = [] 

3070 

3071 # Complete the pack. 

3072 for i, object_id in enumerate(object_ids): 

3073 if progress is not None: 

3074 progress( 

3075 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii") 

3076 ) 

3077 assert len(object_id) == 20 

3078 type_num, data = get_raw(object_id) 

3079 offset = f.tell() 

3080 crc32 = write_pack_object( 

3081 f.write, 

3082 type_num, 

3083 data, 

3084 sha=new_sha, 

3085 compression_level=compression_level, 

3086 ) 

3087 extra_entries.append((object_id, offset, crc32)) 

3088 pack_sha = new_sha.digest() 

3089 f.write(pack_sha) 

3090 return pack_sha, extra_entries 

3091 

3092 

3093try: 

3094 from dulwich._pack import ( # type: ignore 

3095 apply_delta, # type: ignore 

3096 bisect_find_sha, # type: ignore 

3097 ) 

3098except ImportError: 

3099 pass