Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 24%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28They have two parts, the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in all of the index files 'til you find a
32match for the object name. You then use the pointer got from this as
33a pointer in to the corresponding packfile.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from typing import (
57 IO,
58 TYPE_CHECKING,
59 Any,
60 BinaryIO,
61 Callable,
62 Generic,
63 Optional,
64 Protocol,
65 TypeVar,
66 Union,
67)
69try:
70 import mmap
71except ImportError:
72 has_mmap = False
73else:
74 has_mmap = True
76if TYPE_CHECKING:
77 from .commit_graph import CommitGraph
79# For some reason the above try, except fails to set has_mmap = False for plan9
80if sys.platform == "Plan9":
81 has_mmap = False
83from . import replace_me
84from .errors import ApplyDeltaError, ChecksumMismatch
85from .file import GitFile, _GitFile
86from .lru_cache import LRUSizeCache
87from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
89OFS_DELTA = 6
90REF_DELTA = 7
92DELTA_TYPES = (OFS_DELTA, REF_DELTA)
95DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
97# Keep pack files under 16Mb in memory, otherwise write them out to disk
98PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
100# Default pack index version to use when none is specified
101DEFAULT_PACK_INDEX_VERSION = 2
104OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
105ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
106ProgressFn = Callable[[int, str], None]
107PackHint = tuple[int, Optional[bytes]]
class UnresolvedDeltas(Exception):
    """Raised when delta objects in a pack could not be resolved.

    Attributes:
        shas: Binary SHAs of the objects whose delta bases were missing.
    """

    def __init__(self, shas: list[bytes]) -> None:
        """Record which object SHAs were left unresolved.

        Args:
            shas: SHAs of the unresolvable delta objects.
        """
        # NOTE: deliberately no super().__init__() call, matching the original;
        # BaseException.__new__ already captures the constructor args.
        self.shas = shas
class ObjectContainer(Protocol):
    """Protocol for a container that stores and retrieves git objects."""

    def add_object(self, obj: ShaFile) -> None:
        """Add a single object to this object store."""

    def add_objects(
        self,
        objects: Sequence[tuple[ShaFile, Optional[str]]],
        progress: Optional[Callable[[str], None]] = None,
    ) -> None:
        """Add a set of objects to this object store.

        Args:
            objects: Iterable over a list of (object, path) tuples
            progress: Optional progress callback taking a status string
        """

    def __contains__(self, sha1: bytes) -> bool:
        """Check if a hex sha is present."""

    def __getitem__(self, sha1: bytes) -> ShaFile:
        """Retrieve an object."""

    def get_commit_graph(self) -> Optional["CommitGraph"]:
        """Get the commit graph for this object store.

        Returns:
            CommitGraph object if available, None otherwise
        """
        # Default: no commit graph support; implementations may override.
        return None
class PackedObjectContainer(ObjectContainer):
    """Object container that can also serve objects in packed (raw) form."""

    def get_unpacked_object(
        self, sha1: bytes, *, include_comp: bool = False
    ) -> "UnpackedObject":
        """Get a raw unresolved object.

        Args:
            sha1: SHA of the object to retrieve
            include_comp: If True, include the compressed chunks as well
        """
        raise NotImplementedError(self.get_unpacked_object)

    def iterobjects_subset(
        self, shas: Iterable[bytes], *, allow_missing: bool = False
    ) -> Iterator[ShaFile]:
        """Iterate over the ShaFile objects for the given SHAs.

        Args:
            shas: SHAs of the objects to return
            allow_missing: If True, silently skip SHAs not in the container
        """
        raise NotImplementedError(self.iterobjects_subset)

    def iter_unpacked_subset(
        self,
        shas: set[bytes],
        include_comp: bool = False,
        allow_missing: bool = False,
        convert_ofs_delta: bool = True,
    ) -> Iterator["UnpackedObject"]:
        """Iterate over UnpackedObject instances for the given SHAs.

        Args:
            shas: SHAs of the objects to return
            include_comp: If True, include compressed chunks
            allow_missing: If True, silently skip SHAs not in the container
            convert_ofs_delta: If True, rewrite offset deltas as ref deltas
        """
        raise NotImplementedError(self.iter_unpacked_subset)
class UnpackedObjectStream:
    """Abstract base class for a stream of unpacked objects."""

    def __iter__(self) -> Iterator["UnpackedObject"]:
        """Iterate over the UnpackedObject instances in this stream."""
        raise NotImplementedError(self.__iter__)

    def __len__(self) -> int:
        """Return the number of objects in this stream."""
        raise NotImplementedError(self.__len__)
def take_msb_bytes(
    read: Callable[[int], bytes], crc32: Optional[int] = None
) -> tuple[list[int], Optional[int]]:
    """Read a run of bytes whose most significant bit flags continuation.

    Bytes are consumed one at a time until a byte with the MSB cleared is
    seen; that final byte is included in the result.

    Args:
        read: Read function returning the requested number of bytes.
        crc32: Running CRC32 to update with the consumed bytes, or None to
            skip checksum computation.
    Returns: Tuple of (list of byte values read, updated CRC32 or None).
    """
    values: list[int] = []
    more = True
    while more:
        byte = read(1)
        if crc32 is not None:
            crc32 = binascii.crc32(byte, crc32)
        value = ord(byte[:1])
        values.append(value)
        more = bool(value & 0x80)
    return values, crc32
class PackFileDisappeared(Exception):
    """Raised when a pack file unexpectedly disappears."""

    def __init__(self, obj: object) -> None:
        """Remember which object noticed the disappearance.

        Args:
            obj: The pack-related object whose backing file vanished.
        """
        self.obj = obj
class UnpackedObject:
    """Class encapsulating an object unpacked from a pack file.

    These objects should only be created from within unpack_object. Most
    members start out as empty and are filled in at various points by
    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.

    End users of this object should take care that the function they're getting
    this object from is guaranteed to set the members they need.
    """

    __slots__ = [
        "_sha",  # Cached binary SHA.
        "comp_chunks",  # Compressed object chunks.
        "crc32",  # CRC32.
        "decomp_chunks",  # Decompressed object chunks.
        "decomp_len",  # Decompressed length of this object.
        "delta_base",  # Delta base offset or SHA.
        "obj_chunks",  # Decompressed and delta-resolved chunks.
        "obj_type_num",  # Type of this object.
        "offset",  # Offset in its pack.
        "pack_type_num",  # Type of this object in the pack (may be a delta).
    ]

    obj_type_num: Optional[int]
    obj_chunks: Optional[list[bytes]]
    delta_base: Union[None, bytes, int]
    decomp_chunks: list[bytes]
    comp_chunks: Optional[list[bytes]]
    decomp_len: Optional[int]
    crc32: Optional[int]
    offset: Optional[int]
    pack_type_num: int
    _sha: Optional[bytes]

    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
    # methods of this object.
    def __init__(
        self,
        pack_type_num: int,
        *,
        delta_base: Union[None, bytes, int] = None,
        decomp_len: Optional[int] = None,
        crc32: Optional[int] = None,
        sha: Optional[bytes] = None,
        decomp_chunks: Optional[list[bytes]] = None,
        offset: Optional[int] = None,
    ) -> None:
        """Initialize the unpacked object.

        Args:
            pack_type_num: Type of this object as stored in the pack
                (may be a delta type).
            delta_base: Delta base offset or SHA for delta objects.
            decomp_len: Decompressed length; derived from decomp_chunks
                when chunks are given but no length is.
            crc32: CRC32 of the compressed data, if known.
            sha: Pre-computed binary SHA, if known.
            decomp_chunks: Decompressed object chunks.
            offset: Offset of this object within its pack.
        """
        self.offset = offset
        self._sha = sha
        self.pack_type_num = pack_type_num
        self.delta_base = delta_base
        self.comp_chunks = None
        self.decomp_chunks: list[bytes] = decomp_chunks or []
        if decomp_chunks is not None and decomp_len is None:
            self.decomp_len = sum(map(len, decomp_chunks))
        else:
            self.decomp_len = decomp_len
        self.crc32 = crc32

        if pack_type_num in DELTA_TYPES:
            # Delta objects don't know their resolved type/content yet.
            self.obj_type_num = None
            self.obj_chunks = None
        else:
            self.obj_type_num = pack_type_num
            self.obj_chunks = self.decomp_chunks
            # NOTE: the original code redundantly re-assigned delta_base
            # here; it is already set unconditionally above.

    def sha(self) -> bytes:
        """Return the binary SHA of this object, computing and caching it."""
        if self._sha is None:
            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
        return self._sha

    def sha_file(self) -> "ShaFile":
        """Return a ShaFile from this object."""
        assert self.obj_type_num is not None and self.obj_chunks is not None
        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)

    # Only provided for backwards compatibility with code that expects either
    # chunks or a delta tuple.
    def _obj(self) -> "OldUnpackedObject":
        """Return the decompressed chunks, or (delta base, delta chunks)."""
        if self.pack_type_num in DELTA_TYPES:
            assert isinstance(self.delta_base, (bytes, int))
            return (self.delta_base, self.decomp_chunks)
        else:
            return self.decomp_chunks

    def __eq__(self, other: object) -> bool:
        """Compare all slots for equality with another UnpackedObject."""
        if not isinstance(other, UnpackedObject):
            return False
        for slot in self.__slots__:
            if getattr(self, slot) != getattr(other, slot):
                return False
        return True

    def __ne__(self, other: object) -> bool:
        """Check inequality with another UnpackedObject."""
        return not (self == other)

    def __repr__(self) -> str:
        """Return string representation of this UnpackedObject."""
        data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
        return "{}({})".format(self.__class__.__name__, ", ".join(data))
310_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
def read_zlib_chunks(
    read_some: Callable[[int], bytes],
    unpacked: UnpackedObject,
    include_comp: bool = False,
    buffer_size: int = _ZLIB_BUFSIZE,
) -> bytes:
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    Args:
        read_some: Read function that returns at least one byte, but may
            return less than the requested size.
        unpacked: An UnpackedObject to write result data to. If its crc32
            attr is not None, the CRC32 of the compressed bytes will be
            computed using this starting CRC32.
            After this function, will have the following attrs set:
            * comp_chunks (if include_comp is True)
            * decomp_chunks
            * decomp_len
            * crc32
        include_comp: If True, include compressed data in the result.
        buffer_size: Size of the read buffer.
    Returns: Leftover unused data from the decompression.

    Raises:
        zlib.error: if a decompression error occurred.
    """
    expected_len = unpacked.decomp_len
    if expected_len is None or expected_len < 0:
        raise ValueError("non-negative zlib data stream size expected")

    decompressor = zlib.decompressobj()
    raw_chunks: list[bytes] = []
    out_chunks = unpacked.decomp_chunks
    total_out = 0
    running_crc = unpacked.crc32

    while True:
        raw = read_some(buffer_size)
        if not raw:
            raise zlib.error("EOF before end of zlib stream")
        raw_chunks.append(raw)
        piece = decompressor.decompress(raw)
        total_out += len(piece)
        out_chunks.append(piece)
        trailing = decompressor.unused_data
        if trailing:
            # The final read overshot the zlib stream: only the consumed
            # prefix of this chunk belongs to the compressed object.
            consumed = raw[: -len(trailing)]
            if running_crc is not None:
                running_crc = binascii.crc32(consumed, running_crc)
            if include_comp:
                raw_chunks[-1] = consumed
            break
        if running_crc is not None:
            running_crc = binascii.crc32(raw, running_crc)

    if running_crc is not None:
        running_crc &= 0xFFFFFFFF
    if total_out != expected_len:
        raise zlib.error("decompressed data does not match expected size")

    unpacked.crc32 = running_crc
    if include_comp:
        unpacked.comp_chunks = raw_chunks
    return trailing
def iter_sha1(iter: Iterable[bytes]) -> bytes:
    """Return the hexdigest of the SHA1 over a set of names.

    Args:
        iter: Iterator over string objects
    Returns: 40-byte hex sha1 digest
    """
    digest = sha1()
    for chunk in iter:
        digest.update(chunk)
    return digest.hexdigest().encode("ascii")
def load_pack_index(path: Union[str, os.PathLike]) -> "PackIndex":
    """Load an index file by path.

    Args:
        path: Path to the index file
    Returns: A PackIndex loaded from the given path
    """
    # GitFile provides safe read semantics; format sniffing and version
    # dispatch happen in load_pack_index_file.
    with GitFile(path, "rb") as f:
        return load_pack_index_file(path, f)
def _load_file_contents(
    f: Union[IO[bytes], _GitFile], size: Optional[int] = None
) -> tuple[Union[bytes, Any], int]:
    """Load contents from a file, preferring mmap when possible.

    Args:
        f: File-like object to load
        size: Expected size, or None to determine from file
    Returns: Tuple of (contents, size)
    """
    try:
        fd = f.fileno()
    except (UnsupportedOperation, AttributeError):
        fd = None
    if fd is not None:
        if size is None:
            size = os.fstat(fd).st_size
        if has_mmap:
            # mmap can fail for sockets or odd descriptors; fall back to a
            # plain read() in that case.
            with suppress(OSError, ValueError):
                return mmap.mmap(fd, size, access=mmap.ACCESS_READ), size
    data = f.read()
    return data, len(data)
def load_pack_index_file(
    path: Union[str, os.PathLike], f: Union[IO[bytes], _GitFile]
) -> "PackIndex":
    """Load an index file from a file-like object.

    Args:
        path: Path for the index file
        f: File-like object
    Returns: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    if contents[:4] != b"\377tOc":
        # No magic prefix: version 1 indexes have no header at all.
        return PackIndex1(path, file=f, contents=contents, size=size)
    (version,) = struct.unpack(b">L", contents[4:8])
    if version == 2:
        return PackIndex2(path, file=f, contents=contents, size=size)
    if version == 3:
        return PackIndex3(path, file=f, contents=contents, size=size)
    raise KeyError(f"Unknown pack index format {version}")
def bisect_find_sha(
    start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes]
) -> Optional[int]:
    """Find a SHA in a data blob with sorted SHAs.

    Args:
        start: Start index of range to search
        end: End index of range to search (inclusive)
        sha: Sha to find
        unpack_name: Callback to retrieve SHA by index
    Returns: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    lo, hi = start, end
    while lo <= hi:
        mid = (lo + hi) // 2
        candidate = unpack_name(mid)
        if candidate == sha:
            return mid
        if candidate < sha:
            lo = mid + 1
        else:
            hi = mid - 1
    return None
484PackIndexEntry = tuple[bytes, int, Optional[int]]
class PackIndex:
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.
    """

    # Default to SHA-1 for backward compatibility
    hash_algorithm = 1
    hash_size = 20

    def __eq__(self, other: object) -> bool:
        """Compare indexes by the ordered object names they contain.

        Offsets and CRC32 values are intentionally ignored.
        """
        from itertools import zip_longest

        if not isinstance(other, PackIndex):
            return False

        # Use zip_longest rather than zip: zip would stop at the shorter
        # index and incorrectly report two indexes as equal whenever one
        # index's entries are a prefix of the other's.
        sentinel = (None, None, None)
        for (name1, _, _), (name2, _, _) in zip_longest(
            self.iterentries(), other.iterentries(), fillvalue=sentinel
        ):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other: object) -> bool:
        """Check if this pack index is not equal to another."""
        return not self.__eq__(other)

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        raise NotImplementedError(self.__len__)

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over the hex SHAs in this pack."""
        return map(sha_to_hex, self._itersha())

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        raise NotImplementedError(self.iterentries)

    def get_pack_checksum(self) -> Optional[bytes]:
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest, or None if not available
        """
        raise NotImplementedError(self.get_pack_checksum)

    @replace_me(since="0.21.0", remove_in="0.23.0")
    def object_index(self, sha: bytes) -> int:
        """Deprecated alias for object_offset."""
        return self.object_offset(sha)

    def object_offset(self, sha: bytes) -> int:
        """Return the offset in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        raise NotImplementedError(self.object_offset)

    def object_sha1(self, index: int) -> bytes:
        """Return the SHA1 corresponding to the index in the pack file."""
        # Linear scan; subclasses with direct lookup tables should override.
        for name, offset, _crc32 in self.iterentries():
            if offset == index:
                return name
        else:
            raise KeyError(index)

    def _object_offset(self, sha: bytes) -> int:
        """See object_offset.

        Args:
            sha: A *binary* SHA string. (20 characters long)_
        """
        raise NotImplementedError(self._object_offset)

    def objects_sha1(self) -> bytes:
        """Return the hex SHA1 over all the shas of all objects in this pack.

        Note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def _itersha(self) -> Iterator[bytes]:
        """Yield all the SHA1's of the objects in the index, sorted."""
        raise NotImplementedError(self._itersha)

    def close(self) -> None:
        """Close any open files."""

    def check(self) -> None:
        """Check the consistency of this pack index."""
class MemoryPackIndex(PackIndex):
    """Pack index that is stored entirely in memory."""

    def __init__(
        self,
        entries: list[tuple[bytes, int, Optional[int]]],
        pack_checksum: Optional[bytes] = None,
    ) -> None:
        """Create a new MemoryPackIndex.

        Args:
            entries: Sequence of name, idx, crc32 (sorted)
            pack_checksum: Optional pack checksum
        """
        # Two lookup directions: name -> offset and offset -> name.
        self._by_sha = {name: offset for name, offset, _ in entries}
        self._by_offset = {offset: name for name, offset, _ in entries}
        self._entries = entries
        self._pack_checksum = pack_checksum

    def get_pack_checksum(self) -> Optional[bytes]:
        """Return the SHA checksum stored for the corresponding packfile."""
        return self._pack_checksum

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        return len(self._entries)

    def object_offset(self, sha: bytes) -> int:
        """Return the offset for the given SHA.

        Args:
            sha: SHA to look up (binary or hex)
        Returns: Offset in the pack file
        """
        key = hex_to_sha(sha) if len(sha) == 40 else sha
        return self._by_sha[key]

    def object_sha1(self, offset: int) -> bytes:
        """Return the SHA1 for the object at the given offset."""
        return self._by_offset[offset]

    def _itersha(self) -> Iterator[bytes]:
        """Iterate over all SHA1s in the index."""
        return iter(self._by_sha)

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over all index entries."""
        return iter(self._entries)

    @classmethod
    def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex":
        """Create a MemoryPackIndex from a PackData object."""
        return MemoryPackIndex(
            list(pack_data.sorted_entries()), pack_data.get_stored_checksum()
        )

    @classmethod
    def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex":
        """Create a copy of another PackIndex in memory."""
        return cls(list(other_index.iterentries()), other_index.get_pack_checksum())
class FilePackIndex(PackIndex):
    """Pack index that is based on a file.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is
    present.
    """

    _fan_out_table: list[int]

    def __init__(
        self,
        filename: Union[str, os.PathLike],
        file: Optional[BinaryIO] = None,
        contents: Optional[Union[bytes, "mmap.mmap"]] = None,
        size: Optional[int] = None,
    ) -> None:
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.

        Args:
            filename: Path to the index file
            file: Optional already-open file object
            contents: Optional pre-loaded contents (bytes or mmap)
            size: Optional size of the contents
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, "rb")
        else:
            self._file = file
        if contents is None:
            self._contents, self._size = _load_file_contents(self._file, size)
        else:
            self._contents = contents
            self._size = size if size is not None else len(contents)

    @property
    def path(self) -> str:
        """Return the path to this index file."""
        return os.fspath(self._filename)

    def __eq__(self, other: object) -> bool:
        """Compare with another index, short-circuiting on the fan-out table."""
        # Quick optimization: differing fan-out tables mean differing entries.
        if (
            isinstance(other, FilePackIndex)
            and self._fan_out_table != other._fan_out_table
        ):
            return False

        return super().__eq__(other)

    def close(self) -> None:
        """Close the underlying file and any mmap."""
        self._file.close()
        close_fn = getattr(self._contents, "close", None)
        if close_fn is not None:
            close_fn()

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        # The last fan-out slot counts all objects.
        return self._fan_out_table[-1]

    def _unpack_entry(self, i: int) -> PackIndexEntry:
        """Unpack the i-th entry in the index file.

        Returns: Tuple with object name (SHA), offset in pack file and CRC32
            checksum (if known).
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i) -> bytes:
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i) -> int:
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i) -> Optional[int]:
        """Unpack the crc32 checksum for the ith object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def _itersha(self) -> Iterator[bytes]:
        """Iterate over all SHA1s in the index."""
        for i in range(len(self)):
            yield self._unpack_name(i)

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset: int) -> list[int]:
        """Read the fan-out table from the index.

        The fan-out table contains 256 entries mapping first byte values
        to the number of objects with SHA1s less than or equal to that byte.

        Args:
            start_offset: Offset in the file where the fan-out table starts
        Returns: List of 256 integers
        """
        ret = []
        for i in range(0x100):
            fanout_entry = self._contents[
                start_offset + i * 4 : start_offset + (i + 1) * 4
            ]
            ret.append(struct.unpack(">L", fanout_entry)[0])
        return ret

    def check(self) -> None:
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self) -> bytes:
        """Calculate the SHA1 checksum over this pack index.

        Returns: This is a 20-byte binary digest
        """
        # Everything except the trailing 20-byte checksum is covered.
        return sha1(self._contents[:-20]).digest()

    def get_pack_checksum(self) -> bytes:
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-40:-20])

    def get_stored_checksum(self) -> bytes:
        """Return the SHA1 checksum stored for this index.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-20:])

    def object_offset(self, sha: bytes) -> int:
        """Return the offset in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        try:
            return self._object_offset(sha)
        except ValueError as exc:
            # An mmap lookup on a closed/vanished file raises ValueError;
            # surface that as a pack disappearance so callers can retry.
            closed = getattr(self._contents, "closed", None)
            if closed in (None, True):
                raise PackFileDisappeared(self) from exc
            raise

    def _object_offset(self, sha: bytes) -> int:
        """See object_offset.

        Args:
            sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        idx = ord(sha[:1])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx - 1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)

    def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
        """Iterate over all SHA1s with the given prefix."""
        first_byte = ord(prefix[:1])
        if first_byte == 0:
            start = 0
        else:
            start = self._fan_out_table[first_byte - 1]
        # Entries whose SHA starts with first_byte occupy exactly
        # [fan_out[first_byte - 1], fan_out[first_byte]). The previous code
        # used fan_out[first_byte + 1], needlessly scanning the following
        # first-byte bucket as well (output was unchanged, but wasteful).
        # fan_out[0xFF] equals len(self), so no special case is needed.
        end = self._fan_out_table[first_byte]
        assert start <= end
        started = False
        for i in range(start, end):
            name: bytes = self._unpack_name(i)
            if name.startswith(prefix):
                yield name
                started = True
            elif started:
                # Matches are contiguous in a sorted table; stop early.
                break
class PackIndex1(FilePackIndex):
    """Version 1 Pack Index file."""

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v1 index; the fan-out table sits at the very start."""
        super().__init__(filename, file, contents, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _record_offset(self, i):
        """Byte offset of the i-th 24-byte (offset, sha) record."""
        return (0x100 * 4) + (i * 24)

    def _unpack_entry(self, i):
        """Unpack the i-th entry from the v1 index."""
        (offset, name) = unpack_from(">L20s", self._contents, self._record_offset(i))
        return (name, offset, None)

    def _unpack_name(self, i):
        """Unpack the i-th SHA1 from the v1 index."""
        base = self._record_offset(i) + 4
        return self._contents[base : base + 20]

    def _unpack_offset(self, i):
        """Unpack the i-th offset from the v1 index."""
        return unpack_from(">L", self._contents, self._record_offset(i))[0]

    def _unpack_crc32_checksum(self, i) -> None:
        """Return None as v1 indexes don't store CRC32 checksums."""
        return None
class PackIndex2(FilePackIndex):
    """Version 2 Pack Index file."""

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v2 index, validating the header and precomputing offsets."""
        super().__init__(filename, file, contents, size)
        if self._contents[:4] != b"\377tOc":
            raise AssertionError("Not a v2 pack index file")
        (self.version,) = unpack_from(b">L", self._contents, 4)
        if self.version != 2:
            raise AssertionError(f"Version was {self.version}")
        self._fan_out_table = self._read_fan_out_table(8)
        n = len(self)
        # Layout after the fan-out table: sorted 20-byte names, CRC32s,
        # 4-byte offsets, then 8-byte offsets for entries >= 2 GiB.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * n
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * n
        self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * n

    def _unpack_entry(self, i):
        """Unpack the i-th (name, offset, crc32) entry from the v2 index."""
        return (
            self._unpack_name(i),
            self._unpack_offset(i),
            self._unpack_crc32_checksum(i),
        )

    def _unpack_name(self, i):
        """Unpack the i-th SHA1 from the v2 index."""
        start = self._name_table_offset + i * 20
        return self._contents[start : start + 20]

    def _unpack_offset(self, i):
        """Unpack the i-th offset from the v2 index.

        Handles large offsets (>2GB) by reading from the large offset table.
        """
        (value,) = unpack_from(
            ">L", self._contents, self._pack_offset_table_offset + i * 4
        )
        if not value & (2**31):
            return value
        # High bit set: the low 31 bits index the 8-byte large-offset table.
        large = self._pack_offset_largetable_offset + (value & (2**31 - 1)) * 8
        return unpack_from(">Q", self._contents, large)[0]

    def _unpack_crc32_checksum(self, i):
        """Unpack the i-th CRC32 checksum from the v2 index."""
        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
class PackIndex3(FilePackIndex):
    """Version 3 Pack Index file.

    Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
    """

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v3 index, reading its hash parameters and table offsets."""
        super().__init__(filename, file, contents, size)
        if self._contents[:4] != b"\377tOc":
            raise AssertionError("Not a v3 pack index file")
        (self.version,) = unpack_from(b">L", self._contents, 4)
        if self.version != 3:
            raise AssertionError(f"Version was {self.version}")

        # Hash algorithm identifier: 1 = SHA-1, 2 = SHA-256.
        (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
        if self.hash_algorithm == 1:
            self.hash_size = 20  # SHA-1
        elif self.hash_algorithm == 2:
            self.hash_size = 32  # SHA-256
        else:
            raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")

        # Length of shortened object names.
        (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)

        # The fan-out table follows the 16-byte header (magic, version,
        # hash algorithm, shortened-OID length: 4 bytes each).
        self._fan_out_table = self._read_fan_out_table(16)
        n = len(self)
        self._name_table_offset = 16 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + self.hash_size * n
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * n
        self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * n

    def _unpack_entry(self, i):
        """Unpack the i-th (name, offset, crc32) entry from the v3 index."""
        return (
            self._unpack_name(i),
            self._unpack_offset(i),
            self._unpack_crc32_checksum(i),
        )

    def _unpack_name(self, i):
        """Unpack the i-th object name (hash_size bytes) from the v3 index."""
        start = self._name_table_offset + i * self.hash_size
        return self._contents[start : start + self.hash_size]

    def _unpack_offset(self, i):
        """Unpack the i-th offset, following the large-offset table if needed."""
        (value,) = unpack_from(
            ">L", self._contents, self._pack_offset_table_offset + i * 4
        )
        if not value & (2**31):
            return value
        large = self._pack_offset_largetable_offset + (value & (2**31 - 1)) * 8
        return unpack_from(">Q", self._contents, large)[0]

    def _unpack_crc32_checksum(self, i):
        """Unpack the i-th CRC32 checksum from the v3 index."""
        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
def read_pack_header(read) -> tuple[int, int]:
    """Read the header of a pack file.

    Args:
        read: Read function
    Returns: Tuple of (pack version, number of objects)

    Raises:
        AssertionError: if no data is available, the magic is not b"PACK",
            or the version is not 2 or 3. (The previous docstring claimed
            (None, None) was returned on empty input; the code has always
            raised instead.)
    """
    header = read(12)
    if not header:
        raise AssertionError("file too short to contain pack")
    if header[:4] != b"PACK":
        raise AssertionError(f"Invalid pack header {header!r}")
    (version,) = unpack_from(b">L", header, 4)
    if version not in (2, 3):
        raise AssertionError(f"Version was {version}")
    (num_objects,) = unpack_from(b">L", header, 8)
    return (version, num_objects)
def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
    """Get the total length of a sequence of chunks.

    Args:
        chunks: Either a single bytes object or an iterable of bytes
    Returns: Total length in bytes
    """
    if isinstance(chunks, bytes):
        return len(chunks)
    return sum(len(chunk) for chunk in chunks)
def unpack_object(
    read_all: Callable[[int], bytes],
    read_some: Optional[Callable[[int], bytes]] = None,
    compute_crc32=False,
    include_comp=False,
    zlib_bufsize=_ZLIB_BUFSIZE,
) -> tuple[UnpackedObject, bytes]:
    """Unpack a Git object.

    Args:
      read_all: Read function that blocks until the number of requested
        bytes are read.
      read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
      compute_crc32: If True, compute the CRC32 of the compressed data. If
        False, the returned CRC32 will be None.
      include_comp: If True, include compressed data in the result.
      zlib_bufsize: An optional buffer size for zlib operations.
    Returns: A tuple of (unpacked, unused), where unused is the unused data
      leftover from decompression, and unpacked is an UnpackedObject with
      the following attrs set:

      * obj_chunks (for non-delta types)
      * pack_type_num
      * delta_base (for delta types)
      * comp_chunks (if include_comp is True)
      * decomp_chunks
      * decomp_len
      * crc32 (if compute_crc32 is True)
    """
    if read_some is None:
        read_some = read_all
    if compute_crc32:
        crc32 = 0
    else:
        crc32 = None

    raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
    # First header byte: object type lives in bits 4-6, and bits 0-3 are
    # the lowest 4 bits of the uncompressed size.
    type_num = (raw[0] >> 4) & 0x07
    size = raw[0] & 0x0F
    # Each continuation byte contributes 7 more (higher-order) size bits.
    for i, byte in enumerate(raw[1:]):
        size += (byte & 0x7F) << ((i * 7) + 4)

    delta_base: Union[int, bytes, None]
    # NOTE(review): raw_base accumulates the header length but is never read
    # after this function — appears vestigial; confirm before removing.
    raw_base = len(raw)
    if type_num == OFS_DELTA:
        raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
        raw_base += len(raw)
        # take_msb_bytes stops after the first byte without the MSB set, so
        # the final byte must never have it set.
        if raw[-1] & 0x80:
            raise AssertionError
        # Git's negative-offset encoding: big-endian base-128 with an extra
        # +1 per continuation byte to exclude redundant encodings.
        delta_base_offset = raw[0] & 0x7F
        for byte in raw[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += byte & 0x7F
        delta_base = delta_base_offset
    elif type_num == REF_DELTA:
        # Ref deltas identify their base by its 20-byte binary SHA, which is
        # also folded into the CRC32 of the entry.
        delta_base_obj = read_all(20)
        if crc32 is not None:
            crc32 = binascii.crc32(delta_base_obj, crc32)
        delta_base = delta_base_obj
        raw_base += 20
    else:
        delta_base = None

    unpacked = UnpackedObject(
        type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
    )
    # Decompress the payload; any bytes read past the end of the zlib stream
    # are returned to the caller as `unused`.
    unused = read_zlib_chunks(
        read_some,
        unpacked,
        buffer_size=zlib_bufsize,
        include_comp=include_comp,
    )
    return unpacked, unused
def _compute_object_size(value):
    """Compute the size of an unresolved object for use with LRUSizeCache.

    Args:
      value: Tuple of (type_num, object_chunks)
    Returns: Size in bytes
    """
    type_num, obj = value
    # Delta entries carry (base, delta_chunks); only the delta payload counts.
    if type_num in DELTA_TYPES:
        return chunks_length(obj[1])
    return chunks_length(obj)
class PackStreamReader:
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
        """Initialize the reader.

        Args:
          read_all: Read function that blocks until the requested number of
            bytes are read.
          read_some: Read function that returns at least one byte but may
            return fewer than requested; defaults to read_all.
          zlib_bufsize: Buffer size for zlib decompression.
        """
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        else:
            self.read_some = read_some
        self.sha = sha1()
        self._offset = 0
        self._rbuf = BytesIO()
        # trailer is a deque to avoid memory allocation on small reads
        # (elements are ints, since iterating bytes yields ints)
        self._trailer: deque[int] = deque()
        self._zlib_bufsize = zlib_bufsize

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read).

        Args:
          read: The read callback to read from.
          size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        Returns: Bytes read
        """
        data = read(size)

        # maintain a trailer of the last 20 bytes we've read
        n = len(data)
        self._offset += n
        tn = len(self._trailer)
        if n >= 20:
            # Everything previously buffered is definitely not the trailer.
            to_pop = tn
            to_add = 20
        else:
            # Keep at most 20 buffered bytes; hash whatever falls off.
            to_pop = max(n + tn - 20, 0)
            to_add = n
        self.sha.update(
            bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
        )
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])
        return data

    def _buf_len(self):
        """Get the number of bytes in the read buffer."""
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        end = buf.tell()
        buf.seek(start)
        return end - start

    @property
    def offset(self):
        """Return the current offset in the pack stream."""
        # Bytes sitting in the read buffer have been consumed from the wire
        # but not yet from the stream's perspective.
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        if buf_len >= size:
            return self._rbuf.read(size)
        buf_data = self._rbuf.read()
        self._rbuf = BytesIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        if buf_len:
            # Serve from the buffer first; may return fewer than size bytes.
            data = self._rbuf.read(size)
            if size >= buf_len:
                self._rbuf = BytesIO()
            return data
        return self._read(self.read_some, size)

    def __len__(self) -> int:
        # _num_objects is only set once read_objects() has parsed the header.
        return self._num_objects

    def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
        """Read the objects in this pack file.

        Args:
          compute_crc32: If True, compute the CRC32 of the compressed
            data. If False, the returned CRC32 will be None.
        Returns: Iterator over UnpackedObjects with the following members set:
            offset
            obj_type_num
            obj_chunks (for non-delta types)
            delta_base (for delta types)
            decomp_chunks
            decomp_len
            crc32 (if compute_crc32 is True)

        Raises:
          ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
          zlib.error: if an error occurred during zlib decompression.
          IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)

        for _ in range(self._num_objects):
            offset = self.offset
            unpacked, unused = unpack_object(
                self.read,
                read_some=self.recv,
                compute_crc32=compute_crc32,
                zlib_bufsize=self._zlib_bufsize,
            )
            unpacked.offset = offset

            # prepend any unused data to current read buffer
            buf = BytesIO()
            buf.write(unused)
            buf.write(self._rbuf.read())
            buf.seek(0)
            self._rbuf = buf

            yield unpacked

        if self._buf_len() < 20:
            # If the read buffer is full, then the last read() got the whole
            # trailer off the wire. If not, it means there is still some of the
            # trailer to read. We need to read() all 20 bytes; N come from the
            # read buffer and (20 - N) come from the wire.
            self.read(20)

        pack_sha = bytearray(self._trailer)  # type: ignore
        if pack_sha != self.sha.digest():
            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
class PackStreamCopier(PackStreamReader):
    """Class to verify a pack stream as it is being read.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate and written out to the given file-like object.
    """

    def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
        """Initialize the copier.

        Args:
          read_all: Read function that blocks until the number of
            requested bytes are read.
          read_some: Read function that returns at least one byte, but may
            not return the number of bytes requested.
          outfile: File-like object to write output through.
          delta_iter: Optional DeltaChainIterator to record deltas as we
            read them.
        """
        super().__init__(read_all, read_some=read_some)
        self.outfile = outfile
        self._delta_iter = delta_iter

    def _read(self, read, size):
        """Read data from the read callback and write it to the file.

        Args:
          read: Read callback function
          size: Number of bytes to read
        Returns: Data read
        """
        data = super()._read(read, size)
        self.outfile.write(data)
        return data

    def verify(self, progress=None) -> None:
        """Verify a pack stream and write it to the output file.

        See PackStreamReader.iterobjects for a list of exceptions this may
        throw.

        Args:
          progress: Optional progress callback invoked with bytes messages.
        """
        count = 0  # number of entries copied so far
        for i, unpacked in enumerate(self.read_objects()):
            if self._delta_iter:
                self._delta_iter.record(unpacked)
            if progress is not None:
                progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
            count = i + 1
        if progress is not None:
            # Fix: previously reported the last *index* (count - 1), under-
            # counting by one whenever at least one entry was copied.
            progress(f"copied {count} pack entries\n".encode("ascii"))
def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks.

    Args:
      type: Numeric type of the object
      chunks: Object data as bytes or iterable of bytes
    Returns: SHA-1 digest (20 bytes)
    """
    digest = sha1()
    digest.update(object_header(type, chunks_length(chunks)))
    if isinstance(chunks, bytes):
        digest.update(chunks)
    else:
        for piece in chunks:
            digest.update(piece)
    return digest.digest()
def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
    """Hash a portion of a file into a new SHA.

    Args:
      f: A file-like object to read from that supports seek().
      start_ofs: The offset in the file to start reading at.
      end_ofs: The offset in the file to end reading at, relative to the
        end of the file.
      buffer_size: A buffer size for reading.
    Returns: A new SHA object updated with data read from the file.
    Raises:
      AssertionError: if the requested range lies outside the file, or the
        file yields fewer bytes than expected.
    """
    sha = sha1()
    f.seek(0, SEEK_END)
    length = f.tell()
    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
        raise AssertionError(
            f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
        )
    todo = length + end_ofs - start_ofs
    f.seek(start_ofs)
    # `> 0` (not truthiness) so an inconsistent negative count cannot loop.
    while todo > 0:
        data = f.read(min(todo, buffer_size))
        if not data:
            # Fix: a short read at EOF (e.g. file shrank, or start_ofs past
            # the end with end_ofs >= 0) previously spun forever here.
            raise AssertionError(f"EOF reached with {todo} bytes left to hash")
        sha.update(data)
        todo -= len(data)
    return sha
class PackData:
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(
        self,
        filename: Union[str, os.PathLike],
        file=None,
        size=None,
        *,
        delta_window_size=None,
        window_memory=None,
        delta_cache_size=None,
        depth=None,
        threads=None,
        big_file_threshold=None,
    ) -> None:
        """Create a PackData object representing the pack in the given filename.

        The file must exist and stay readable until the object is disposed of.
        It must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        # Pack-writing tuning knobs; stored but not interpreted here.
        self.delta_window_size = delta_window_size
        self.window_memory = window_memory
        self.delta_cache_size = delta_cache_size
        self.depth = depth
        self.threads = threads
        self.big_file_threshold = big_file_threshold

        if file is None:
            self._file = GitFile(self._filename, "rb")
        else:
            self._file = file
        # Reading the header also validates the magic/version and positions
        # the file just past the 12-byte header.
        (version, self._num_objects) = read_pack_header(self._file.read)

        # Use delta_cache_size config if available, otherwise default
        cache_size = delta_cache_size or (1024 * 1024 * 20)
        self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
            cache_size, compute_size=_compute_object_size
        )

    @property
    def filename(self):
        """Return the base name of the pack file."""
        return os.path.basename(self._filename)

    @property
    def path(self):
        """Return the full path of the pack file."""
        return self._filename

    @classmethod
    def from_file(cls, file, size=None):
        """Create a PackData object from an open file object."""
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path: Union[str, os.PathLike]):
        """Create a PackData object from a path to a pack file."""
        return cls(filename=path)

    def close(self) -> None:
        """Close the underlying pack file."""
        self._file.close()

    def __enter__(self):
        """Enter a context manager; returns self."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager, closing the pack file."""
        self.close()

    def __eq__(self, other):
        """Check equality based on pack checksum."""
        # NOTE(review): __eq__ without __hash__ makes instances unhashable —
        # confirm PackData objects are never used in sets or as dict keys.
        if isinstance(other, PackData):
            return self.get_stored_checksum() == other.get_stored_checksum()
        return False

    def _get_size(self):
        """Get the size of the pack file.

        Returns: Size in bytes
        Raises: AssertionError if file is too small to be a pack
        """
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
            raise AssertionError(errmsg)
        return self._size

    def __len__(self) -> int:
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        Returns: 20-byte binary SHA1 digest
        """
        # Hash everything except the 20-byte trailing checksum itself.
        return compute_file_sha(self._file, end_ofs=-20).digest()

    def iter_unpacked(self, *, include_comp: bool = False):
        """Iterate over unpacked objects in the pack.

        Args:
          include_comp: If True, include compressed object data
        Yields: UnpackedObject instances
        """
        self._file.seek(self._header_size)

        if self._num_objects is None:
            return

        for _ in range(self._num_objects):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=False, include_comp=include_comp
            )
            unpacked.offset = offset
            yield unpacked
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def iterentries(
        self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
    ):
        """Yield entries summarizing the contents of this pack.

        Args:
          progress: Progress function, called with current and total
            object count.
          resolve_ext_ref: Optional function to resolve external references.
        Returns: iterator of tuples with (sha, offset, crc32)
        """
        num_objects = self._num_objects
        indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
        for i, result in enumerate(indexer):
            if progress is not None:
                progress(i, num_objects)
            yield result

    def sorted_entries(
        self,
        progress: Optional[ProgressFn] = None,
        resolve_ext_ref: Optional[ResolveExtRefFn] = None,
    ):
        """Return entries in this pack, sorted by SHA.

        Args:
          progress: Progress function, called with current and total
            object count
        Returns: Iterator of tuples with (sha, offset, crc32)
        """
        return sorted(
            self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
        )

    def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
        """Create a version 1 file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v1(f, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
        """Create a version 2 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v2(f, entries, self.calculate_checksum())

    def create_index_v3(
        self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
    ):
        """Create a version 3 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          resolve_ext_ref: Function to resolve external references
          hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v3(
                f, entries, self.calculate_checksum(), hash_algorithm
            )

    def create_index(
        self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
    ):
        """Create an index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          version: Index version (1, 2, or 3)
          resolve_ext_ref: Function to resolve external references
          hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
        Returns: Checksum of index file
        Raises: ValueError for an unknown index version
        """
        if version == 1:
            return self.create_index_v1(
                filename, progress, resolve_ext_ref=resolve_ext_ref
            )
        elif version == 2:
            return self.create_index_v2(
                filename, progress, resolve_ext_ref=resolve_ext_ref
            )
        elif version == 3:
            return self.create_index_v3(
                filename,
                progress,
                resolve_ext_ref=resolve_ext_ref,
                hash_algorithm=hash_algorithm,
            )
        else:
            raise ValueError(f"unknown index format {version}")

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        # The last 20 bytes of a pack file are its SHA-1 trailer.
        self._file.seek(-20, SEEK_END)
        return self._file.read(20)

    def check(self) -> None:
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def get_unpacked_object_at(
        self, offset: int, *, include_comp: bool = False
    ) -> UnpackedObject:
        """Given offset in the packfile return a UnpackedObject."""
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
        unpacked.offset = offset
        return unpacked

    def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        try:
            return self._offset_cache[offset]
        except KeyError:
            pass
        unpacked = self.get_unpacked_object_at(offset, include_comp=False)
        return (unpacked.pack_type_num, unpacked._obj())
# Result type yielded by DeltaChainIterator subclasses (see _result below).
T = TypeVar("T")
class DeltaChainIterator(Generic[T]):
    """Abstract iterator over pack data based on delta chains.

    Each object in the pack is guaranteed to be inflated exactly once,
    regardless of how many objects reference it as a delta base. As a result,
    memory usage is proportional to the length of the longest delta chain.

    Subclasses can override _result to define the result type of the iterator.
    By default, results are UnpackedObjects with the following members set:

    * offset
    * obj_type_num
    * obj_chunks
    * pack_type_num
    * delta_base (for delta types)
    * comp_chunks (if _include_comp is True)
    * decomp_chunks
    * decomp_len
    * crc32 (if _compute_crc32 is True)
    """

    _compute_crc32 = False
    _include_comp = False

    def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
        """Initialize the iterator.

        Args:
          file_obj: Pack data file object (may be None if set later via
            set_pack_data).
          resolve_ext_ref: Optional callback resolving an external base SHA
            to a (type_num, chunks) pair.
        """
        self._file = file_obj
        self._resolve_ext_ref = resolve_ext_ref
        # base offset -> offsets of ofs-deltas waiting for that base
        self._pending_ofs: dict[int, list[int]] = defaultdict(list)
        # base SHA -> offsets of ref-deltas waiting for that base
        self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
        # (offset, type_num) of non-delta objects: the chain roots
        self._full_ofs: list[tuple[int, int]] = []
        # base SHAs that were resolved outside this pack
        self._ext_refs: list[bytes] = []

    @classmethod
    def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
        """Create an iterator covering every object in pack_data."""
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack_data)
        for unpacked in pack_data.iter_unpacked(include_comp=False):
            walker.record(unpacked)
        return walker

    @classmethod
    def for_pack_subset(
        cls,
        pack: "Pack",
        shas: Iterable[bytes],
        *,
        allow_missing: bool = False,
        resolve_ext_ref=None,
    ):
        """Create an iterator over the given SHAs plus their delta bases.

        Args:
          pack: Pack to read from.
          shas: Iterable of binary object SHAs to include.
          allow_missing: If True, silently skip SHAs absent from the pack.
          resolve_ext_ref: Optional external-base resolver.
        """
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack.data)
        todo = set()
        for sha in shas:
            assert isinstance(sha, bytes)
            try:
                off = pack.index.object_offset(sha)
            except KeyError:
                if not allow_missing:
                    raise
            else:
                todo.add(off)
        # Transitively pull in every delta base reachable from the subset.
        done = set()
        while todo:
            off = todo.pop()
            unpacked = pack.data.get_unpacked_object_at(off)
            walker.record(unpacked)
            done.add(off)
            base_ofs = None
            if unpacked.pack_type_num == OFS_DELTA:
                assert unpacked.offset is not None
                assert unpacked.delta_base is not None
                assert isinstance(unpacked.delta_base, int)
                base_ofs = unpacked.offset - unpacked.delta_base
            elif unpacked.pack_type_num == REF_DELTA:
                # Base may live outside the pack; missing keys are handled
                # later by the ext-ref machinery.
                with suppress(KeyError):
                    assert isinstance(unpacked.delta_base, bytes)
                    # NOTE(review): object_offset is used above but
                    # object_index here — confirm both exist on the index
                    # (object_index may be a legacy alias).
                    base_ofs = pack.index.object_index(unpacked.delta_base)
            if base_ofs is not None and base_ofs not in done:
                todo.add(base_ofs)
        return walker

    def record(self, unpacked: UnpackedObject) -> None:
        """File an object under the chain root or base it depends on."""
        type_num = unpacked.pack_type_num
        offset = unpacked.offset
        assert offset is not None
        if type_num == OFS_DELTA:
            assert unpacked.delta_base is not None
            assert isinstance(unpacked.delta_base, int)
            # delta_base is a backwards distance, not an absolute offset.
            base_offset = offset - unpacked.delta_base
            self._pending_ofs[base_offset].append(offset)
        elif type_num == REF_DELTA:
            assert isinstance(unpacked.delta_base, bytes)
            self._pending_ref[unpacked.delta_base].append(offset)
        else:
            self._full_ofs.append((offset, type_num))

    def set_pack_data(self, pack_data: PackData) -> None:
        """Use pack_data's underlying file for subsequent object reads."""
        self._file = pack_data._file

    def _walk_all_chains(self):
        """Yield results for every recorded chain, roots first."""
        for offset, type_num in self._full_ofs:
            yield from self._follow_chain(offset, type_num, None)
        yield from self._walk_ref_chains()
        assert not self._pending_ofs, repr(self._pending_ofs)

    def _ensure_no_pending(self) -> None:
        """Raise UnresolvedDeltas if any ref-delta base was never found."""
        if self._pending_ref:
            raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])

    def _walk_ref_chains(self):
        """Resolve ref-delta chains via the external-reference callback."""
        if not self._resolve_ext_ref:
            self._ensure_no_pending()
            return

        for base_sha, pending in sorted(self._pending_ref.items()):
            # Entries may have been popped by _follow_chain while we iterate
            # over this sorted snapshot; skip those.
            if base_sha not in self._pending_ref:
                continue
            try:
                type_num, chunks = self._resolve_ext_ref(base_sha)
            except KeyError:
                # Not an external ref, but may depend on one. Either it will
                # get popped via a _follow_chain call, or we will raise an
                # error below.
                continue
            self._ext_refs.append(base_sha)
            self._pending_ref.pop(base_sha)
            for new_offset in pending:
                yield from self._follow_chain(new_offset, type_num, chunks)

        self._ensure_no_pending()

    def _result(self, unpacked: UnpackedObject) -> T:
        """Convert an UnpackedObject to this iterator's result type."""
        raise NotImplementedError

    def _resolve_object(
        self, offset: int, obj_type_num: int, base_chunks: Optional[list[bytes]]
    ) -> UnpackedObject:
        """Inflate the object at offset, applying base_chunks if a delta."""
        self._file.seek(offset)
        unpacked, _ = unpack_object(
            self._file.read,
            include_comp=self._include_comp,
            compute_crc32=self._compute_crc32,
        )
        unpacked.offset = offset
        if base_chunks is None:
            assert unpacked.pack_type_num == obj_type_num
        else:
            assert unpacked.pack_type_num in DELTA_TYPES
            unpacked.obj_type_num = obj_type_num
            unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
        return unpacked

    def _follow_chain(
        self, offset: int, obj_type_num: int, base_chunks: Optional[list[bytes]]
    ):
        """Yield results for the object at offset and everything based on it."""
        # Unlike PackData.get_object_at, there is no need to cache offsets as
        # this approach by design inflates each object exactly once.
        todo = [(offset, obj_type_num, base_chunks)]
        while todo:
            (offset, obj_type_num, base_chunks) = todo.pop()
            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
            yield self._result(unpacked)

            assert unpacked.offset is not None
            # Deltas waiting on this object (by offset or by SHA) can now
            # be resolved against its inflated chunks.
            unblocked = chain(
                self._pending_ofs.pop(unpacked.offset, []),
                self._pending_ref.pop(unpacked.sha(), []),
            )
            todo.extend(
                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)  # type: ignore
                for new_offset in unblocked
            )

    def __iter__(self) -> Iterator[T]:
        """Iterate over all results, inflating each object exactly once."""
        return self._walk_all_chains()

    def ext_refs(self):
        """Return the base SHAs that were resolved outside this pack."""
        return self._ext_refs
class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
    """Delta chain iterator that yields the UnpackedObject instances themselves."""

    def _result(self, unpacked):
        """Pass each resolved object through unchanged."""
        return unpacked
class PackIndexer(DeltaChainIterator[PackIndexEntry]):
    """Delta chain iterator that yields index entries."""

    # Index files store a CRC32 per entry, so force CRC computation on.
    _compute_crc32 = True

    def _result(self, unpacked):
        """Convert a resolved object into a (sha, offset, crc32) entry."""
        return (unpacked.sha(), unpacked.offset, unpacked.crc32)
class PackInflater(DeltaChainIterator[ShaFile]):
    """Delta chain iterator that yields fully inflated ShaFile objects."""

    def _result(self, unpacked):
        """Materialize each resolved object as a ShaFile."""
        return unpacked.sha_file()
class SHA1Reader(BinaryIO):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f) -> None:
        self.f = f
        self.sha1 = sha1(b"")

    def read(self, size: int = -1) -> bytes:
        """Read from the wrapped file, folding the bytes into the digest."""
        chunk = self.f.read(size)
        self.sha1.update(chunk)
        return chunk

    def check_sha(self, allow_empty: bool = False) -> None:
        """Compare the accumulated digest against the trailing stored digest.

        Args:
          allow_empty: Accept an all-zero stored hash (written when the git
            option index.skipHash is set).
        Raises:
          ChecksumMismatch: if the digests disagree.
        """
        stored = self.f.read(20)
        if stored == self.sha1.digest():
            return
        zero_hex = b"0000000000000000000000000000000000000000"
        if allow_empty and sha_to_hex(stored) == zero_hex:
            return
        raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        """Close the wrapped file."""
        return self.f.close()

    def tell(self) -> int:
        """Return the wrapped file's position."""
        return self.f.tell()

    # BinaryIO abstract methods
    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        return False

    def seekable(self) -> bool:
        probe = getattr(self.f, "seekable", None)
        return False if probe is None else probe()

    def seek(self, offset: int, whence: int = 0) -> int:
        return self.f.seek(offset, whence)

    def flush(self) -> None:
        if hasattr(self.f, "flush"):
            self.f.flush()

    def readline(self, size: int = -1) -> bytes:
        return self.f.readline(size)

    def readlines(self, hint: int = -1) -> list[bytes]:
        return self.f.readlines(hint)

    def writelines(self, lines) -> None:
        raise UnsupportedOperation("writelines")

    def write(self, data) -> int:
        raise UnsupportedOperation("write")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def fileno(self) -> int:
        return self.f.fileno()

    def isatty(self) -> bool:
        probe = getattr(self.f, "isatty", None)
        return False if probe is None else probe()

    def truncate(self, size: Optional[int] = None) -> int:
        raise UnsupportedOperation("truncate")
class SHA1Writer(BinaryIO):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f) -> None:
        self.f = f
        self.length = 0
        self.sha1 = sha1(b"")

    def write(self, data) -> int:
        """Write data through, updating the digest and running byte count."""
        self.sha1.update(data)
        self.f.write(data)
        written = len(data)
        self.length += written
        return written

    def write_sha(self):
        """Append the 20-byte SHA-1 trailer to the file and return it."""
        digest = self.sha1.digest()
        assert len(digest) == 20
        self.f.write(digest)
        self.length += len(digest)
        return digest

    def close(self):
        """Write the SHA-1 trailer, close the file, and return the digest."""
        digest = self.write_sha()
        self.f.close()
        return digest

    def offset(self):
        """Return the number of bytes written, including any trailer."""
        return self.length

    def tell(self) -> int:
        """Return the wrapped file's position."""
        return self.f.tell()

    # BinaryIO abstract methods
    def readable(self) -> bool:
        return False

    def writable(self) -> bool:
        return True

    def seekable(self) -> bool:
        probe = getattr(self.f, "seekable", None)
        return False if probe is None else probe()

    def seek(self, offset: int, whence: int = 0) -> int:
        return self.f.seek(offset, whence)

    def flush(self) -> None:
        if hasattr(self.f, "flush"):
            self.f.flush()

    def readline(self, size: int = -1) -> bytes:
        raise UnsupportedOperation("readline")

    def readlines(self, hint: int = -1) -> list[bytes]:
        raise UnsupportedOperation("readlines")

    def writelines(self, lines) -> None:
        for chunk in lines:
            self.write(chunk)

    def read(self, size: int = -1) -> bytes:
        raise UnsupportedOperation("read")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        raise UnsupportedOperation("__next__")

    def fileno(self) -> int:
        return self.f.fileno()

    def isatty(self) -> bool:
        probe = getattr(self.f, "isatty", None)
        return False if probe is None else probe()

    def truncate(self, size: Optional[int] = None) -> int:
        raise UnsupportedOperation("truncate")
def pack_object_header(type_num, delta_base, size):
    """Create a pack object header for the given object info.

    Args:
      type_num: Numeric type of the object.
      delta_base: Delta base offset or ref, or None for whole objects.
      size: Uncompressed object size.
    Returns: A header for a packed object.
    """
    header = []
    # Size/type varint: first byte holds the type in bits 4-6 and the low 4
    # bits of the size; each continuation byte carries 7 more size bits, with
    # the MSB set while more bytes follow.
    c = (type_num << 4) | (size & 15)
    size >>= 4
    while size:
        header.append(c | 0x80)
        c = size & 0x7F
        size >>= 7
    header.append(c)
    if type_num == OFS_DELTA:
        # Offset-delta base: big-endian base-128, built least-significant
        # group first and prepended, with git's "-1 per continuation byte"
        # twist that rules out redundant encodings.
        ret = [delta_base & 0x7F]
        delta_base >>= 7
        while delta_base:
            delta_base -= 1
            ret.insert(0, 0x80 | (delta_base & 0x7F))
            delta_base >>= 7
        header.extend(ret)
    elif type_num == REF_DELTA:
        # Ref-delta base: the raw 20-byte binary SHA of the base object.
        assert len(delta_base) == 20
        header += delta_base
    return bytearray(header)
def pack_object_chunks(type, object, compression_level=-1):
    """Generate chunks for a pack object.

    Args:
      type: Numeric type of the object
      object: Object to write
      compression_level: the zlib compression level
    Returns: Chunks
    """
    delta_base = None
    if type in DELTA_TYPES:
        delta_base, object = object
    payload = [object] if isinstance(object, bytes) else object
    total_size = sum(len(chunk) for chunk in payload)
    yield bytes(pack_object_header(type, delta_base, total_size))
    compressor = zlib.compressobj(level=compression_level)
    for chunk in payload:
        yield compressor.compress(chunk)
    yield compressor.flush()
def write_pack_object(write, type, object, sha=None, compression_level=-1):
    """Write pack object to a file.

    Args:
      write: Write function to use
      type: Numeric type of the object
      object: Object to write
      sha: Optional hash object to update with every written chunk
      compression_level: the zlib compression level
    Returns: CRC32 checksum of the written object data (masked to 32 bits)
    """
    crc32 = 0
    for chunk in pack_object_chunks(type, object, compression_level=compression_level):
        write(chunk)
        if sha is not None:
            sha.update(chunk)
        crc32 = binascii.crc32(chunk, crc32)
    return crc32 & 0xFFFFFFFF
def write_pack(
    filename,
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    deltify: Optional[bool] = None,
    delta_window_size: Optional[int] = None,
    compression_level: int = -1,
):
    """Write a new pack data file.

    Args:
      filename: Path to the new pack file (without .pack extension)
      objects: Objects (or (object, path) tuples) to write
      delta_window_size: Delta window size
      deltify: Whether to deltify pack objects
      compression_level: the zlib compression level
    Returns: Tuple with checksum of pack file and index file
    """
    with GitFile(filename + ".pack", "wb") as pack_f:
        entries, data_sum = write_pack_objects(
            pack_f.write,
            objects,
            delta_window_size=delta_window_size,
            deltify=deltify,
            compression_level=compression_level,
        )
    # Flatten {sha: (offset, crc32)} into sorted (sha, offset, crc32) rows.
    index_entries = sorted(
        (sha, offset, crc32) for (sha, (offset, crc32)) in entries.items()
    )
    with GitFile(filename + ".idx", "wb") as idx_f:
        return data_sum, write_pack_index(idx_f, index_entries, data_sum)
def pack_header_chunks(num_objects):
    """Yield chunks for a pack header."""
    # Magic signature, then two big-endian 32-bit fields:
    # pack format version (2) and the number of objects.
    for chunk in (b"PACK", struct.pack(b">L", 2), struct.pack(b">L", num_objects)):
        yield chunk
def write_pack_header(write, num_objects) -> None:
    """Write a pack header for the given number of objects.

    Args:
      write: Write callable (a file object is accepted but deprecated).
      num_objects: Number of objects in the pack.
    """
    if hasattr(write, "write"):
        warnings.warn(
            "write_pack_header() now takes a write rather than file argument",
            DeprecationWarning,
            stacklevel=2,
        )
        write = write.write
    for piece in pack_header_chunks(num_objects):
        write(piece)
def find_reusable_deltas(
    container: PackedObjectContainer,
    object_ids: set[bytes],
    *,
    other_haves: Optional[set[bytes]] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Yield existing ref-deltas whose base the receiver will also have.

    Args:
      container: Container to scan for already-deltified objects.
      object_ids: Hex SHAs of the objects being packed.
      other_haves: Additional hex SHAs the receiver already has.
      progress: Optional progress callback taking a bytes message.
    Returns: Iterator over reusable UnpackedObject deltas.
    """
    if other_haves is None:
        other_haves = set()
    reused = 0
    for i, unpacked in enumerate(
        container.iter_unpacked_subset(
            object_ids, allow_missing=True, convert_ofs_delta=True
        )
    ):
        if progress is not None and i % 1000 == 0:
            progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
        if unpacked.pack_type_num == REF_DELTA:
            # A delta is only reusable if its base will be available on the
            # receiving side: either part of this pack or already present.
            hexsha = sha_to_hex(unpacked.delta_base)  # type: ignore
            if hexsha in object_ids or hexsha in other_haves:
                yield unpacked
                reused += 1
    if progress is not None:
        progress((f"found {reused} deltas to reuse\n").encode())
def deltify_pack_objects(
    objects: Union[Iterator[bytes], Iterator[tuple[ShaFile, Optional[bytes]]]],
    *,
    window_size: Optional[int] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Generate deltas for pack objects.

    Args:
      objects: An iterable of ShaFile objects or (object, path) tuples.
      window_size: Window size; None for default
    Returns: Iterator over UnpackedObject (delta_base is None for full
      text entries)
    """

    def _with_hints():
        # Normalize both accepted shapes to (object, (type_num, path)).
        for entry in objects:
            if isinstance(entry, ShaFile):
                yield (entry, (entry.type_num, None))
            else:
                obj = entry[0]
                yield (obj, (obj.type_num, entry[1]))

    yield from deltas_from_sorted_objects(
        sort_objects_for_delta(_with_hints()),
        window_size=window_size,
        progress=progress,
    )
def sort_objects_for_delta(
    objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
) -> Iterator[ShaFile]:
    """Sort objects to bring likely delta candidates close together.

    Args:
      objects: Iterable of ShaFile objects or (object, hint) tuples, where
        hint is an optional (type_num, path) pair.
    Returns: Iterator over the objects, sorted by (type, path, -size).
    """
    magic = []
    for entry in objects:
        if isinstance(entry, tuple):
            obj, hint = entry
            if hint is None:
                type_num = None
                path = None
            else:
                (type_num, path) = hint
        else:
            obj = entry
            # Bug fix: previously type_num/path were left unassigned here,
            # raising NameError for a leading bare ShaFile or silently
            # reusing the previous entry's hint. Treat it like hint=None.
            type_num = None
            path = None
        magic.append((type_num, path, -obj.raw_length(), obj))
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    magic.sort()
    return (x[3] for x in magic)
def deltas_from_sorted_objects(
    objects, window_size: Optional[int] = None, progress=None
):
    """Deltify a stream of objects that is already delta-sorted.

    Args:
      objects: Iterable of objects, ordered so that good delta bases are
        near each other (see sort_objects_for_delta).
      window_size: How many recent objects to try as delta bases; None for
        the default.
      progress: Optional progress callback taking a bytes message.
    Returns: Iterator over UnpackedObject, deltified where it saves space.
    """
    # TODO(jelmer): Use threads
    if window_size is None:
        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE

    # Sliding window of candidate bases, most recent first:
    # (sha digest, type_num, raw chunks).
    possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
    for i, o in enumerate(objects):
        if progress is not None and i % 1000 == 0:
            progress((f"generating deltas: {i}\r").encode())
        raw = o.as_raw_chunks()
        # The full text is the baseline: a delta is only kept if smaller.
        winner = raw
        winner_len = sum(map(len, winner))
        winner_base = None
        for base_id, base_type_num, base in possible_bases:
            # Deltas are only made between objects of the same type.
            if base_type_num != o.type_num:
                continue
            delta_len = 0
            delta = []
            for chunk in create_delta(base, raw):
                delta_len += len(chunk)
                if delta_len >= winner_len:
                    # Abort: already no smaller than the current winner.
                    break
                delta.append(chunk)
            else:
                # for/else: only reached when the delta was fully built,
                # i.e. it is strictly smaller than the previous winner.
                winner_base = base_id
                winner = delta
                winner_len = sum(map(len, winner))
        yield UnpackedObject(
            o.type_num,
            sha=o.sha().digest(),
            delta_base=winner_base,
            decomp_len=winner_len,
            decomp_chunks=winner,
        )
        # This object becomes a candidate base; trim the window.
        possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
        while len(possible_bases) > window_size:
            possible_bases.pop()
def pack_objects_to_data(
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    deltify: Optional[bool] = None,
    delta_window_size: Optional[int] = None,
    ofs_delta: bool = True,
    progress=None,
) -> tuple[int, Iterator[UnpackedObject]]:
    """Create pack data from objects.

    Args:
      objects: Pack objects (or (object, path) tuples)
      deltify: Whether to deltify; None selects the default (disabled)
      delta_window_size: Delta window size; None for default
    Returns: Tuple of object count and iterator over UnpackedObject
    """
    count = len(objects)
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if not deltify:

        def _plain_objects():
            for entry in objects:
                obj = entry[0] if isinstance(entry, tuple) else entry
                yield full_unpacked_object(obj)

        return (count, _plain_objects())
    return (
        count,
        deltify_pack_objects(
            iter(objects),  # type: ignore
            window_size=delta_window_size,
            progress=progress,
        ),
    )
def generate_unpacked_objects(
    container: PackedObjectContainer,
    object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    reuse_deltas: bool = True,
    ofs_delta: bool = True,
    other_haves: Optional[set[bytes]] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Create pack data from objects.

    Args:
      container: Container to read objects from.
      object_ids: Sequence of (object id, pack hint) tuples to include.
      delta_window_size: Delta window size; None for the default.
      deltify: Whether to deltify objects not covered by reused deltas;
        None selects the default (currently disabled for speed).
      reuse_deltas: Whether to reuse deltas already present in container.
      other_haves: Extra SHAs the receiver has, usable as delta bases.
      progress: Optional progress callback taking a bytes message.
    Returns: Iterator over UnpackedObject.
    """
    # Remaining ids to emit; entries are removed as reused deltas go out.
    todo = dict(object_ids)
    if reuse_deltas:
        for unpack in find_reusable_deltas(
            container, set(todo), other_haves=other_haves, progress=progress
        ):
            del todo[sha_to_hex(unpack.sha())]
            yield unpack
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if deltify:
        objects_to_delta = container.iterobjects_subset(
            todo.keys(), allow_missing=False
        )
        yield from deltas_from_sorted_objects(
            sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
            window_size=delta_window_size,
            progress=progress,
        )
    else:
        # Emit the remainder as full (non-delta) objects.
        for oid in todo:
            yield full_unpacked_object(container[oid])
def full_unpacked_object(o: ShaFile) -> UnpackedObject:
    """Wrap a ShaFile as a non-delta UnpackedObject."""
    chunks = o.as_raw_chunks()
    return UnpackedObject(
        o.type_num,
        delta_base=None,
        crc32=None,
        decomp_chunks=chunks,
        sha=o.sha().digest(),
    )
def write_pack_from_container(
    write,
    container: PackedObjectContainer,
    object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    reuse_deltas: bool = True,
    compression_level: int = -1,
    other_haves: Optional[set[bytes]] = None,
):
    """Write a new pack data file.

    Args:
      write: write function to use
      container: PackedObjectContainer
      object_ids: Sequence of (object id, pack hint) tuples to include
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      reuse_deltas: Whether to reuse existing deltas
      compression_level: the zlib compression level to use
      other_haves: Extra SHAs the receiver already has
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    record_count = len(object_ids)
    records = generate_unpacked_objects(
        container,
        object_ids,
        delta_window_size=delta_window_size,
        deltify=deltify,
        reuse_deltas=reuse_deltas,
        other_haves=other_haves,
    )
    return write_pack_data(
        write,
        records,
        num_records=record_count,
        compression_level=compression_level,
    )
def write_pack_objects(
    write,
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    compression_level: int = -1,
):
    """Write a new pack data file.

    Args:
      write: write function to use
      objects: Sequence of (object, path) tuples to write
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      compression_level: the zlib compression level to use
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Bug fix: delta_window_size was previously accepted but never passed
    # on, so callers (e.g. write_pack) silently got the default window.
    pack_contents_count, pack_contents = pack_objects_to_data(
        objects, deltify=deltify, delta_window_size=delta_window_size
    )

    return write_pack_data(
        write,
        pack_contents,
        num_records=pack_contents_count,
        compression_level=compression_level,
    )
class PackChunkGenerator:
    """Generate the byte chunks of a pack data file.

    Iterating yields the pack header, each object's encoded data and
    finally the running SHA-1 digest as the pack trailer.  As objects are
    emitted, ``entries`` accumulates sha -> (offset, crc32).
    """

    def __init__(
        self,
        num_records=None,
        records=None,
        progress=None,
        compression_level=-1,
        reuse_compressed=True,
    ) -> None:
        # Running SHA-1 over every chunk emitted so far (pack trailer).
        self.cs = sha1(b"")
        # sha -> (offset in pack, crc32 of the object's encoded bytes).
        self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
        self._it = self._pack_data_chunks(
            num_records=num_records,
            records=records,
            progress=progress,
            compression_level=compression_level,
            reuse_compressed=reuse_compressed,
        )

    def sha1digest(self):
        """Return the SHA-1 digest of everything yielded so far."""
        return self.cs.digest()

    def __iter__(self):
        return self._it

    def _pack_data_chunks(
        self,
        records: Iterator[UnpackedObject],
        *,
        num_records=None,
        progress=None,
        compression_level: int = -1,
        reuse_compressed: bool = True,
    ) -> Iterator[bytes]:
        """Iterate pack data file chunks.

        Args:
          records: Iterator over UnpackedObject
          num_records: Number of records (defaults to len(records) if not specified)
          progress: Function to report progress to
          compression_level: the zlib compression level
          reuse_compressed: Whether to emit already-compressed chunks as-is
        Returns: Iterator over bytes chunks; the final chunk is the pack's
          SHA-1 trailer.
        """
        # Write the pack
        if num_records is None:
            num_records = len(records)  # type: ignore
        # offset tracks the number of bytes emitted so far; it becomes each
        # object's position in the pack (needed for OFS_DELTA encoding).
        offset = 0
        for chunk in pack_header_chunks(num_records):
            yield chunk
            self.cs.update(chunk)
            offset += len(chunk)
        actual_num_records = 0
        for i, unpacked in enumerate(records):
            type_num = unpacked.pack_type_num
            if progress is not None and i % 1000 == 0:
                progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
            raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
            if unpacked.delta_base is not None:
                try:
                    base_offset, base_crc32 = self.entries[unpacked.delta_base]
                except KeyError:
                    # Base not written yet (or external): fall back to a
                    # ref delta addressed by SHA.
                    type_num = REF_DELTA
                    assert isinstance(unpacked.delta_base, bytes)
                    raw = (unpacked.delta_base, unpacked.decomp_chunks)
                else:
                    # Base already in this pack: encode as offset delta.
                    type_num = OFS_DELTA
                    raw = (offset - base_offset, unpacked.decomp_chunks)
            else:
                raw = unpacked.decomp_chunks
            if unpacked.comp_chunks is not None and reuse_compressed:
                # Reuse the existing compressed representation verbatim.
                chunks = unpacked.comp_chunks
            else:
                chunks = pack_object_chunks(
                    type_num, raw, compression_level=compression_level
                )
            crc32 = 0
            object_size = 0
            for chunk in chunks:
                yield chunk
                crc32 = binascii.crc32(chunk, crc32)
                self.cs.update(chunk)
                object_size += len(chunk)
            actual_num_records += 1
            self.entries[unpacked.sha()] = (offset, crc32)
            offset += object_size
        if actual_num_records != num_records:
            raise AssertionError(
                f"actual records written differs: {actual_num_records} != {num_records}"
            )

        yield self.cs.digest()
def write_pack_data(
    write,
    records: Iterator[UnpackedObject],
    *,
    num_records=None,
    progress=None,
    compression_level=-1,
):
    """Write a new pack data file.

    Args:
      write: Write function to use
      records: Iterator over UnpackedObject
      num_records: Number of records (defaults to len(records) if None)
      progress: Function to report progress to
      compression_level: the zlib compression level
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    generator = PackChunkGenerator(
        num_records=num_records,
        records=records,
        progress=progress,
        compression_level=compression_level,
    )
    for piece in generator:
        write(piece)
    return generator.entries, generator.sha1digest()
def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    Args:
      f: A file-like object to write to
      entries: Iterable of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the written index file
    """
    # Materialize first: the entries are walked twice below, so a generator
    # argument would previously have produced a silently corrupt index
    # (empty second pass). write_pack_index_v3 already guards this way.
    entries = list(entries)
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for name, _offset, _entry_checksum in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]
    for name, offset, _entry_checksum in entries:
        if not (offset <= 0xFFFFFFFF):
            raise TypeError("pack format 1 only supports offsets < 2Gb")
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
2570def _delta_encode_size(size) -> bytes:
2571 ret = bytearray()
2572 c = size & 0x7F
2573 size >>= 7
2574 while size:
2575 ret.append(c | 0x80)
2576 c = size & 0x7F
2577 size >>= 7
2578 ret.append(c)
2579 return bytes(ret)
# The length of delta compression copy operations in version 2 packs is limited
# to 64K. To copy more, we use several copy operations. Version 3 packs allow
# 24-bit lengths in copy operations, but we always make version 2 packs.
_MAX_COPY_LEN = 0xFFFF  # largest length one v2 copy opcode can express
2588def _encode_copy_operation(start, length):
2589 scratch = bytearray([0x80])
2590 for i in range(4):
2591 if start & 0xFF << i * 8:
2592 scratch.append((start >> i * 8) & 0xFF)
2593 scratch[0] |= 1 << i
2594 for i in range(2):
2595 if length & 0xFF << i * 8:
2596 scratch.append((length >> i * 8) & 0xFF)
2597 scratch[0] |= 1 << (4 + i)
2598 return bytes(scratch)
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    Args:
      base_buf: Base buffer
      target_buf: Target buffer
    Returns: Iterator over delta chunks: the two size headers followed by
      copy/insert opcodes.
    """
    if isinstance(base_buf, list):
        base_buf = b"".join(base_buf)
    if isinstance(target_buf, list):
        target_buf = b"".join(target_buf)
    assert isinstance(base_buf, bytes)
    assert isinstance(target_buf, bytes)
    # write delta header
    yield _delta_encode_size(len(base_buf))
    yield _delta_encode_size(len(target_buf))
    # write out delta opcodes
    seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        # if opcode == 'replace' or opcode == 'delete':
        #     pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            copy_start = i1
            copy_len = i2 - i1
            while copy_len > 0:
                # v2 copy opcodes are capped at _MAX_COPY_LEN bytes, so
                # longer ranges are split into several operations.
                to_copy = min(copy_len, _MAX_COPY_LEN)
                yield _encode_copy_operation(copy_start, to_copy)
                copy_start += to_copy
                copy_len -= to_copy
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                # Insert opcodes can carry at most 127 literal bytes each.
                yield bytes([127])
                yield memoryview(target_buf)[o : o + 127]
                s -= 127
                o += 127
            yield bytes([s])
            yield memoryview(target_buf)[o : o + s]
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    Args:
      src_buf: Source buffer
      delta: Delta instructions
    Returns: List of byte chunks making up the target buffer.
    Raises:
      ApplyDeltaError: if the delta is malformed or does not match src_buf.
    """
    if not isinstance(src_buf, bytes):
        src_buf = b"".join(src_buf)
    if not isinstance(delta, bytes):
        delta = b"".join(delta)
    out = []
    index = 0
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        # Decode a little-endian base-128 size: 7 bits per byte, high bit
        # set on every byte except the last.
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index : index + 1])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index

    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    if src_size != len(src_buf):
        raise ApplyDeltaError(
            f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
        )
    while index < delta_length:
        cmd = ord(delta[index : index + 1])
        index += 1
        if cmd & 0x80:
            # Copy opcode: bits 0-3 flag offset bytes, bits 4-6 size bytes.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index : index + 1])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            # Version 3 packs can contain copy sizes larger than 64K.
            for i in range(3):
                if cmd & (1 << (4 + i)):
                    x = ord(delta[index : index + 1])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                # A zero size encodes the default copy length of 64K.
                cp_size = 0x10000
            if (
                cp_off + cp_size < cp_size
                or cp_off + cp_size > src_size
                or cp_size > dest_size
            ):
                # Out-of-range copy: stop processing, mirroring the
                # behaviour of git's patch-delta.c.
                break
            out.append(src_buf[cp_off : cp_off + cp_size])
        elif cmd != 0:
            # Insert opcode: the next `cmd` bytes are literal target data.
            out.append(delta[index : index + cmd])
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")

    if dest_size != chunks_length(out):
        raise ApplyDeltaError("dest size incorrect")

    return out
def write_pack_index_v2(
    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
) -> bytes:
    """Write a new pack index file.

    Args:
      f: File-like object to write to
      entries: Iterable of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the index file written
    """
    # Materialize first: the entries are walked four times below, so a
    # generator argument (permitted by the Iterable annotation) would
    # previously have produced a silently corrupt index. This matches the
    # guard already present in write_pack_index_v3.
    entries = list(entries)
    f = SHA1Writer(f)
    f.write(b"\377tOc")  # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
    for name, offset, entry_checksum in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    largetable: list[int] = []
    for i in range(0x100):
        f.write(struct.pack(b">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]
    for name, offset, entry_checksum in entries:
        f.write(name)
    for name, offset, entry_checksum in entries:
        f.write(struct.pack(b">L", entry_checksum))
    for name, offset, entry_checksum in entries:
        if offset < 2**31:
            f.write(struct.pack(b">L", offset))
        else:
            # Large offsets are spilled into the 64-bit table at the end.
            f.write(struct.pack(b">L", 2**31 + len(largetable)))
            largetable.append(offset)
    for offset in largetable:
        f.write(struct.pack(b">Q", offset))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
def write_pack_index_v3(
    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
) -> bytes:
    """Write a new pack index file in v3 format.

    Args:
      f: File-like object to write to
      entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
      pack_checksum: Checksum of the pack file.
      hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
    Returns: The SHA of the index file written
    Raises:
      ValueError: on an unknown hash algorithm or wrong-length object name.
      NotImplementedError: for SHA-256, which is not supported yet.
    """
    if hash_algorithm == 1:
        hash_size = 20  # SHA-1
        writer_cls = SHA1Writer
    elif hash_algorithm == 2:
        hash_size = 32  # SHA-256
        # TODO: Add SHA256Writer when SHA-256 support is implemented
        raise NotImplementedError("SHA-256 support not yet implemented")
    else:
        raise ValueError(f"Unknown hash algorithm {hash_algorithm}")

    # Convert entries to list to allow multiple iterations
    entries_list = list(entries)

    # Calculate shortest unambiguous prefix length for object names
    # For now, use full hash size (this could be optimized)
    shortened_oid_len = hash_size

    f = writer_cls(f)
    f.write(b"\377tOc")  # Magic!
    f.write(struct.pack(">L", 3))  # Version 3
    f.write(struct.pack(">L", hash_algorithm))  # Hash algorithm
    f.write(struct.pack(">L", shortened_oid_len))  # Shortened OID length

    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
    for name, offset, entry_checksum in entries_list:
        if len(name) != hash_size:
            raise ValueError(
                f"Object name has wrong length: expected {hash_size}, got {len(name)}"
            )
        fan_out_table[ord(name[:1])] += 1

    # Fan-out table
    largetable: list[int] = []
    for i in range(0x100):
        f.write(struct.pack(b">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]

    # Object names table
    for name, offset, entry_checksum in entries_list:
        f.write(name)

    # CRC32 checksums table
    for name, offset, entry_checksum in entries_list:
        f.write(struct.pack(b">L", entry_checksum))

    # Offset table
    for name, offset, entry_checksum in entries_list:
        if offset < 2**31:
            f.write(struct.pack(b">L", offset))
        else:
            # Offsets >= 2^31 are redirected into the 64-bit table below.
            f.write(struct.pack(b">L", 2**31 + len(largetable)))
            largetable.append(offset)

    # Large offset table
    for offset in largetable:
        f.write(struct.pack(b">Q", offset))

    assert len(pack_checksum) == hash_size, (
        f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
    )
    f.write(pack_checksum)
    return f.write_sha()
def write_pack_index(
    index_filename, entries, pack_checksum, progress=None, version=None
):
    """Write a pack index file.

    Args:
      index_filename: Index filename.
      entries: List of (checksum, offset, crc32) tuples
      pack_checksum: Checksum of the pack file.
      progress: Progress function (not currently used)
      version: Pack index version to use (1, 2, or 3). If None, defaults to
        DEFAULT_PACK_INDEX_VERSION.

    Returns:
      SHA of the written index file
    """
    if version is None:
        version = DEFAULT_PACK_INDEX_VERSION
    writers = {
        1: write_pack_index_v1,
        2: write_pack_index_v2,
        3: write_pack_index_v3,
    }
    writer = writers.get(version)
    if writer is None:
        raise ValueError(f"Unsupported pack index version: {version}")
    return writer(index_filename, entries, pack_checksum)
class Pack:
    """A Git pack object."""

    # Callables that lazily load the data/index halves; None when the
    # corresponding object was supplied eagerly (see from_objects).
    _data_load: Optional[Callable[[], PackData]]
    _idx_load: Optional[Callable[[], PackIndex]]

    # Cached PackData / PackIndex instances, populated on first access.
    _data: Optional[PackData]
    _idx: Optional[PackIndex]
    def __init__(
        self,
        basename,
        resolve_ext_ref: Optional[ResolveExtRefFn] = None,
        *,
        delta_window_size=None,
        window_memory=None,
        delta_cache_size=None,
        depth=None,
        threads=None,
        big_file_threshold=None,
    ) -> None:
        """Initialize a Pack; data and index files are loaded lazily.

        Args:
          basename: Path prefix; data lives at basename + ".pack" and the
            index at basename + ".idx".
          resolve_ext_ref: Optional callable for resolving delta bases not
            present in this pack.
          delta_window_size: Passed through to PackData.
          window_memory: Passed through to PackData.
          delta_cache_size: Passed through to PackData.
          depth: Passed through to PackData.
          threads: Passed through to PackData.
          big_file_threshold: Passed through to PackData.
        """
        self._basename = basename
        self._data = None
        self._idx = None
        self._idx_path = self._basename + ".idx"
        self._data_path = self._basename + ".pack"
        self.delta_window_size = delta_window_size
        self.window_memory = window_memory
        self.delta_cache_size = delta_cache_size
        self.depth = depth
        self.threads = threads
        self.big_file_threshold = big_file_threshold
        # Invoked on first access of .data / .index respectively.
        self._data_load = lambda: PackData(
            self._data_path,
            delta_window_size=delta_window_size,
            window_memory=window_memory,
            delta_cache_size=delta_cache_size,
            depth=depth,
            threads=threads,
            big_file_threshold=big_file_threshold,
        )
        self._idx_load = lambda: load_pack_index(self._idx_path)
        self.resolve_ext_ref = resolve_ext_ref
2910 @classmethod
2911 def from_lazy_objects(cls, data_fn, idx_fn):
2912 """Create a new pack object from callables to load pack data and
2913 index objects.
2914 """
2915 ret = cls("")
2916 ret._data_load = data_fn
2917 ret._idx_load = idx_fn
2918 return ret
2920 @classmethod
2921 def from_objects(cls, data, idx):
2922 """Create a new pack object from pack data and index objects."""
2923 ret = cls("")
2924 ret._data = data
2925 ret._data_load = None
2926 ret._idx = idx
2927 ret._idx_load = None
2928 ret.check_length_and_checksum()
2929 return ret
    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()
    @property
    def data(self) -> PackData:
        """The pack data object being used."""
        if self._data is None:
            assert self._data_load
            self._data = self._data_load()
            # With both halves available, cross-check them immediately.
            self.check_length_and_checksum()
        return self._data
    @property
    def index(self) -> PackIndex:
        """The index being used.

        Note: This may be an in-memory index
        """
        if self._idx is None:
            assert self._idx_load
            self._idx = self._idx_load()
        return self._idx
2955 def close(self) -> None:
2956 if self._data is not None:
2957 self._data.close()
2958 if self._idx is not None:
2959 self._idx.close()
    def __enter__(self):
        """Enter the context manager; returns self."""
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the pack on context exit; exceptions propagate."""
        self.close()
    def __eq__(self, other):
        """Packs are equal when their indexes are equal.

        NOTE(review): compares ``isinstance(self, type(other))``, so a
        subclass instance can equal a base-class instance but not vice
        versa — presumably intentional; confirm before relying on symmetry.
        """
        return isinstance(self, type(other)) and self.index == other.index
    def __len__(self) -> int:
        """Number of entries in this pack."""
        return len(self.index)
    def __repr__(self) -> str:
        """Debug representation showing the pack's basename."""
        return f"{self.__class__.__name__}({self._basename!r})"
    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)
    def check_length_and_checksum(self) -> None:
        """Sanity check the length and checksum of the pack index and data.

        Raises:
          AssertionError: if index and data disagree on the object count.
          ChecksumMismatch: if the pack checksum stored in the index does
            not match the checksum stored in the data file.
        """
        assert len(self.index) == len(self.data), (
            f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
        )
        idx_stored_checksum = self.index.get_pack_checksum()
        data_stored_checksum = self.data.get_stored_checksum()
        # Some index formats may not carry a pack checksum; only compare
        # when one is present.
        if (
            idx_stored_checksum is not None
            and idx_stored_checksum != data_stored_checksum
        ):
            raise ChecksumMismatch(
                sha_to_hex(idx_stored_checksum),
                sha_to_hex(data_stored_checksum),
            )
    def check(self) -> None:
        """Check the integrity of this pack.

        Checks the index, the data, and then every object in the pack.

        Raises:
          ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks
    def get_stored_checksum(self) -> bytes:
        """Return the checksum stored in the pack data file."""
        return self.data.get_stored_checksum()
    def pack_tuples(self):
        """Return a list of (object, None) tuples for all objects in the pack."""
        return [(o, None) for o in self.iterobjects()]
3015 def __contains__(self, sha1: bytes) -> bool:
3016 """Check whether this pack contains a particular SHA1."""
3017 try:
3018 self.index.object_offset(sha1)
3019 return True
3020 except KeyError:
3021 return False
3023 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
3024 offset = self.index.object_offset(sha1)
3025 obj_type, obj = self.data.get_object_at(offset)
3026 type_num, chunks = self.resolve_object(offset, obj_type, obj)
3027 return type_num, b"".join(chunks)
3029 def __getitem__(self, sha1: bytes) -> ShaFile:
3030 """Retrieve the specified SHA1."""
3031 type, uncomp = self.get_raw(sha1)
3032 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
    def iterobjects(self) -> Iterator[ShaFile]:
        """Iterate over the objects in this pack."""
        return iter(
            PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
        )
3040 def iterobjects_subset(
3041 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
3042 ) -> Iterator[ShaFile]:
3043 return (
3044 uo
3045 for uo in PackInflater.for_pack_subset(
3046 self,
3047 shas,
3048 allow_missing=allow_missing,
3049 resolve_ext_ref=self.resolve_ext_ref,
3050 )
3051 if uo.id in shas
3052 )
    def iter_unpacked_subset(
        self,
        shas: Iterable[ObjectID],
        *,
        include_comp: bool = False,
        allow_missing: bool = False,
        convert_ofs_delta: bool = False,
    ) -> Iterator[UnpackedObject]:
        """Yield UnpackedObjects for the requested SHAs.

        Offset deltas are rewritten as ref deltas; an OFS_DELTA whose base
        has not been scanned yet is parked until its base's SHA is known.

        Args:
          shas: Hex SHAs to yield objects for.
          include_comp: Whether to include compressed data.
          allow_missing: If False, raise UnresolvedDeltas for SHAs that
            were not found in the pack.
          convert_ofs_delta: Accepted for interface compatibility; offset
            deltas are converted to ref deltas here regardless.
        """
        # base offset -> delta children waiting for that base's SHA.
        ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
        # pack offset -> binary SHA of the object at that offset.
        ofs: dict[bytes, int] = {}
        todo = set(shas)
        for unpacked in self.iter_unpacked(include_comp=include_comp):
            sha = unpacked.sha()
            ofs[unpacked.offset] = sha
            hexsha = sha_to_hex(sha)
            if hexsha in todo:
                if unpacked.pack_type_num == OFS_DELTA:
                    assert isinstance(unpacked.delta_base, int)
                    base_offset = unpacked.offset - unpacked.delta_base
                    try:
                        unpacked.delta_base = ofs[base_offset]
                    except KeyError:
                        # Base not seen yet; defer until it comes up.
                        ofs_pending[base_offset].append(unpacked)
                        continue
                    else:
                        unpacked.pack_type_num = REF_DELTA
                yield unpacked
                todo.remove(hexsha)
            # Flush any children that were waiting on this object's SHA.
            for child in ofs_pending.pop(unpacked.offset, []):
                child.pack_type_num = REF_DELTA
                child.delta_base = sha
                yield child
        assert not ofs_pending
        if not allow_missing and todo:
            raise UnresolvedDeltas(list(todo))
3090 def iter_unpacked(self, include_comp=False):
3091 ofs_to_entries = {
3092 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
3093 }
3094 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
3095 (sha, crc32) = ofs_to_entries[unpacked.offset]
3096 unpacked._sha = sha
3097 unpacked.crc32 = crc32
3098 yield unpacked
3100 def keep(self, msg: Optional[bytes] = None) -> str:
3101 """Add a .keep file for the pack, preventing git from garbage collecting it.
3103 Args:
3104 msg: A message written inside the .keep file; can be used later
3105 to determine whether or not a .keep file is obsolete.
3106 Returns: The path of the .keep file, as a string.
3107 """
3108 keepfile_name = f"{self._basename}.keep"
3109 with GitFile(keepfile_name, "wb") as keepfile:
3110 if msg:
3111 keepfile.write(msg)
3112 keepfile.write(b"\n")
3113 return keepfile_name
3115 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
3116 """Get the object for a ref SHA, only looking in this pack."""
3117 # TODO: cache these results
3118 try:
3119 offset = self.index.object_offset(sha)
3120 except KeyError:
3121 offset = None
3122 if offset:
3123 type, obj = self.data.get_object_at(offset)
3124 elif self.resolve_ext_ref:
3125 type, obj = self.resolve_ext_ref(sha)
3126 else:
3127 raise KeyError(sha)
3128 return offset, type, obj
    def resolve_object(
        self, offset: int, type: int, obj, get_ref=None
    ) -> tuple[int, Iterable[bytes]]:
        """Resolve an object, possibly resolving deltas when necessary.

        Args:
          offset: Offset of the object in this pack.
          type: Pack type number of the object at ``offset``.
          obj: Raw object data: for deltas, a (base, delta) pair; otherwise
            the object's chunks.
          get_ref: Optional callable to resolve REF_DELTA bases; defaults to
            ``self.get_ref``.
        Returns: Tuple with object type and contents.
        Raises:
          UnresolvedDeltas: If a REF_DELTA base resolves back to the same
            offset (the object would be based on itself).
        """
        # Walk down the delta chain, building a stack of deltas to reach
        # the requested object.
        base_offset = offset
        base_type = type
        base_obj = obj
        delta_stack = []
        while base_type in DELTA_TYPES:
            prev_offset = base_offset
            if get_ref is None:
                get_ref = self.get_ref
            if base_type == OFS_DELTA:
                # OFS_DELTA stores a backward distance to its base; the base
                # is looked up directly in this pack's data.
                (delta_offset, delta) = base_obj
                # TODO: clean up asserts and replace with nicer error messages
                base_offset = base_offset - delta_offset
                base_type, base_obj = self.data.get_object_at(base_offset)
                assert isinstance(base_type, int)
            elif base_type == REF_DELTA:
                # REF_DELTA stores the 20-byte SHA of its base, which may
                # live outside this pack (resolved via get_ref).
                (basename, delta) = base_obj
                assert isinstance(basename, bytes) and len(basename) == 20
                base_offset, base_type, base_obj = get_ref(basename)
                assert isinstance(base_type, int)
                if base_offset == prev_offset:  # object is based on itself
                    raise UnresolvedDeltas([basename])
            delta_stack.append((prev_offset, base_type, delta))

        # Now grab the base object (mustn't be a delta) and apply the
        # deltas all the way up the stack.
        chunks = base_obj
        for prev_offset, _delta_type, delta in reversed(delta_stack):
            chunks = apply_delta(chunks, delta)
            # Cache each intermediate result by its pack offset so repeated
            # resolutions of chain members are cheap.
            if prev_offset is not None:
                self.data._offset_cache[prev_offset] = base_type, chunks
        return base_type, chunks
3171 def entries(
3172 self, progress: Optional[ProgressFn] = None
3173 ) -> Iterator[PackIndexEntry]:
3174 """Yield entries summarizing the contents of this pack.
3176 Args:
3177 progress: Progress function, called with current and total
3178 object count.
3179 Returns: iterator of tuples with (sha, offset, crc32)
3180 """
3181 return self.data.iterentries(
3182 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3183 )
3185 def sorted_entries(
3186 self, progress: Optional[ProgressFn] = None
3187 ) -> Iterator[PackIndexEntry]:
3188 """Return entries in this pack, sorted by SHA.
3190 Args:
3191 progress: Progress function, called with current and total
3192 object count
3193 Returns: Iterator of tuples with (sha, offset, crc32)
3194 """
3195 return self.data.sorted_entries(
3196 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3197 )
3199 def get_unpacked_object(
3200 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3201 ) -> UnpackedObject:
3202 """Get the unpacked object for a sha.
3204 Args:
3205 sha: SHA of object to fetch
3206 include_comp: Whether to include compression data in UnpackedObject
3207 """
3208 offset = self.index.object_offset(sha)
3209 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3210 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3211 assert isinstance(unpacked.delta_base, int)
3212 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3213 unpacked.pack_type_num = REF_DELTA
3214 return unpacked
def extend_pack(
    f: BinaryIO,
    object_ids: set[ObjectID],
    get_raw,
    *,
    compression_level=-1,
    progress=None,
) -> tuple[bytes, list]:
    """Extend a pack file with more objects.

    The caller should make sure that object_ids does not contain any objects
    that are already in the pack

    Args:
      f: Open, seekable binary file object for the pack (positioned
        anywhere; this function seeks as needed).
      get_raw: Callable mapping a 20-byte binary SHA to a
        (type_num, data) tuple.
      compression_level: zlib compression level for the appended objects
        (-1 for the zlib default).
      progress: Optional callable receiving progress messages as bytes.
    Returns: Tuple of (new pack checksum, list of (object_id, offset, crc32)
      entries for the appended objects).
    """
    # Update the header with the new number of objects.
    f.seek(0)
    _version, num_objects = read_pack_header(f.read)

    if object_ids:
        f.seek(0)
        write_pack_header(f.write, num_objects + len(object_ids))

        # Must flush before reading (http://bugs.python.org/issue3207)
        f.flush()

    # Rescan the rest of the pack, computing the SHA with the new header.
    # end_ofs=-20 excludes the old trailing pack checksum, which will be
    # replaced below.
    new_sha = compute_file_sha(f, end_ofs=-20)

    # Must reposition before writing (http://bugs.python.org/issue3207)
    f.seek(0, os.SEEK_CUR)

    extra_entries = []

    # Complete the pack: append each requested object, feeding its bytes
    # into the running checksum and recording its offset and CRC32 for the
    # caller to index.
    for i, object_id in enumerate(object_ids):
        if progress is not None:
            progress(
                (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
            )
        assert len(object_id) == 20
        type_num, data = get_raw(object_id)
        offset = f.tell()
        crc32 = write_pack_object(
            f.write,
            type_num,
            data,
            sha=new_sha,
            compression_level=compression_level,
        )
        extra_entries.append((object_id, offset, crc32))
    # Write the recomputed checksum as the new pack trailer.
    pack_sha = new_sha.digest()
    f.write(pack_sha)
    return pack_sha, extra_entries
# Prefer the optional compiled extension module's implementations of these
# hot functions, replacing the pure-Python versions defined earlier in this
# file; silently fall back when the extension is not built/installed.
try:
    from dulwich._pack import (  # type: ignore
        apply_delta,  # type: ignore
        bisect_find_sha,  # type: ignore
    )
except ImportError:
    pass