Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 23%
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28A pack has two parts: the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in each of the index files until you find a
32match for the object name. The offset obtained from the index is then used
33to locate the object in the corresponding pack file.
34"""
36__all__ = [
37 "DEFAULT_PACK_DELTA_WINDOW_SIZE",
38 "DEFAULT_PACK_INDEX_VERSION",
39 "DELTA_TYPES",
40 "OFS_DELTA",
41 "PACK_SPOOL_FILE_MAX_SIZE",
42 "REF_DELTA",
43 "DeltaChainIterator",
44 "FilePackIndex",
45 "MemoryPackIndex",
46 "ObjectContainer",
47 "Pack",
48 "PackChunkGenerator",
49 "PackData",
50 "PackFileDisappeared",
51 "PackHint",
52 "PackIndex",
53 "PackIndex1",
54 "PackIndex2",
55 "PackIndex3",
56 "PackIndexEntry",
57 "PackIndexer",
58 "PackInflater",
59 "PackStreamCopier",
60 "PackStreamReader",
61 "PackedObjectContainer",
62 "SHA1Reader",
63 "SHA1Writer",
64 "UnpackedObject",
65 "UnpackedObjectIterator",
66 "UnpackedObjectStream",
67 "UnresolvedDeltas",
68 "apply_delta",
69 "bisect_find_sha",
70 "chunks_length",
71 "compute_file_sha",
72 "deltas_from_sorted_objects",
73 "deltify_pack_objects",
74 "extend_pack",
75 "find_reusable_deltas",
76 "full_unpacked_object",
77 "generate_unpacked_objects",
78 "iter_sha1",
79 "load_pack_index",
80 "load_pack_index_file",
81 "obj_sha",
82 "pack_header_chunks",
83 "pack_object_chunks",
84 "pack_object_header",
85 "pack_objects_to_data",
86 "read_pack_header",
87 "read_zlib_chunks",
88 "sort_objects_for_delta",
89 "take_msb_bytes",
90 "unpack_object",
91 "write_pack",
92 "write_pack_data",
93 "write_pack_from_container",
94 "write_pack_header",
95 "write_pack_index",
96 "write_pack_object",
97 "write_pack_objects",
98]
100import binascii
101from collections import defaultdict, deque
102from contextlib import suppress
103from io import BytesIO, UnsupportedOperation
105try:
106 from cdifflib import CSequenceMatcher as SequenceMatcher
107except ModuleNotFoundError:
108 from difflib import SequenceMatcher
110import os
111import struct
112import sys
113import warnings
114import zlib
115from collections.abc import Callable, Iterable, Iterator, Sequence, Set
116from hashlib import sha1
117from itertools import chain
118from os import SEEK_CUR, SEEK_END
119from struct import unpack_from
120from types import TracebackType
121from typing import (
122 IO,
123 TYPE_CHECKING,
124 Any,
125 BinaryIO,
126 Generic,
127 Protocol,
128 TypeVar,
129 cast,
130)
132try:
133 import mmap
134except ImportError:
135 has_mmap = False
136else:
137 has_mmap = True
139if TYPE_CHECKING:
140 from _hashlib import HASH as HashObject
142 from .bitmap import PackBitmap
143 from .commit_graph import CommitGraph
144 from .object_store import BaseObjectStore
145 from .refs import Ref
147# For some reason the above try, except fails to set has_mmap = False for plan9
148if sys.platform == "Plan9":
149 has_mmap = False
151from . import replace_me
152from .errors import ApplyDeltaError, ChecksumMismatch
153from .file import GitFile, _GitFile
154from .lru_cache import LRUSizeCache
155from .objects import (
156 ObjectID,
157 RawObjectID,
158 ShaFile,
159 hex_to_sha,
160 object_header,
161 sha_to_hex,
162)
164OFS_DELTA = 6
165REF_DELTA = 7
167DELTA_TYPES = (OFS_DELTA, REF_DELTA)
170DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
172# Keep pack files under 16Mb in memory, otherwise write them out to disk
173PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
175# Default pack index version to use when none is specified
176DEFAULT_PACK_INDEX_VERSION = 2
179OldUnpackedObject = tuple[bytes | int, list[bytes]] | list[bytes]
180ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
181ProgressFn = Callable[[int, str], None]
182PackHint = tuple[int, bytes | None]
185class UnresolvedDeltas(Exception):
186 """Delta objects could not be resolved."""
188 def __init__(self, shas: list[bytes]) -> None:
189 """Initialize UnresolvedDeltas exception.
191 Args:
192 shas: List of SHA hashes for unresolved delta objects
193 """
194 self.shas = shas
197class ObjectContainer(Protocol):
198 """Protocol for objects that can contain git objects."""
200 def add_object(self, obj: ShaFile) -> None:
201 """Add a single object to this object store."""
203 def add_objects(
204 self,
205 objects: Sequence[tuple[ShaFile, str | None]],
206 progress: Callable[..., None] | None = None,
207 ) -> "Pack | None":
208 """Add a set of objects to this object store.
210 Args:
211 objects: Iterable over a list of (object, path) tuples
212 progress: Progress callback for object insertion
213 Returns: Optional Pack object of the objects written.
214 """
216 def __contains__(self, sha1: "ObjectID") -> bool:
217 """Check if a hex sha is present."""
219 def __getitem__(self, sha1: "ObjectID | RawObjectID") -> ShaFile:
220 """Retrieve an object."""
222 def get_commit_graph(self) -> "CommitGraph | None":
223 """Get the commit graph for this object store.
225 Returns:
226 CommitGraph object if available, None otherwise
227 """
228 return None
231class PackedObjectContainer(ObjectContainer):
232 """Container for objects packed in a pack file."""
234 def get_unpacked_object(
235 self, sha1: "ObjectID | RawObjectID", *, include_comp: bool = False
236 ) -> "UnpackedObject":
237 """Get a raw unresolved object.
239 Args:
240 sha1: SHA-1 hash of the object
241 include_comp: Whether to include compressed data
243 Returns:
244 UnpackedObject instance
245 """
246 raise NotImplementedError(self.get_unpacked_object)
248 def iterobjects_subset(
249 self, shas: Iterable["ObjectID"], *, allow_missing: bool = False
250 ) -> Iterator[ShaFile]:
251 """Iterate over a subset of objects.
253 Args:
254 shas: Iterable of object SHAs to retrieve
255 allow_missing: If True, skip missing objects
257 Returns:
258 Iterator of ShaFile objects
259 """
260 raise NotImplementedError(self.iterobjects_subset)
262 def iter_unpacked_subset(
263 self,
264 shas: Iterable["ObjectID | RawObjectID"],
265 *,
266 include_comp: bool = False,
267 allow_missing: bool = False,
268 convert_ofs_delta: bool = True,
269 ) -> Iterator["UnpackedObject"]:
270 """Iterate over unpacked objects from a subset of SHAs.
272 Args:
273 shas: Set of object SHAs to retrieve
274 include_comp: Include compressed data if True
275 allow_missing: If True, skip missing objects
276 convert_ofs_delta: If True, convert offset deltas to ref deltas
278 Returns:
279 Iterator of UnpackedObject instances
280 """
281 raise NotImplementedError(self.iter_unpacked_subset)
284class UnpackedObjectStream:
285 """Abstract base class for a stream of unpacked objects."""
287 def __iter__(self) -> Iterator["UnpackedObject"]:
288 """Iterate over unpacked objects."""
289 raise NotImplementedError(self.__iter__)
291 def __len__(self) -> int:
292 """Return the number of objects in the stream."""
293 raise NotImplementedError(self.__len__)
296def take_msb_bytes(
297 read: Callable[[int], bytes], crc32: int | None = None
298) -> tuple[list[int], int | None]:
299 """Read bytes marked with most significant bit.
301 Args:
302 read: Read function
303 crc32: Optional CRC32 checksum to update
305 Returns:
306 Tuple of (list of bytes read, updated CRC32 or None)
307 """
308 ret: list[int] = []
309 while len(ret) == 0 or ret[-1] & 0x80:
310 b = read(1)
311 if crc32 is not None:
312 crc32 = binascii.crc32(b, crc32)
313 ret.append(ord(b[:1]))
314 return ret, crc32
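# Editor's illustration (not part of dulwich): take_msb_bytes keeps reading
# single bytes while the most significant bit is set, which is how pack
# headers encode variable-length integers.
def _example_take_msb_bytes():
    from io import BytesIO
    # 0x91 has its MSB set, so reading continues; 0x2e does not, so it stops.
    read = BytesIO(bytes([0x91, 0x2e]) + b"rest of the pack").read
    parts, crc = take_msb_bytes(read)
    assert parts == [0x91, 0x2e] and crc is None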
317class PackFileDisappeared(Exception):
318 """Raised when a pack file unexpectedly disappears."""
320 def __init__(self, obj: object) -> None:
321 """Initialize PackFileDisappeared exception.
323 Args:
324 obj: The object that triggered the exception
325 """
326 self.obj = obj
329class UnpackedObject:
330 """Class encapsulating an object unpacked from a pack file.
332 These objects should only be created from within unpack_object. Most
333 members start out as empty and are filled in at various points by
334 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
336 End users of this object should take care that the function they're getting
337 this object from is guaranteed to set the members they need.
338 """
340 __slots__ = [
341 "_sha", # Cached binary SHA.
342 "comp_chunks", # Compressed object chunks.
343 "crc32", # CRC32.
344 "decomp_chunks", # Decompressed object chunks.
345 "decomp_len", # Decompressed length of this object.
346 "delta_base", # Delta base offset or SHA.
347 "obj_chunks", # Decompressed and delta-resolved chunks.
348 "obj_type_num", # Type of this object.
349 "offset", # Offset in its pack.
350 "pack_type_num", # Type of this object in the pack (may be a delta).
351 ]
353 obj_type_num: int | None
354 obj_chunks: list[bytes] | None
355 delta_base: None | bytes | int
356 decomp_chunks: list[bytes]
357 comp_chunks: list[bytes] | None
358 decomp_len: int | None
359 crc32: int | None
360 offset: int | None
361 pack_type_num: int
362 _sha: bytes | None
364 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
365 # methods of this object.
366 def __init__(
367 self,
368 pack_type_num: int,
369 *,
370 delta_base: None | bytes | int = None,
371 decomp_len: int | None = None,
372 crc32: int | None = None,
373 sha: bytes | None = None,
374 decomp_chunks: list[bytes] | None = None,
375 offset: int | None = None,
376 ) -> None:
377 """Initialize an UnpackedObject.
379 Args:
380 pack_type_num: Type number of this object in the pack
381 delta_base: Delta base (offset or SHA) if this is a delta object
382 decomp_len: Decompressed length of this object
383 crc32: CRC32 checksum
384 sha: SHA-1 hash of the object
385 decomp_chunks: Decompressed chunks
386 offset: Offset in the pack file
387 """
388 self.offset = offset
389 self._sha = sha
390 self.pack_type_num = pack_type_num
391 self.delta_base = delta_base
392 self.comp_chunks = None
393 self.decomp_chunks: list[bytes] = decomp_chunks or []
394 if decomp_chunks is not None and decomp_len is None:
395 self.decomp_len = sum(map(len, decomp_chunks))
396 else:
397 self.decomp_len = decomp_len
398 self.crc32 = crc32
400 if pack_type_num in DELTA_TYPES:
401 self.obj_type_num = None
402 self.obj_chunks = None
403 else:
404 self.obj_type_num = pack_type_num
405 self.obj_chunks = self.decomp_chunks
406 self.delta_base = delta_base
408 def sha(self) -> RawObjectID:
409 """Return the binary SHA of this object."""
410 if self._sha is None:
411 assert self.obj_type_num is not None and self.obj_chunks is not None
412 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
413 return RawObjectID(self._sha)
415 def sha_file(self) -> ShaFile:
416 """Return a ShaFile from this object."""
417 assert self.obj_type_num is not None and self.obj_chunks is not None
418 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
420 # Only provided for backwards compatibility with code that expects either
421 # chunks or a delta tuple.
422 def _obj(self) -> OldUnpackedObject:
423 """Return the decompressed chunks, or (delta base, delta chunks)."""
424 if self.pack_type_num in DELTA_TYPES:
425 assert isinstance(self.delta_base, (bytes, int))
426 return (self.delta_base, self.decomp_chunks)
427 else:
428 return self.decomp_chunks
430 def __eq__(self, other: object) -> bool:
431 """Check equality with another UnpackedObject."""
432 if not isinstance(other, UnpackedObject):
433 return False
434 for slot in self.__slots__:
435 if getattr(self, slot) != getattr(other, slot):
436 return False
437 return True
439 def __ne__(self, other: object) -> bool:
440 """Check inequality with another UnpackedObject."""
441 return not (self == other)
443 def __repr__(self) -> str:
444 """Return string representation of this UnpackedObject."""
445 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
446 return "{}({})".format(self.__class__.__name__, ", ".join(data))
449_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
452def read_zlib_chunks(
453 read_some: Callable[[int], bytes],
454 unpacked: UnpackedObject,
455 include_comp: bool = False,
456 buffer_size: int = _ZLIB_BUFSIZE,
457) -> bytes:
458 """Read zlib data from a buffer.
460 This function requires that the buffer have additional data following the
461 compressed data, which is guaranteed to be the case for git pack files.
463 Args:
464 read_some: Read function that returns at least one byte, but may
465 return less than the requested size.
466 unpacked: An UnpackedObject to write result data to. If its crc32
467 attr is not None, the CRC32 of the compressed bytes will be computed
468 using this starting CRC32.
469 After this function, will have the following attrs set:
470 * comp_chunks (if include_comp is True)
471 * decomp_chunks
472 * decomp_len
473 * crc32
474 include_comp: If True, include compressed data in the result.
475 buffer_size: Size of the read buffer.
476 Returns: Leftover unused data from the decompression.
478 Raises:
479 zlib.error: if a decompression error occurred.
480 """
481 if unpacked.decomp_len is None or unpacked.decomp_len <= -1:
482 raise ValueError("non-negative zlib data stream size expected")
483 decomp_obj = zlib.decompressobj()
485 comp_chunks = []
486 decomp_chunks = unpacked.decomp_chunks
487 decomp_len = 0
488 crc32 = unpacked.crc32
490 while True:
491 add = read_some(buffer_size)
492 if not add:
493 raise zlib.error("EOF before end of zlib stream")
494 comp_chunks.append(add)
495 decomp = decomp_obj.decompress(add)
496 decomp_len += len(decomp)
497 decomp_chunks.append(decomp)
498 unused = decomp_obj.unused_data
499 if unused:
500 left = len(unused)
501 if crc32 is not None:
502 crc32 = binascii.crc32(add[:-left], crc32)
503 if include_comp:
504 comp_chunks[-1] = add[:-left]
505 break
506 elif crc32 is not None:
507 crc32 = binascii.crc32(add, crc32)
508 if crc32 is not None:
509 crc32 &= 0xFFFFFFFF
511 if decomp_len != unpacked.decomp_len:
512 raise zlib.error("decompressed data does not match expected size")
514 unpacked.crc32 = crc32
515 if include_comp:
516 unpacked.comp_chunks = comp_chunks
517 return unused
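# Editor's illustration (not part of dulwich): read_zlib_chunks inflates into
# an UnpackedObject and hands back whatever trailed the zlib stream. 3 is the
# pack type number for a blob; the trailing bytes stand in for the data that
# always follows an entry in a real pack.
def _example_read_zlib_chunks():
    from io import BytesIO
    payload = b"hello"
    stream = zlib.compress(payload) + b"next entry"
    unpacked = UnpackedObject(3, decomp_len=len(payload))
    leftover = read_zlib_chunks(BytesIO(stream).read, unpacked)
    assert b"".join(unpacked.decomp_chunks) == payload
    assert leftover == b"next entry"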
520def iter_sha1(iter: Iterable[bytes]) -> bytes:
521 """Return the hexdigest of the SHA1 over a set of names.
523 Args:
524 iter: Iterator over string objects
525 Returns: 40-byte hex sha1 digest
526 """
527 sha = sha1()
528 for name in iter:
529 sha.update(name)
530 return sha.hexdigest().encode("ascii")
533def load_pack_index(path: str | os.PathLike[str]) -> "PackIndex":
534 """Load an index file by path.
536 Args:
537 path: Path to the index file
538 Returns: A PackIndex loaded from the given path
539 """
540 with GitFile(path, "rb") as f:
541 return load_pack_index_file(path, f)
544def _load_file_contents(
545 f: IO[bytes] | _GitFile, size: int | None = None
546) -> tuple[bytes | Any, int]:
547 """Load contents from a file, preferring mmap when possible.
549 Args:
550 f: File-like object to load
551 size: Expected size, or None to determine from file
552 Returns: Tuple of (contents, size)
553 """
554 try:
555 fd = f.fileno()
556 except (UnsupportedOperation, AttributeError):
557 fd = None
558 # Attempt to use mmap if possible
559 if fd is not None:
560 if size is None:
561 size = os.fstat(fd).st_size
562 if has_mmap:
563 try:
564 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
565 except (OSError, ValueError):
566 # Can't mmap - perhaps a socket or invalid file descriptor
567 pass
568 else:
569 return contents, size
570 contents_bytes = f.read()
571 size = len(contents_bytes)
572 return contents_bytes, size
575def load_pack_index_file(
576 path: str | os.PathLike[str], f: IO[bytes] | _GitFile
577) -> "PackIndex":
578 """Load an index file from a file-like object.
580 Args:
581 path: Path for the index file
582 f: File-like object
583 Returns: A PackIndex loaded from the given file
584 """
585 contents, size = _load_file_contents(f)
586 if contents[:4] == b"\377tOc":
587 version = struct.unpack(b">L", contents[4:8])[0]
588 if version == 2:
589 return PackIndex2(path, file=f, contents=contents, size=size)
590 elif version == 3:
591 return PackIndex3(path, file=f, contents=contents, size=size)
592 else:
593 raise KeyError(f"Unknown pack index format {version}")
594 else:
595 return PackIndex1(path, file=f, contents=contents, size=size)
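# Editor's illustration (not part of dulwich): the dispatch above keys off the
# b"\377tOc" magic; anything without it is treated as a v1 index. This builds
# the smallest possible v1 index -- an all-zero fan-out table plus two 20-byte
# checksums -- describing a pack with no objects.
def _example_load_empty_v1_index():
    from io import BytesIO
    contents = b"\x00" * (0x100 * 4) + b"\x00" * 20 + b"\x00" * 20
    index = load_pack_index_file("empty.idx", BytesIO(contents))
    assert isinstance(index, PackIndex1) and len(index) == 0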
598def bisect_find_sha(
599 start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes]
600) -> int | None:
601 """Find a SHA in a data blob with sorted SHAs.
603 Args:
604 start: Start index of range to search
605 end: End index of range to search
606 sha: Sha to find
607 unpack_name: Callback to retrieve SHA by index
608 Returns: Index of the SHA, or None if it wasn't found
609 """
610 assert start <= end
611 while start <= end:
612 i = (start + end) // 2
613 file_sha = unpack_name(i)
614 if file_sha < sha:
615 start = i + 1
616 elif file_sha > sha:
617 end = i - 1
618 else:
619 return i
620 return None
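# Editor's illustration (not part of dulwich): bisect_find_sha only needs a
# sorted sequence of binary names and a callback that fetches the i-th one.
def _example_bisect_find_sha():
    names = sorted(sha1(bytes([i])).digest() for i in range(8))
    assert bisect_find_sha(0, len(names) - 1, names[5], lambda i: names[i]) == 5
    assert bisect_find_sha(0, len(names) - 1, b"\x00" * 20, lambda i: names[i]) is None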
623PackIndexEntry = tuple[RawObjectID, int, int | None]
626class PackIndex:
627 """An index in to a packfile.
629 Given a sha id of an object a pack index can tell you the location in the
630 packfile of that object if it has it.
631 """
633 # Default to SHA-1 for backward compatibility
634 hash_algorithm = 1
635 hash_size = 20
637 def __eq__(self, other: object) -> bool:
638 """Check equality with another PackIndex."""
639 if not isinstance(other, PackIndex):
640 return False
642 for (name1, _, _), (name2, _, _) in zip(
643 self.iterentries(), other.iterentries()
644 ):
645 if name1 != name2:
646 return False
647 return True
649 def __ne__(self, other: object) -> bool:
650 """Check if this pack index is not equal to another."""
651 return not self.__eq__(other)
653 def __len__(self) -> int:
654 """Return the number of entries in this pack index."""
655 raise NotImplementedError(self.__len__)
657 def __iter__(self) -> Iterator[ObjectID]:
658 """Iterate over the SHAs in this pack."""
659 return map(lambda sha: sha_to_hex(RawObjectID(sha)), self._itersha())
661 def iterentries(self) -> Iterator[PackIndexEntry]:
662 """Iterate over the entries in this pack index.
664 Returns: iterator over tuples with object name, offset in packfile and
665 crc32 checksum.
666 """
667 raise NotImplementedError(self.iterentries)
669 def get_pack_checksum(self) -> bytes | None:
670 """Return the SHA1 checksum stored for the corresponding packfile.
672 Returns: 20-byte binary digest, or None if not available
673 """
674 raise NotImplementedError(self.get_pack_checksum)
676 @replace_me(since="0.21.0", remove_in="0.23.0")
677 def object_index(self, sha: ObjectID | RawObjectID) -> int:
678 """Return the index for the given SHA.
680 Args:
681 sha: SHA-1 hash
683 Returns:
 684 Offset into the corresponding pack file (deprecated alias for object_offset)
685 """
686 return self.object_offset(sha)
688 def object_offset(self, sha: ObjectID | RawObjectID) -> int:
689 """Return the offset in to the corresponding packfile for the object.
691 Given the name of an object it will return the offset that object
692 lives at within the corresponding pack file. If the pack file doesn't
693 have the object then None will be returned.
694 """
695 raise NotImplementedError(self.object_offset)
697 def object_sha1(self, index: int) -> bytes:
698 """Return the SHA1 corresponding to the index in the pack file."""
699 for name, offset, _crc32 in self.iterentries():
700 if offset == index:
701 return name
702 else:
703 raise KeyError(index)
705 def _object_offset(self, sha: bytes) -> int:
706 """See object_offset.
708 Args:
 709 sha: A *binary* SHA string (20 bytes long).
710 """
711 raise NotImplementedError(self._object_offset)
713 def objects_sha1(self) -> bytes:
714 """Return the hex SHA1 over all the shas of all objects in this pack.
716 Note: This is used for the filename of the pack.
717 """
718 return iter_sha1(self._itersha())
720 def _itersha(self) -> Iterator[bytes]:
721 """Yield all the SHA1's of the objects in the index, sorted."""
722 raise NotImplementedError(self._itersha)
724 def iter_prefix(self, prefix: bytes) -> Iterator[RawObjectID]:
725 """Iterate over all SHA1s with the given prefix.
727 Args:
728 prefix: Binary prefix to match
729 Returns: Iterator of matching SHA1s
730 """
731 # Default implementation for PackIndex classes that don't override
732 for sha, _, _ in self.iterentries():
733 if sha.startswith(prefix):
734 yield RawObjectID(sha)
736 def close(self) -> None:
737 """Close any open files."""
739 def check(self) -> None:
740 """Check the consistency of this pack index."""
743class MemoryPackIndex(PackIndex):
744 """Pack index that is stored entirely in memory."""
746 def __init__(
747 self,
748 entries: list[PackIndexEntry],
749 pack_checksum: bytes | None = None,
750 ) -> None:
751 """Create a new MemoryPackIndex.
753 Args:
754 entries: Sequence of name, idx, crc32 (sorted)
755 pack_checksum: Optional pack checksum
756 """
757 self._by_sha = {}
758 self._by_offset = {}
759 for name, offset, _crc32 in entries:
760 self._by_sha[name] = offset
761 self._by_offset[offset] = name
762 self._entries = entries
763 self._pack_checksum = pack_checksum
765 def get_pack_checksum(self) -> bytes | None:
766 """Return the SHA checksum stored for the corresponding packfile."""
767 return self._pack_checksum
769 def __len__(self) -> int:
770 """Return the number of entries in this pack index."""
771 return len(self._entries)
773 def object_offset(self, sha: ObjectID | RawObjectID) -> int:
774 """Return the offset for the given SHA.
776 Args:
777 sha: SHA to look up (binary or hex)
778 Returns: Offset in the pack file
779 """
780 if len(sha) == 40:
781 sha = hex_to_sha(cast(ObjectID, sha))
782 return self._by_sha[cast(RawObjectID, sha)]
784 def object_sha1(self, offset: int) -> bytes:
785 """Return the SHA1 for the object at the given offset."""
786 return self._by_offset[offset]
788 def _itersha(self) -> Iterator[bytes]:
789 """Iterate over all SHA1s in the index."""
790 return iter(self._by_sha)
792 def iterentries(self) -> Iterator[PackIndexEntry]:
793 """Iterate over all index entries."""
794 return iter(self._entries)
796 @classmethod
797 def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex":
798 """Create a MemoryPackIndex from a PackData object."""
799 return MemoryPackIndex(
800 list(pack_data.sorted_entries()), pack_data.get_stored_checksum()
801 )
803 @classmethod
804 def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex":
805 """Create a copy of another PackIndex in memory."""
806 return cls(list(other_index.iterentries()), other_index.get_pack_checksum())
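# Editor's illustration (not part of dulwich): a MemoryPackIndex is just the
# sorted (name, offset, crc32) entries held in dictionaries.
def _example_memory_pack_index():
    name = sha1(b"some object").digest()
    index = MemoryPackIndex([(name, 12, None)])
    assert len(index) == 1
    assert index.object_offset(name) == 12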
809class FilePackIndex(PackIndex):
810 """Pack index that is based on a file.
 812 To do a lookup it opens the file and reads the fan-out table: 256 4-byte
 813 entries indexed by the first byte of the sha id. The value at a given
 814 index is the end of the group of entries whose shas start with that byte;
 815 the value at the previous index gives the start of the group.
 816 Entries are sorted by sha id within a group, so those start and end
 817 offsets bound a bisection search that determines whether the value is
 818 present.
819 """
821 _fan_out_table: list[int]
822 _file: IO[bytes] | _GitFile
824 def __init__(
825 self,
826 filename: str | os.PathLike[str],
827 file: IO[bytes] | _GitFile | None = None,
828 contents: "bytes | mmap.mmap | None" = None,
829 size: int | None = None,
830 ) -> None:
831 """Create a pack index object.
833 Provide it with the name of the index file to consider, and it will map
834 it whenever required.
835 """
836 self._filename = filename
837 # Take the size now, so it can be checked each time we map the file to
838 # ensure that it hasn't changed.
839 if file is None:
840 self._file = GitFile(filename, "rb")
841 else:
842 self._file = file
843 if contents is None:
844 self._contents, self._size = _load_file_contents(self._file, size)
845 else:
846 self._contents = contents
847 self._size = size if size is not None else len(contents)
849 @property
850 def path(self) -> str:
851 """Return the path to this index file."""
852 return os.fspath(self._filename)
854 def __eq__(self, other: object) -> bool:
855 """Check equality with another FilePackIndex."""
856 # Quick optimization:
857 if (
858 isinstance(other, FilePackIndex)
859 and self._fan_out_table != other._fan_out_table
860 ):
861 return False
863 return super().__eq__(other)
865 def close(self) -> None:
866 """Close the underlying file and any mmap."""
867 self._file.close()
868 close_fn = getattr(self._contents, "close", None)
869 if close_fn is not None:
870 close_fn()
872 def __len__(self) -> int:
873 """Return the number of entries in this pack index."""
874 return self._fan_out_table[-1]
876 def _unpack_entry(self, i: int) -> PackIndexEntry:
877 """Unpack the i-th entry in the index file.
879 Returns: Tuple with object name (SHA), offset in pack file and CRC32
880 checksum (if known).
881 """
882 raise NotImplementedError(self._unpack_entry)
884 def _unpack_name(self, i: int) -> bytes:
885 """Unpack the i-th name from the index file."""
886 raise NotImplementedError(self._unpack_name)
888 def _unpack_offset(self, i: int) -> int:
889 """Unpack the i-th object offset from the index file."""
890 raise NotImplementedError(self._unpack_offset)
892 def _unpack_crc32_checksum(self, i: int) -> int | None:
893 """Unpack the crc32 checksum for the ith object from the index file."""
894 raise NotImplementedError(self._unpack_crc32_checksum)
896 def _itersha(self) -> Iterator[bytes]:
897 """Iterate over all SHA1s in the index."""
898 for i in range(len(self)):
899 yield self._unpack_name(i)
901 def iterentries(self) -> Iterator[PackIndexEntry]:
902 """Iterate over the entries in this pack index.
904 Returns: iterator over tuples with object name, offset in packfile and
905 crc32 checksum.
906 """
907 for i in range(len(self)):
908 yield self._unpack_entry(i)
910 def _read_fan_out_table(self, start_offset: int) -> list[int]:
911 """Read the fan-out table from the index.
913 The fan-out table contains 256 entries mapping first byte values
914 to the number of objects with SHA1s less than or equal to that byte.
916 Args:
917 start_offset: Offset in the file where the fan-out table starts
918 Returns: List of 256 integers
919 """
920 ret = []
921 for i in range(0x100):
922 fanout_entry = self._contents[
923 start_offset + i * 4 : start_offset + (i + 1) * 4
924 ]
925 ret.append(struct.unpack(">L", fanout_entry)[0])
926 return ret
928 def check(self) -> None:
929 """Check that the stored checksum matches the actual checksum."""
930 actual = self.calculate_checksum()
931 stored = self.get_stored_checksum()
932 if actual != stored:
933 raise ChecksumMismatch(stored, actual)
935 def calculate_checksum(self) -> bytes:
936 """Calculate the SHA1 checksum over this pack index.
938 Returns: This is a 20-byte binary digest
939 """
940 return sha1(self._contents[:-20]).digest()
942 def get_pack_checksum(self) -> bytes:
943 """Return the SHA1 checksum stored for the corresponding packfile.
945 Returns: 20-byte binary digest
946 """
947 return bytes(self._contents[-40:-20])
949 def get_stored_checksum(self) -> bytes:
950 """Return the SHA1 checksum stored for this index.
952 Returns: 20-byte binary digest
953 """
954 return bytes(self._contents[-20:])
956 def object_offset(self, sha: ObjectID | RawObjectID) -> int:
957 """Return the offset in to the corresponding packfile for the object.
959 Given the name of an object it will return the offset that object
960 lives at within the corresponding pack file. If the pack file doesn't
 961 have the object then a KeyError is raised.
962 """
963 if len(sha) == 40:
964 sha = hex_to_sha(cast(ObjectID, sha))
965 try:
966 return self._object_offset(sha)
967 except ValueError as exc:
968 closed = getattr(self._contents, "closed", None)
969 if closed in (None, True):
970 raise PackFileDisappeared(self) from exc
971 raise
973 def _object_offset(self, sha: bytes) -> int:
974 """See object_offset.
976 Args:
 977 sha: A *binary* SHA string (20 bytes long).
978 """
979 assert len(sha) == 20
980 idx = ord(sha[:1])
981 if idx == 0:
982 start = 0
983 else:
984 start = self._fan_out_table[idx - 1]
985 end = self._fan_out_table[idx]
986 i = bisect_find_sha(start, end, sha, self._unpack_name)
987 if i is None:
988 raise KeyError(sha)
989 return self._unpack_offset(i)
991 def iter_prefix(self, prefix: bytes) -> Iterator[RawObjectID]:
992 """Iterate over all SHA1s with the given prefix."""
993 start = ord(prefix[:1])
994 if start == 0:
995 start = 0
996 else:
997 start = self._fan_out_table[start - 1]
998 end = ord(prefix[:1]) + 1
999 if end == 0x100:
1000 end = len(self)
1001 else:
1002 end = self._fan_out_table[end]
1003 assert start <= end
1004 started = False
1005 for i in range(start, end):
1006 name: bytes = self._unpack_name(i)
1007 if name.startswith(prefix):
1008 yield RawObjectID(name)
1009 started = True
1010 elif started:
1011 break
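# Editor's illustration (not part of dulwich): the fan-out arithmetic used by
# _object_offset above. Entry i of the 256-entry table counts the objects
# whose first SHA byte is <= i, so the group for first byte b occupies
# indices [fan_out[b - 1], fan_out[b]) of the sorted name table.
def _example_fan_out_bounds():
    counts = {0x10: 2, 0xab: 1}  # hypothetical pack with three objects
    fan_out, total = [], 0
    for byte in range(0x100):
        total += counts.get(byte, 0)
        fan_out.append(total)
    assert (fan_out[0x10 - 1], fan_out[0x10]) == (0, 2)  # both 0x10... objects
    assert (fan_out[0xab - 1], fan_out[0xab]) == (2, 3)  # the single 0xab... object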
1014class PackIndex1(FilePackIndex):
1015 """Version 1 Pack Index file."""
1017 def __init__(
1018 self,
1019 filename: str | os.PathLike[str],
1020 file: IO[bytes] | _GitFile | None = None,
1021 contents: bytes | None = None,
1022 size: int | None = None,
1023 ) -> None:
1024 """Initialize a version 1 pack index.
1026 Args:
1027 filename: Path to the index file
1028 file: Optional file object
1029 contents: Optional mmap'd contents
1030 size: Optional size of the index
1031 """
1032 super().__init__(filename, file, contents, size)
1033 self.version = 1
1034 self._fan_out_table = self._read_fan_out_table(0)
1036 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, None]:
1037 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
1038 return (RawObjectID(name), offset, None)
1040 def _unpack_name(self, i: int) -> bytes:
1041 offset = (0x100 * 4) + (i * 24) + 4
1042 return self._contents[offset : offset + 20]
1044 def _unpack_offset(self, i: int) -> int:
1045 offset = (0x100 * 4) + (i * 24)
1046 result = unpack_from(">L", self._contents, offset)[0]
1047 assert isinstance(result, int)
1048 return result
1050 def _unpack_crc32_checksum(self, i: int) -> None:
1051 # Not stored in v1 index files
1052 return None
1055class PackIndex2(FilePackIndex):
1056 """Version 2 Pack Index file."""
1058 def __init__(
1059 self,
1060 filename: str | os.PathLike[str],
1061 file: IO[bytes] | _GitFile | None = None,
1062 contents: bytes | None = None,
1063 size: int | None = None,
1064 ) -> None:
1065 """Initialize a version 2 pack index.
1067 Args:
1068 filename: Path to the index file
1069 file: Optional file object
1070 contents: Optional mmap'd contents
1071 size: Optional size of the index
1072 """
1073 super().__init__(filename, file, contents, size)
1074 if self._contents[:4] != b"\377tOc":
1075 raise AssertionError("Not a v2 pack index file")
1076 (self.version,) = unpack_from(b">L", self._contents, 4)
1077 if self.version != 2:
1078 raise AssertionError(f"Version was {self.version}")
1079 self._fan_out_table = self._read_fan_out_table(8)
1080 self._name_table_offset = 8 + 0x100 * 4
1081 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
1082 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
1083 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
1084 self
1085 )
1087 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, int]:
1088 return (
1089 RawObjectID(self._unpack_name(i)),
1090 self._unpack_offset(i),
1091 self._unpack_crc32_checksum(i),
1092 )
1094 def _unpack_name(self, i: int) -> bytes:
1095 offset = self._name_table_offset + i * 20
1096 return self._contents[offset : offset + 20]
1098 def _unpack_offset(self, i: int) -> int:
1099 offset_pos = self._pack_offset_table_offset + i * 4
1100 offset = unpack_from(">L", self._contents, offset_pos)[0]
1101 assert isinstance(offset, int)
1102 if offset & (2**31):
1103 large_offset_pos = (
1104 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
1105 )
1106 offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
1107 assert isinstance(offset, int)
1108 return offset
1110 def _unpack_crc32_checksum(self, i: int) -> int:
1111 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
1112 assert isinstance(result, int)
1113 return result
1116class PackIndex3(FilePackIndex):
1117 """Version 3 Pack Index file.
1119 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
1120 """
1122 def __init__(
1123 self,
1124 filename: str | os.PathLike[str],
1125 file: IO[bytes] | _GitFile | None = None,
1126 contents: bytes | None = None,
1127 size: int | None = None,
1128 ) -> None:
1129 """Initialize a version 3 pack index.
1131 Args:
1132 filename: Path to the index file
1133 file: Optional file object
1134 contents: Optional mmap'd contents
1135 size: Optional size of the index
1136 """
1137 super().__init__(filename, file, contents, size)
1138 if self._contents[:4] != b"\377tOc":
1139 raise AssertionError("Not a v3 pack index file")
1140 (self.version,) = unpack_from(b">L", self._contents, 4)
1141 if self.version != 3:
1142 raise AssertionError(f"Version was {self.version}")
1144 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1145 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
1146 if self.hash_algorithm == 1:
1147 self.hash_size = 20 # SHA-1
1148 elif self.hash_algorithm == 2:
1149 self.hash_size = 32 # SHA-256
1150 else:
1151 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
1153 # Read length of shortened object names
1154 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
1156 # Calculate offsets based on variable hash size
1157 self._fan_out_table = self._read_fan_out_table(
1158 16
1159 ) # After header (4 + 4 + 4 + 4)
1160 self._name_table_offset = 16 + 0x100 * 4
1161 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
1162 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
1163 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
1164 self
1165 )
1167 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, int]:
1168 return (
1169 RawObjectID(self._unpack_name(i)),
1170 self._unpack_offset(i),
1171 self._unpack_crc32_checksum(i),
1172 )
1174 def _unpack_name(self, i: int) -> bytes:
1175 offset = self._name_table_offset + i * self.hash_size
1176 return self._contents[offset : offset + self.hash_size]
1178 def _unpack_offset(self, i: int) -> int:
1179 offset_pos = self._pack_offset_table_offset + i * 4
1180 offset = unpack_from(">L", self._contents, offset_pos)[0]
1181 assert isinstance(offset, int)
1182 if offset & (2**31):
1183 large_offset_pos = (
1184 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
1185 )
1186 offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
1187 assert isinstance(offset, int)
1188 return offset
1190 def _unpack_crc32_checksum(self, i: int) -> int:
1191 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
1192 assert isinstance(result, int)
1193 return result
1196def read_pack_header(read: Callable[[int], bytes]) -> tuple[int, int]:
1197 """Read the header of a pack file.
1199 Args:
1200 read: Read function
 1201 Returns: Tuple of (pack version, number of objects).
 1202 Raises: AssertionError if the header is missing or not a valid pack header.
1203 """
1204 header = read(12)
1205 if not header:
1206 raise AssertionError("file too short to contain pack")
1207 if header[:4] != b"PACK":
1208 raise AssertionError(f"Invalid pack header {header!r}")
1209 (version,) = unpack_from(b">L", header, 4)
1210 if version not in (2, 3):
1211 raise AssertionError(f"Version was {version}")
1212 (num_objects,) = unpack_from(b">L", header, 8)
1213 return (version, num_objects)
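# Editor's illustration (not part of dulwich): the 12-byte header is the
# b"PACK" magic followed by a big-endian version and object count.
def _example_read_pack_header():
    from io import BytesIO
    header = b"PACK" + struct.pack(">LL", 2, 0)
    assert read_pack_header(BytesIO(header).read) == (2, 0)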
1216def chunks_length(chunks: bytes | Iterable[bytes]) -> int:
1217 """Get the total length of a sequence of chunks.
1219 Args:
1220 chunks: Either a single bytes object or an iterable of bytes
1221 Returns: Total length in bytes
1222 """
1223 if isinstance(chunks, bytes):
1224 return len(chunks)
1225 else:
1226 return sum(map(len, chunks))
1229def unpack_object(
1230 read_all: Callable[[int], bytes],
1231 read_some: Callable[[int], bytes] | None = None,
1232 compute_crc32: bool = False,
1233 include_comp: bool = False,
1234 zlib_bufsize: int = _ZLIB_BUFSIZE,
1235) -> tuple[UnpackedObject, bytes]:
1236 """Unpack a Git object.
1238 Args:
1239 read_all: Read function that blocks until the number of requested
1240 bytes are read.
1241 read_some: Read function that returns at least one byte, but may not
1242 return the number of bytes requested.
1243 compute_crc32: If True, compute the CRC32 of the compressed data. If
1244 False, the returned CRC32 will be None.
1245 include_comp: If True, include compressed data in the result.
1246 zlib_bufsize: An optional buffer size for zlib operations.
1247 Returns: A tuple of (unpacked, unused), where unused is the unused data
 1248 leftover from decompression, and unpacked is an UnpackedObject with
1249 the following attrs set:
1251 * obj_chunks (for non-delta types)
1252 * pack_type_num
1253 * delta_base (for delta types)
1254 * comp_chunks (if include_comp is True)
1255 * decomp_chunks
1256 * decomp_len
1257 * crc32 (if compute_crc32 is True)
1258 """
1259 if read_some is None:
1260 read_some = read_all
1261 if compute_crc32:
1262 crc32 = 0
1263 else:
1264 crc32 = None
1266 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
1267 type_num = (raw[0] >> 4) & 0x07
1268 size = raw[0] & 0x0F
1269 for i, byte in enumerate(raw[1:]):
1270 size += (byte & 0x7F) << ((i * 7) + 4)
1272 delta_base: int | bytes | None
1273 raw_base = len(raw)
1274 if type_num == OFS_DELTA:
1275 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
1276 raw_base += len(raw)
1277 if raw[-1] & 0x80:
1278 raise AssertionError
1279 delta_base_offset = raw[0] & 0x7F
1280 for byte in raw[1:]:
1281 delta_base_offset += 1
1282 delta_base_offset <<= 7
1283 delta_base_offset += byte & 0x7F
1284 delta_base = delta_base_offset
1285 elif type_num == REF_DELTA:
1286 delta_base_obj = read_all(20)
1287 if crc32 is not None:
1288 crc32 = binascii.crc32(delta_base_obj, crc32)
1289 delta_base = delta_base_obj
1290 raw_base += 20
1291 else:
1292 delta_base = None
1294 unpacked = UnpackedObject(
1295 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1296 )
1297 unused = read_zlib_chunks(
1298 read_some,
1299 unpacked,
1300 buffer_size=zlib_bufsize,
1301 include_comp=include_comp,
1302 )
1303 return unpacked, unused
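# Editor's illustration (not part of dulwich): a single undeltified blob
# entry. The first byte 0x32 packs type 3 (blob) into bits 4-6 and the size 2
# into the low nibble; the 20 zero bytes stand in for the pack trailer that
# always follows the last entry.
def _example_unpack_object():
    from io import BytesIO
    entry = bytes([0x32]) + zlib.compress(b"hi") + b"\x00" * 20
    unpacked, unused = unpack_object(BytesIO(entry).read)
    assert unpacked.pack_type_num == 3
    assert b"".join(unpacked.decomp_chunks) == b"hi"
    assert unused == b"\x00" * 20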
1306def _compute_object_size(value: tuple[int, Any]) -> int:
1307 """Compute the size of a unresolved object for use with LRUSizeCache."""
1308 (num, obj) = value
1309 if num in DELTA_TYPES:
1310 return chunks_length(obj[1])
1311 return chunks_length(obj)
1314class PackStreamReader:
1315 """Class to read a pack stream.
1317 The pack is read from a ReceivableProtocol using read() or recv() as
1318 appropriate.
1319 """
1321 def __init__(
1322 self,
1323 read_all: Callable[[int], bytes],
1324 read_some: Callable[[int], bytes] | None = None,
1325 zlib_bufsize: int = _ZLIB_BUFSIZE,
1326 ) -> None:
1327 """Initialize pack stream reader.
1329 Args:
1330 read_all: Function to read all requested bytes
1331 read_some: Function to read some bytes (optional)
1332 zlib_bufsize: Buffer size for zlib decompression
1333 """
1334 self.read_all = read_all
1335 if read_some is None:
1336 self.read_some = read_all
1337 else:
1338 self.read_some = read_some
1339 self.sha = sha1()
1340 self._offset = 0
1341 self._rbuf = BytesIO()
1342 # trailer is a deque to avoid memory allocation on small reads
1343 self._trailer: deque[int] = deque()
1344 self._zlib_bufsize = zlib_bufsize
1346 def _read(self, read: Callable[[int], bytes], size: int) -> bytes:
1347 """Read up to size bytes using the given callback.
1349 As a side effect, update the verifier's hash (excluding the last 20
1350 bytes read).
1352 Args:
1353 read: The read callback to read from.
1354 size: The maximum number of bytes to read; the particular
1355 behavior is callback-specific.
1356 Returns: Bytes read
1357 """
1358 data = read(size)
1360 # maintain a trailer of the last 20 bytes we've read
1361 n = len(data)
1362 self._offset += n
1363 tn = len(self._trailer)
1364 if n >= 20:
1365 to_pop = tn
1366 to_add = 20
1367 else:
1368 to_pop = max(n + tn - 20, 0)
1369 to_add = n
1370 self.sha.update(
1371 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1372 )
1373 self._trailer.extend(data[-to_add:])
1375 # hash everything but the trailer
1376 self.sha.update(data[:-to_add])
1377 return data
1379 def _buf_len(self) -> int:
1380 buf = self._rbuf
1381 start = buf.tell()
1382 buf.seek(0, SEEK_END)
1383 end = buf.tell()
1384 buf.seek(start)
1385 return end - start
1387 @property
1388 def offset(self) -> int:
1389 """Return current offset in the stream."""
1390 return self._offset - self._buf_len()
1392 def read(self, size: int) -> bytes:
1393 """Read, blocking until size bytes are read."""
1394 buf_len = self._buf_len()
1395 if buf_len >= size:
1396 return self._rbuf.read(size)
1397 buf_data = self._rbuf.read()
1398 self._rbuf = BytesIO()
1399 return buf_data + self._read(self.read_all, size - buf_len)
1401 def recv(self, size: int) -> bytes:
1402 """Read up to size bytes, blocking until one byte is read."""
1403 buf_len = self._buf_len()
1404 if buf_len:
1405 data = self._rbuf.read(size)
1406 if size >= buf_len:
1407 self._rbuf = BytesIO()
1408 return data
1409 return self._read(self.read_some, size)
1411 def __len__(self) -> int:
1412 """Return the number of objects in this pack."""
1413 return self._num_objects
1415 def read_objects(self, compute_crc32: bool = False) -> Iterator[UnpackedObject]:
1416 """Read the objects in this pack file.
1418 Args:
1419 compute_crc32: If True, compute the CRC32 of the compressed
1420 data. If False, the returned CRC32 will be None.
1421 Returns: Iterator over UnpackedObjects with the following members set:
1422 offset
1423 obj_type_num
1424 obj_chunks (for non-delta types)
1425 delta_base (for delta types)
1426 decomp_chunks
1427 decomp_len
1428 crc32 (if compute_crc32 is True)
1430 Raises:
1431 ChecksumMismatch: if the checksum of the pack contents does not
1432 match the checksum in the pack trailer.
1433 zlib.error: if an error occurred during zlib decompression.
1434 IOError: if an error occurred writing to the output file.
1435 """
1436 _pack_version, self._num_objects = read_pack_header(self.read)
1438 for _ in range(self._num_objects):
1439 offset = self.offset
1440 unpacked, unused = unpack_object(
1441 self.read,
1442 read_some=self.recv,
1443 compute_crc32=compute_crc32,
1444 zlib_bufsize=self._zlib_bufsize,
1445 )
1446 unpacked.offset = offset
1448 # prepend any unused data to current read buffer
1449 buf = BytesIO()
1450 buf.write(unused)
1451 buf.write(self._rbuf.read())
1452 buf.seek(0)
1453 self._rbuf = buf
1455 yield unpacked
1457 if self._buf_len() < 20:
1458 # If the read buffer is full, then the last read() got the whole
1459 # trailer off the wire. If not, it means there is still some of the
1460 # trailer to read. We need to read() all 20 bytes; N come from the
1461 # read buffer and (20 - N) come from the wire.
1462 self.read(20)
1464 pack_sha = bytearray(self._trailer)
1465 if pack_sha != self.sha.digest():
1466 raise ChecksumMismatch(
1467 sha_to_hex(RawObjectID(bytes(pack_sha))), self.sha.hexdigest()
1468 )
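# Editor's illustration (not part of dulwich): streaming an empty pack (just a
# header plus the SHA-1 trailer over that header) through PackStreamReader
# verifies the trailer and yields no objects.
def _example_pack_stream_reader():
    from io import BytesIO
    header = b"PACK" + struct.pack(">LL", 2, 0)
    reader = PackStreamReader(BytesIO(header + sha1(header).digest()).read)
    assert list(reader.read_objects()) == []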
1471class PackStreamCopier(PackStreamReader):
1472 """Class to verify a pack stream as it is being read.
1474 The pack is read from a ReceivableProtocol using read() or recv() as
1475 appropriate and written out to the given file-like object.
1476 """
1478 def __init__(
1479 self,
1480 read_all: Callable[[int], bytes],
1481 read_some: Callable[[int], bytes] | None,
1482 outfile: IO[bytes],
1483 delta_iter: "DeltaChainIterator[UnpackedObject] | None" = None,
1484 ) -> None:
1485 """Initialize the copier.
1487 Args:
1488 read_all: Read function that blocks until the number of
1489 requested bytes are read.
1490 read_some: Read function that returns at least one byte, but may
1491 not return the number of bytes requested.
1492 outfile: File-like object to write output through.
1493 delta_iter: Optional DeltaChainIterator to record deltas as we
1494 read them.
1495 """
1496 super().__init__(read_all, read_some=read_some)
1497 self.outfile = outfile
1498 self._delta_iter = delta_iter
1500 def _read(self, read: Callable[[int], bytes], size: int) -> bytes:
1501 """Read data from the read callback and write it to the file."""
1502 data = super()._read(read, size)
1503 self.outfile.write(data)
1504 return data
1506 def verify(self, progress: Callable[..., None] | None = None) -> None:
1507 """Verify a pack stream and write it to the output file.
1509 See PackStreamReader.iterobjects for a list of exceptions this may
1510 throw.
1511 """
1512 i = 0 # default count of entries if read_objects() is empty
1513 for i, unpacked in enumerate(self.read_objects()):
1514 if self._delta_iter:
1515 self._delta_iter.record(unpacked)
1516 if progress is not None:
1517 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1518 if progress is not None:
1519 progress(f"copied {i} pack entries\n".encode("ascii"))
1522def obj_sha(type: int, chunks: bytes | Iterable[bytes]) -> bytes:
1523 """Compute the SHA for a numeric type and object chunks."""
1524 sha = sha1()
1525 sha.update(object_header(type, chunks_length(chunks)))
1526 if isinstance(chunks, bytes):
1527 sha.update(chunks)
1528 else:
1529 for chunk in chunks:
1530 sha.update(chunk)
1531 return sha.digest()
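# Editor's illustration (not part of dulwich): obj_sha hashes the same
# "<type> <length>\0" header that loose objects use, so it matches a manual
# SHA-1 over the canonical form (3 is the blob type number).
def _example_obj_sha():
    assert obj_sha(3, [b"hello"]) == sha1(b"blob 5\x00hello").digest()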
1534def compute_file_sha(
1535 f: IO[bytes], start_ofs: int = 0, end_ofs: int = 0, buffer_size: int = 1 << 16
1536) -> "HashObject":
1537 """Hash a portion of a file into a new SHA.
1539 Args:
1540 f: A file-like object to read from that supports seek().
1541 start_ofs: The offset in the file to start reading at.
1542 end_ofs: The offset in the file to end reading at, relative to the
1543 end of the file.
1544 buffer_size: A buffer size for reading.
1545 Returns: A new SHA object updated with data read from the file.
1546 """
1547 sha = sha1()
1548 f.seek(0, SEEK_END)
1549 length = f.tell()
1550 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1551 raise AssertionError(
1552 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1553 )
1554 todo = length + end_ofs - start_ofs
1555 f.seek(start_ofs)
1556 while todo:
1557 data = f.read(min(todo, buffer_size))
1558 sha.update(data)
1559 todo -= len(data)
1560 return sha
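# Editor's illustration (not part of dulwich): with end_ofs=-20 the hash skips
# a 20-byte trailer, which is exactly how PackData.calculate_checksum below
# checksums a pack without its own trailing digest.
def _example_compute_file_sha():
    from io import BytesIO
    body = b"pack contents"
    digest = compute_file_sha(BytesIO(body + b"\x00" * 20), end_ofs=-20).digest()
    assert digest == sha1(body).digest()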
1563class PackData:
1564 """The data contained in a packfile.
1566 Pack files can be accessed both sequentially for exploding a pack, and
1567 directly with the help of an index to retrieve a specific object.
1569 The objects within are either complete or a delta against another.
1571 The header is variable length. If the MSB of each byte is set then it
1572 indicates that the subsequent byte is still part of the header.
 1573 For the first byte the next three MS bits are the type, which tells you the type
 1574 of object, and whether it is a delta. The low 4 bits are the lowest bits of the
 1575 size. For each subsequent byte the low 7 bits are the next MS bits of the
1576 size, i.e. the last byte of the header contains the MS bits of the size.
1578 For the complete objects the data is stored as zlib deflated data.
1579 The size in the header is the uncompressed object size, so to uncompress
1580 you need to just keep feeding data to zlib until you get an object back,
1581 or it errors on bad data. This is done here by just giving the complete
1582 buffer from the start of the deflated object on. This is bad, but until I
1583 get mmap sorted out it will have to do.
1585 Currently there are no integrity checks done. Also no attempt is made to
1586 try and detect the delta case, or a request for an object at the wrong
1587 position. It will all just throw a zlib or KeyError.
1588 """
1590 def __init__(
1591 self,
1592 filename: str | os.PathLike[str],
1593 file: IO[bytes] | None = None,
1594 size: int | None = None,
1595 *,
1596 delta_window_size: int | None = None,
1597 window_memory: int | None = None,
1598 delta_cache_size: int | None = None,
1599 depth: int | None = None,
1600 threads: int | None = None,
1601 big_file_threshold: int | None = None,
1602 ) -> None:
1603 """Create a PackData object representing the pack in the given filename.
1605 The file must exist and stay readable until the object is disposed of.
1606 It must also stay the same size. It will be mapped whenever needed.
1608 Currently there is a restriction on the size of the pack as the python
1609 mmap implementation is flawed.
1610 """
1611 self._filename = filename
1612 self._size = size
1613 self._header_size = 12
1614 self.delta_window_size = delta_window_size
1615 self.window_memory = window_memory
1616 self.delta_cache_size = delta_cache_size
1617 self.depth = depth
1618 self.threads = threads
1619 self.big_file_threshold = big_file_threshold
1620 self._file: IO[bytes]
1622 if file is None:
1623 self._file = GitFile(self._filename, "rb")
1624 else:
1625 self._file = file
1626 (_version, self._num_objects) = read_pack_header(self._file.read)
1628 # Use delta_cache_size config if available, otherwise default
1629 cache_size = delta_cache_size or (1024 * 1024 * 20)
1630 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1631 cache_size, compute_size=_compute_object_size
1632 )
1634 @property
1635 def filename(self) -> str:
1636 """Get the filename of the pack file.
1638 Returns:
1639 Base filename without directory path
1640 """
1641 return os.path.basename(self._filename)
1643 @property
1644 def path(self) -> str | os.PathLike[str]:
1645 """Get the full path of the pack file.
1647 Returns:
1648 Full path to the pack file
1649 """
1650 return self._filename
1652 @classmethod
1653 def from_file(cls, file: IO[bytes], size: int | None = None) -> "PackData":
1654 """Create a PackData object from an open file.
1656 Args:
1657 file: Open file object
1658 size: Optional file size
1660 Returns:
1661 PackData instance
1662 """
1663 return cls(str(file), file=file, size=size)
1665 @classmethod
1666 def from_path(cls, path: str | os.PathLike[str]) -> "PackData":
1667 """Create a PackData object from a file path.
1669 Args:
1670 path: Path to the pack file
1672 Returns:
1673 PackData instance
1674 """
1675 return cls(filename=path)
1677 def close(self) -> None:
1678 """Close the underlying pack file."""
1679 self._file.close()
1681 def __enter__(self) -> "PackData":
1682 """Enter context manager."""
1683 return self
1685 def __exit__(
1686 self,
1687 exc_type: type | None,
1688 exc_val: BaseException | None,
1689 exc_tb: TracebackType | None,
1690 ) -> None:
1691 """Exit context manager."""
1692 self.close()
1694 def __eq__(self, other: object) -> bool:
1695 """Check equality with another object."""
1696 if isinstance(other, PackData):
1697 return self.get_stored_checksum() == other.get_stored_checksum()
1698 return False
1700 def _get_size(self) -> int:
1701 if self._size is not None:
1702 return self._size
1703 self._size = os.path.getsize(self._filename)
1704 if self._size < self._header_size:
1705 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1706 raise AssertionError(errmsg)
1707 return self._size
1709 def __len__(self) -> int:
1710 """Returns the number of objects in this pack."""
1711 return self._num_objects
1713 def calculate_checksum(self) -> bytes:
1714 """Calculate the checksum for this pack.
1716 Returns: 20-byte binary SHA1 digest
1717 """
1718 return compute_file_sha(self._file, end_ofs=-20).digest()
1720 def iter_unpacked(self, *, include_comp: bool = False) -> Iterator[UnpackedObject]:
1721 """Iterate over unpacked objects in the pack."""
1722 self._file.seek(self._header_size)
1724 if self._num_objects is None:
1725 return
1727 for _ in range(self._num_objects):
1728 offset = self._file.tell()
1729 unpacked, unused = unpack_object(
1730 self._file.read, compute_crc32=False, include_comp=include_comp
1731 )
1732 unpacked.offset = offset
1733 yield unpacked
1734 # Back up over unused data.
1735 self._file.seek(-len(unused), SEEK_CUR)
1737 def iterentries(
1738 self,
1739 progress: Callable[[int, int], None] | None = None,
1740 resolve_ext_ref: ResolveExtRefFn | None = None,
1741 ) -> Iterator[PackIndexEntry]:
1742 """Yield entries summarizing the contents of this pack.
1744 Args:
1745 progress: Progress function, called with current and total
1746 object count.
1747 resolve_ext_ref: Optional function to resolve external references
1748 Returns: iterator of tuples with (sha, offset, crc32)
1749 """
1750 num_objects = self._num_objects
1751 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1752 for i, result in enumerate(indexer):
1753 if progress is not None:
1754 progress(i, num_objects)
1755 yield result
1757 def sorted_entries(
1758 self,
1759 progress: ProgressFn | None = None,
1760 resolve_ext_ref: ResolveExtRefFn | None = None,
1761 ) -> list[tuple[RawObjectID, int, int]]:
1762 """Return entries in this pack, sorted by SHA.
1764 Args:
1765 progress: Progress function, called with current and total
1766 object count
1767 resolve_ext_ref: Optional function to resolve external references
1768 Returns: Iterator of tuples with (sha, offset, crc32)
1769 """
1770 return sorted(
1771 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref) # type: ignore
1772 )
1774 def create_index_v1(
1775 self,
1776 filename: str,
1777 progress: Callable[..., None] | None = None,
1778 resolve_ext_ref: ResolveExtRefFn | None = None,
1779 ) -> bytes:
1780 """Create a version 1 file for this data file.
1782 Args:
1783 filename: Index filename.
1784 progress: Progress report function
1785 resolve_ext_ref: Optional function to resolve external references
1786 Returns: Checksum of index file
1787 """
1788 entries = self.sorted_entries(
1789 progress=progress, resolve_ext_ref=resolve_ext_ref
1790 )
1791 checksum = self.calculate_checksum()
1792 with GitFile(filename, "wb") as f:
1793 write_pack_index_v1(
1794 f,
1795 entries,
1796 checksum,
1797 )
1798 return checksum
1800 def create_index_v2(
1801 self,
1802 filename: str,
1803 progress: Callable[..., None] | None = None,
1804 resolve_ext_ref: ResolveExtRefFn | None = None,
1805 ) -> bytes:
1806 """Create a version 2 index file for this data file.
1808 Args:
1809 filename: Index filename.
1810 progress: Progress report function
1811 resolve_ext_ref: Optional function to resolve external references
1812 Returns: Checksum of index file
1813 """
1814 entries = self.sorted_entries(
1815 progress=progress, resolve_ext_ref=resolve_ext_ref
1816 )
1817 with GitFile(filename, "wb") as f:
1818 return write_pack_index_v2(f, entries, self.calculate_checksum())
1820 def create_index_v3(
1821 self,
1822 filename: str,
1823 progress: Callable[..., None] | None = None,
1824 resolve_ext_ref: ResolveExtRefFn | None = None,
1825 hash_algorithm: int = 1,
1826 ) -> bytes:
1827 """Create a version 3 index file for this data file.
1829 Args:
1830 filename: Index filename.
1831 progress: Progress report function
1832 resolve_ext_ref: Function to resolve external references
1833 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1834 Returns: Checksum of index file
1835 """
1836 entries = self.sorted_entries(
1837 progress=progress, resolve_ext_ref=resolve_ext_ref
1838 )
1839 with GitFile(filename, "wb") as f:
1840 return write_pack_index_v3(
1841 f, entries, self.calculate_checksum(), hash_algorithm
1842 )
1844 def create_index(
1845 self,
1846 filename: str,
1847 progress: Callable[..., None] | None = None,
1848 version: int = 2,
1849 resolve_ext_ref: ResolveExtRefFn | None = None,
1850 hash_algorithm: int = 1,
1851 ) -> bytes:
1852 """Create an index file for this data file.
1854 Args:
1855 filename: Index filename.
1856 progress: Progress report function
1857 version: Index version (1, 2, or 3)
1858 resolve_ext_ref: Function to resolve external references
1859 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1860 Returns: Checksum of index file
1861 """
1862 if version == 1:
1863 return self.create_index_v1(
1864 filename, progress, resolve_ext_ref=resolve_ext_ref
1865 )
1866 elif version == 2:
1867 return self.create_index_v2(
1868 filename, progress, resolve_ext_ref=resolve_ext_ref
1869 )
1870 elif version == 3:
1871 return self.create_index_v3(
1872 filename,
1873 progress,
1874 resolve_ext_ref=resolve_ext_ref,
1875 hash_algorithm=hash_algorithm,
1876 )
1877 else:
1878 raise ValueError(f"unknown index format {version}")
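# Usage sketch (hypothetical pack filename; not part of the original source):
#     data = PackData("pack-1234.pack")
#     checksum = data.create_index("pack-1234.idx", version=2)
#     # ``checksum`` is the trailing SHA-1 of the newly written index file.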
1880 def get_stored_checksum(self) -> bytes:
1881 """Return the expected checksum stored in this pack."""
1882 self._file.seek(-20, SEEK_END)
1883 return self._file.read(20)
1885 def check(self) -> None:
1886 """Check the consistency of this pack."""
1887 actual = self.calculate_checksum()
1888 stored = self.get_stored_checksum()
1889 if actual != stored:
1890 raise ChecksumMismatch(stored, actual)
1892 def get_unpacked_object_at(
1893 self, offset: int, *, include_comp: bool = False
1894 ) -> UnpackedObject:
1895 """Given offset in the packfile return a UnpackedObject."""
1896 assert offset >= self._header_size
1897 self._file.seek(offset)
1898 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1899 unpacked.offset = offset
1900 return unpacked
1902 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1903 """Given an offset in to the packfile return the object that is there.
1905 Using the associated index the location of an object can be looked up,
1906 and then the packfile can be asked directly for that object using this
1907 function.
1908 """
1909 try:
1910 return self._offset_cache[offset]
1911 except KeyError:
1912 pass
1913 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1914 return (unpacked.pack_type_num, unpacked._obj())
1917T = TypeVar("T")
1920class DeltaChainIterator(Generic[T]):
1921 """Abstract iterator over pack data based on delta chains.
1923 Each object in the pack is guaranteed to be inflated exactly once,
1924 regardless of how many objects reference it as a delta base. As a result,
1925 memory usage is proportional to the length of the longest delta chain.
1927 Subclasses can override _result to define the result type of the iterator.
1928 By default, results are UnpackedObjects with the following members set:
1930 * offset
1931 * obj_type_num
1932 * obj_chunks
1933 * pack_type_num
1934 * delta_base (for delta types)
1935 * comp_chunks (if _include_comp is True)
1936 * decomp_chunks
1937 * decomp_len
1938 * crc32 (if _compute_crc32 is True)
1939 """
1941 _compute_crc32 = False
1942 _include_comp = False
1944 def __init__(
1945 self,
1946 file_obj: IO[bytes] | None,
1947 *,
1948 resolve_ext_ref: ResolveExtRefFn | None = None,
1949 ) -> None:
1950 """Initialize DeltaChainIterator.
1952 Args:
1953 file_obj: File object to read pack data from
1954 resolve_ext_ref: Optional function to resolve external references
1955 """
1956 self._file = file_obj
1957 self._resolve_ext_ref = resolve_ext_ref
1958 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1959 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1960 self._full_ofs: list[tuple[int, int]] = []
1961 self._ext_refs: list[RawObjectID] = []
1963 @classmethod
1964 def for_pack_data(
1965 cls, pack_data: PackData, resolve_ext_ref: ResolveExtRefFn | None = None
1966 ) -> "DeltaChainIterator[T]":
1967 """Create a DeltaChainIterator from pack data.
1969 Args:
1970 pack_data: PackData object to iterate
1971 resolve_ext_ref: Optional function to resolve external refs
1973 Returns:
1974 DeltaChainIterator instance
1975 """
1976 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1977 walker.set_pack_data(pack_data)
1978 for unpacked in pack_data.iter_unpacked(include_comp=False):
1979 walker.record(unpacked)
1980 return walker
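# Minimal sketch of the classmethod above (UnpackedObjectIterator is the
# concrete subclass defined later in this module; the pack path is hypothetical):
#     data = PackData("pack-1234.pack")
#     for unpacked in UnpackedObjectIterator.for_pack_data(data):
#         ...  # each object is inflated exactly once; deltas are resolved against their bases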
1982 @classmethod
1983 def for_pack_subset(
1984 cls,
1985 pack: "Pack",
1986 shas: Iterable[ObjectID | RawObjectID],
1987 *,
1988 allow_missing: bool = False,
1989 resolve_ext_ref: ResolveExtRefFn | None = None,
1990 ) -> "DeltaChainIterator[T]":
1991 """Create a DeltaChainIterator for a subset of objects.
1993 Args:
1994 pack: Pack object containing the data
1995 shas: Iterable of object SHAs to include
1996 allow_missing: If True, skip missing objects
1997 resolve_ext_ref: Optional function to resolve external refs
1999 Returns:
2000 DeltaChainIterator instance
2001 """
2002 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
2003 walker.set_pack_data(pack.data)
2004 todo = set()
2005 for sha in shas:
2006 try:
2007 off = pack.index.object_offset(sha)
2008 except KeyError:
2009 if not allow_missing:
2010 raise
2011 else:
2012 todo.add(off)
2013 done = set()
2014 while todo:
2015 off = todo.pop()
2016 unpacked = pack.data.get_unpacked_object_at(off)
2017 walker.record(unpacked)
2018 done.add(off)
2019 base_ofs = None
2020 if unpacked.pack_type_num == OFS_DELTA:
2021 assert unpacked.offset is not None
2022 assert unpacked.delta_base is not None
2023 assert isinstance(unpacked.delta_base, int)
2024 base_ofs = unpacked.offset - unpacked.delta_base
2025 elif unpacked.pack_type_num == REF_DELTA:
2026 with suppress(KeyError):
2027 assert isinstance(unpacked.delta_base, bytes)
2028 base_ofs = pack.index.object_index(RawObjectID(unpacked.delta_base))
2029 if base_ofs is not None and base_ofs not in done:
2030 todo.add(base_ofs)
2031 return walker
2033 def record(self, unpacked: UnpackedObject) -> None:
2034 """Record an unpacked object for later processing.
2036 Args:
2037 unpacked: UnpackedObject to record
2038 """
2039 type_num = unpacked.pack_type_num
2040 offset = unpacked.offset
2041 assert offset is not None
2042 if type_num == OFS_DELTA:
2043 assert unpacked.delta_base is not None
2044 assert isinstance(unpacked.delta_base, int)
2045 base_offset = offset - unpacked.delta_base
2046 self._pending_ofs[base_offset].append(offset)
2047 elif type_num == REF_DELTA:
2048 assert isinstance(unpacked.delta_base, bytes)
2049 self._pending_ref[unpacked.delta_base].append(offset)
2050 else:
2051 self._full_ofs.append((offset, type_num))
2053 def set_pack_data(self, pack_data: PackData) -> None:
2054 """Set the pack data for iteration.
2056 Args:
2057 pack_data: PackData object to use
2058 """
2059 self._file = pack_data._file
2061 def _walk_all_chains(self) -> Iterator[T]:
2062 for offset, type_num in self._full_ofs:
2063 yield from self._follow_chain(offset, type_num, None)
2064 yield from self._walk_ref_chains()
2065 assert not self._pending_ofs, repr(self._pending_ofs)
2067 def _ensure_no_pending(self) -> None:
2068 if self._pending_ref:
2069 raise UnresolvedDeltas(
2070 [sha_to_hex(RawObjectID(s)) for s in self._pending_ref]
2071 )
2073 def _walk_ref_chains(self) -> Iterator[T]:
2074 if not self._resolve_ext_ref:
2075 self._ensure_no_pending()
2076 return
2078 for base_sha, pending in sorted(self._pending_ref.items()):
2079 if base_sha not in self._pending_ref:
2080 continue
2081 try:
2082 type_num, chunks = self._resolve_ext_ref(base_sha)
2083 except KeyError:
2084 # Not an external ref, but may depend on one. Either it will
2085 # get popped via a _follow_chain call, or we will raise an
2086 # error below.
2087 continue
2088 self._ext_refs.append(RawObjectID(base_sha))
2089 self._pending_ref.pop(base_sha)
2090 for new_offset in pending:
2091 yield from self._follow_chain(new_offset, type_num, chunks) # type: ignore[arg-type]
2093 self._ensure_no_pending()
2095 def _result(self, unpacked: UnpackedObject) -> T:
2096 raise NotImplementedError
2098 def _resolve_object(
2099 self, offset: int, obj_type_num: int, base_chunks: list[bytes] | None
2100 ) -> UnpackedObject:
2101 assert self._file is not None
2102 self._file.seek(offset)
2103 unpacked, _ = unpack_object(
2104 self._file.read,
2105 include_comp=self._include_comp,
2106 compute_crc32=self._compute_crc32,
2107 )
2108 unpacked.offset = offset
2109 if base_chunks is None:
2110 assert unpacked.pack_type_num == obj_type_num
2111 else:
2112 assert unpacked.pack_type_num in DELTA_TYPES
2113 unpacked.obj_type_num = obj_type_num
2114 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
2115 return unpacked
2117 def _follow_chain(
2118 self, offset: int, obj_type_num: int, base_chunks: list[bytes] | None
2119 ) -> Iterator[T]:
2120 # Unlike PackData.get_object_at, there is no need to cache offsets as
2121 # this approach by design inflates each object exactly once.
2122 todo = [(offset, obj_type_num, base_chunks)]
2123 while todo:
2124 (offset, obj_type_num, base_chunks) = todo.pop()
2125 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
2126 yield self._result(unpacked)
2128 assert unpacked.offset is not None
2129 unblocked = chain(
2130 self._pending_ofs.pop(unpacked.offset, []),
2131 self._pending_ref.pop(unpacked.sha(), []),
2132 )
2133 todo.extend(
2134 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
2135 for new_offset in unblocked
2136 )
2138 def __iter__(self) -> Iterator[T]:
2139 """Iterate over objects in the pack."""
2140 return self._walk_all_chains()
2142 def ext_refs(self) -> list[RawObjectID]:
2143 """Return external references."""
2144 return self._ext_refs
2147class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
2148 """Delta chain iterator that yield unpacked objects."""
2150 def _result(self, unpacked: UnpackedObject) -> UnpackedObject:
2151 """Return the unpacked object.
2153 Args:
2154 unpacked: The unpacked object
2156 Returns:
2157 The unpacked object unchanged
2158 """
2159 return unpacked
2162class PackIndexer(DeltaChainIterator[PackIndexEntry]):
2163 """Delta chain iterator that yields index entries."""
2165 _compute_crc32 = True
2167 def _result(self, unpacked: UnpackedObject) -> PackIndexEntry:
2168 """Convert unpacked object to pack index entry.
2170 Args:
2171 unpacked: The unpacked object
2173 Returns:
2174 Tuple of (sha, offset, crc32) for index entry
2175 """
2176 assert unpacked.offset is not None
2177 return unpacked.sha(), unpacked.offset, unpacked.crc32
2180class PackInflater(DeltaChainIterator[ShaFile]):
2181 """Delta chain iterator that yields ShaFile objects."""
2183 def _result(self, unpacked: UnpackedObject) -> ShaFile:
2184 """Convert unpacked object to ShaFile.
2186 Args:
2187 unpacked: The unpacked object
2189 Returns:
2190 ShaFile object from the unpacked data
2191 """
2192 return unpacked.sha_file()
2195class SHA1Reader(BinaryIO):
2196 """Wrapper for file-like object that remembers the SHA1 of its data."""
2198 def __init__(self, f: IO[bytes]) -> None:
2199 """Initialize SHA1Reader.
2201 Args:
2202 f: File-like object to wrap
2203 """
2204 self.f = f
2205 self.sha1 = sha1(b"")
2207 def read(self, size: int = -1) -> bytes:
2208 """Read bytes and update SHA1.
2210 Args:
2211 size: Number of bytes to read, -1 for all
2213 Returns:
2214 Bytes read from file
2215 """
2216 data = self.f.read(size)
2217 self.sha1.update(data)
2218 return data
2220 def check_sha(self, allow_empty: bool = False) -> None:
2221 """Check if the SHA1 matches the expected value.
2223 Args:
2224 allow_empty: Allow empty SHA1 hash
2226 Raises:
2227 ChecksumMismatch: If SHA1 doesn't match
2228 """
2229 stored = self.f.read(20)
2230 # If the git option index.skipHash is set, the stored checksum is all zeros
2231 if stored != self.sha1.digest() and (
2232 not allow_empty
2233 or sha_to_hex(RawObjectID(stored))
2234 != b"0000000000000000000000000000000000000000"
2235 ):
2236 raise ChecksumMismatch(
2237 self.sha1.hexdigest(), sha_to_hex(RawObjectID(stored))
2238 )
2240 def close(self) -> None:
2241 """Close the underlying file."""
2242 return self.f.close()
2244 def tell(self) -> int:
2245 """Return current file position."""
2246 return self.f.tell()
2248 # BinaryIO abstract methods
2249 def readable(self) -> bool:
2250 """Check if file is readable."""
2251 return True
2253 def writable(self) -> bool:
2254 """Check if file is writable."""
2255 return False
2257 def seekable(self) -> bool:
2258 """Check if file is seekable."""
2259 return getattr(self.f, "seekable", lambda: False)()
2261 def seek(self, offset: int, whence: int = 0) -> int:
2262 """Seek to position in file.
2264 Args:
2265 offset: Position offset
2266 whence: Reference point (0=start, 1=current, 2=end)
2268 Returns:
2269 New file position
2270 """
2271 return self.f.seek(offset, whence)
2273 def flush(self) -> None:
2274 """Flush the file buffer."""
2275 if hasattr(self.f, "flush"):
2276 self.f.flush()
2278 def readline(self, size: int = -1) -> bytes:
2279 """Read a line from the file.
2281 Args:
2282 size: Maximum bytes to read
2284 Returns:
2285 Line read from file
2286 """
2287 return self.f.readline(size)
2289 def readlines(self, hint: int = -1) -> list[bytes]:
2290 """Read all lines from the file.
2292 Args:
2293 hint: Approximate number of bytes to read
2295 Returns:
2296 List of lines
2297 """
2298 return self.f.readlines(hint)
2300 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override]
2301 """Write multiple lines to the file (not supported)."""
2302 raise UnsupportedOperation("writelines")
2304 def write(self, data: bytes, /) -> int: # type: ignore[override]
2305 """Write data to the file (not supported)."""
2306 raise UnsupportedOperation("write")
2308 def __enter__(self) -> "SHA1Reader":
2309 """Enter context manager."""
2310 return self
2312 def __exit__(
2313 self,
2314 type: type | None,
2315 value: BaseException | None,
2316 traceback: TracebackType | None,
2317 ) -> None:
2318 """Exit context manager and close file."""
2319 self.close()
2321 def __iter__(self) -> "SHA1Reader":
2322 """Return iterator for reading file lines."""
2323 return self
2325 def __next__(self) -> bytes:
2326 """Get next line from file.
2328 Returns:
2329 Next line
2331 Raises:
2332 StopIteration: When no more lines
2333 """
2334 line = self.readline()
2335 if not line:
2336 raise StopIteration
2337 return line
2339 def fileno(self) -> int:
2340 """Return file descriptor number."""
2341 return self.f.fileno()
2343 def isatty(self) -> bool:
2344 """Check if file is a terminal."""
2345 return getattr(self.f, "isatty", lambda: False)()
2347 def truncate(self, size: int | None = None) -> int:
2348 """Not supported for read-only file.
2350 Raises:
2351 UnsupportedOperation: Always raised
2352 """
2353 raise UnsupportedOperation("truncate")
2356class SHA1Writer(BinaryIO):
2357 """Wrapper for file-like object that remembers the SHA1 of its data."""
2359 def __init__(self, f: BinaryIO | IO[bytes]) -> None:
2360 """Initialize SHA1Writer.
2362 Args:
2363 f: File-like object to wrap
2364 """
2365 self.f = f
2366 self.length = 0
2367 self.sha1 = sha1(b"")
2368 self.digest: bytes | None = None
2370 def write(self, data: bytes | bytearray | memoryview, /) -> int: # type: ignore[override]
2371 """Write data and update SHA1.
2373 Args:
2374 data: Data to write
2376 Returns:
2377 Number of bytes written
2378 """
2379 self.sha1.update(data)
2380 written = self.f.write(data)
2381 self.length += written
2382 return written
2384 def write_sha(self) -> bytes:
2385 """Write the SHA1 digest to the file.
2387 Returns:
2388 The SHA1 digest bytes
2389 """
2390 sha = self.sha1.digest()
2391 assert len(sha) == 20
2392 self.f.write(sha)
2393 self.length += len(sha)
2394 return sha
2396 def close(self) -> None:
2397 """Close the pack file and finalize the SHA."""
2398 self.digest = self.write_sha()
2399 self.f.close()
2401 def offset(self) -> int:
2402 """Get the total number of bytes written.
2404 Returns:
2405 Total bytes written
2406 """
2407 return self.length
2409 def tell(self) -> int:
2410 """Return current file position."""
2411 return self.f.tell()
2413 # BinaryIO abstract methods
2414 def readable(self) -> bool:
2415 """Check if file is readable."""
2416 return False
2418 def writable(self) -> bool:
2419 """Check if file is writable."""
2420 return True
2422 def seekable(self) -> bool:
2423 """Check if file is seekable."""
2424 return getattr(self.f, "seekable", lambda: False)()
2426 def seek(self, offset: int, whence: int = 0) -> int:
2427 """Seek to position in file.
2429 Args:
2430 offset: Position offset
2431 whence: Reference point (0=start, 1=current, 2=end)
2433 Returns:
2434 New file position
2435 """
2436 return self.f.seek(offset, whence)
2438 def flush(self) -> None:
2439 """Flush the file buffer."""
2440 if hasattr(self.f, "flush"):
2441 self.f.flush()
2443 def readline(self, size: int = -1) -> bytes:
2444 """Not supported for write-only file.
2446 Raises:
2447 UnsupportedOperation: Always raised
2448 """
2449 raise UnsupportedOperation("readline")
2451 def readlines(self, hint: int = -1) -> list[bytes]:
2452 """Not supported for write-only file.
2454 Raises:
2455 UnsupportedOperation: Always raised
2456 """
2457 raise UnsupportedOperation("readlines")
2459 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override]
2460 """Write multiple lines to the file.
2462 Args:
2463 lines: Iterable of lines to write
2464 """
2465 for line in lines:
2466 self.write(line)
2468 def read(self, size: int = -1) -> bytes:
2469 """Not supported for write-only file.
2471 Raises:
2472 UnsupportedOperation: Always raised
2473 """
2474 raise UnsupportedOperation("read")
2476 def __enter__(self) -> "SHA1Writer":
2477 """Enter context manager."""
2478 return self
2480 def __exit__(
2481 self,
2482 type: type | None,
2483 value: BaseException | None,
2484 traceback: TracebackType | None,
2485 ) -> None:
2486 """Exit context manager and close file."""
2487 self.close()
2489 def __iter__(self) -> "SHA1Writer":
2490 """Return iterator."""
2491 return self
2493 def __next__(self) -> bytes:
2494 """Not supported for write-only file.
2496 Raises:
2497 UnsupportedOperation: Always raised
2498 """
2499 raise UnsupportedOperation("__next__")
2501 def fileno(self) -> int:
2502 """Return file descriptor number."""
2503 return self.f.fileno()
2505 def isatty(self) -> bool:
2506 """Check if file is a terminal."""
2507 return getattr(self.f, "isatty", lambda: False)()
2509 def truncate(self, size: int | None = None) -> int:
2510 """Not supported for write-only file.
2512 Raises:
2513 UnsupportedOperation: Always raised
2514 """
2515 raise UnsupportedOperation("truncate")
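# Illustrative sketch of the SHA1Writer wrapper (in-memory buffer, not from the
# original source): everything written through the wrapper feeds the running
# SHA-1, and write_sha() appends the 20-byte digest as a trailer.
#     buf = BytesIO()
#     w = SHA1Writer(buf)
#     w.write(b"PACK")
#     trailer = w.write_sha()  # 20 bytes; buf now ends with the digest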
2518def pack_object_header(
2519 type_num: int, delta_base: bytes | int | None, size: int
2520) -> bytearray:
2521 """Create a pack object header for the given object info.
2523 Args:
2524 type_num: Numeric type of the object.
2525 delta_base: Delta base offset or ref, or None for whole objects.
2526 size: Uncompressed object size.
2527 Returns: A header for a packed object.
2528 """
2529 header = []
2530 c = (type_num << 4) | (size & 15)
2531 size >>= 4
2532 while size:
2533 header.append(c | 0x80)
2534 c = size & 0x7F
2535 size >>= 7
2536 header.append(c)
2537 if type_num == OFS_DELTA:
2538 assert isinstance(delta_base, int)
2539 ret = [delta_base & 0x7F]
2540 delta_base >>= 7
2541 while delta_base:
2542 delta_base -= 1
2543 ret.insert(0, 0x80 | (delta_base & 0x7F))
2544 delta_base >>= 7
2545 header.extend(ret)
2546 elif type_num == REF_DELTA:
2547 assert isinstance(delta_base, bytes)
2548 assert len(delta_base) == 20
2549 header += delta_base
2550 return bytearray(header)
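# Worked example (illustrative): a non-delta object of type 3 (blob) and size
# 300 encodes to two bytes -- the low nibble 0xc plus (18 << 4) reconstructs
# 300, and the MSB of the first byte marks the continuation:
#     pack_object_header(3, None, 300) == bytearray(b"\xbc\x12")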
2553def pack_object_chunks(
2554 type: int,
2555 object: list[bytes] | tuple[bytes | int, list[bytes]],
2556 compression_level: int = -1,
2557) -> Iterator[bytes]:
2558 """Generate chunks for a pack object.
2560 Args:
2561 type: Numeric type of the object
2562 object: Object chunks to write, or a (delta_base, chunks) tuple for delta types
2563 compression_level: the zlib compression level
2564 Returns: Chunks
2565 """
2566 if type in DELTA_TYPES:
2567 if isinstance(object, tuple):
2568 delta_base, object = object
2569 else:
2570 raise TypeError("Delta types require a tuple of (delta_base, object)")
2571 else:
2572 delta_base = None
2574 # Convert object to list of bytes chunks
2575 if isinstance(object, bytes):
2576 chunks = [object]
2577 elif isinstance(object, list):
2578 chunks = object
2579 elif isinstance(object, ShaFile):
2580 chunks = object.as_raw_chunks()
2581 else:
2582 # Shouldn't reach here with proper typing
2583 raise TypeError(f"Unexpected object type: {object.__class__.__name__}")
2585 yield bytes(pack_object_header(type, delta_base, sum(map(len, chunks))))
2586 compressor = zlib.compressobj(level=compression_level)
2587 for data in chunks:
2588 yield compressor.compress(data)
2589 yield compressor.flush()
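# Usage sketch (illustrative values): serialize a small full-text blob; the
# result is the varint header followed by the zlib-compressed payload.
#     blob_chunks = [b"hello world\n"]
#     raw = b"".join(pack_object_chunks(3, blob_chunks))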
2592def write_pack_object(
2593 write: Callable[[bytes], int],
2594 type: int,
2595 object: list[bytes] | tuple[bytes | int, list[bytes]],
2596 sha: "HashObject | None" = None,
2597 compression_level: int = -1,
2598) -> int:
2599 """Write pack object to a file.
2601 Args:
2602 write: Write function to use
2603 type: Numeric type of the object
2604 object: Object chunks to write, or a (delta_base, chunks) tuple for delta types
2605 sha: Optional SHA-1 hasher to update
2606 compression_level: the zlib compression level
2607 Returns: CRC32 checksum of the written object
2608 """
2609 crc32 = 0
2610 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
2611 write(chunk)
2612 if sha is not None:
2613 sha.update(chunk)
2614 crc32 = binascii.crc32(chunk, crc32)
2615 return crc32 & 0xFFFFFFFF
2618def write_pack(
2619 filename: str,
2620 objects: Sequence[ShaFile] | Sequence[tuple[ShaFile, bytes | None]],
2621 *,
2622 deltify: bool | None = None,
2623 delta_window_size: int | None = None,
2624 compression_level: int = -1,
2625) -> tuple[bytes, bytes]:
2626 """Write a new pack data file.
2628 Args:
2629 filename: Path to the new pack file (without .pack extension)
2630 objects: Objects to write to the pack
2631 delta_window_size: Delta window size
2632 deltify: Whether to deltify pack objects
2633 compression_level: the zlib compression level
2634 Returns: Tuple with checksum of pack file and index file
2635 """
2636 with GitFile(filename + ".pack", "wb") as f:
2637 entries, data_sum = write_pack_objects(
2638 f,
2639 objects,
2640 delta_window_size=delta_window_size,
2641 deltify=deltify,
2642 compression_level=compression_level,
2643 )
2644 entries_list = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
2645 with GitFile(filename + ".idx", "wb") as f:
2646 idx_sha = write_pack_index(f, entries_list, data_sum)
2647 return data_sum, idx_sha
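# Usage sketch (hypothetical path; Blob comes from dulwich.objects):
#     from dulwich.objects import Blob
#     blob = Blob.from_string(b"hello\n")
#     data_sum, idx_sum = write_pack("/tmp/pack-new", [blob])
#     # writes /tmp/pack-new.pack and /tmp/pack-new.idx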
2650def pack_header_chunks(num_objects: int) -> Iterator[bytes]:
2651 """Yield chunks for a pack header."""
2652 yield b"PACK" # Pack header
2653 yield struct.pack(b">L", 2) # Pack version
2654 yield struct.pack(b">L", num_objects) # Number of objects in pack
2657def write_pack_header(
2658 write: Callable[[bytes], int] | IO[bytes], num_objects: int
2659) -> None:
2660 """Write a pack header for the given number of objects."""
2661 write_fn: Callable[[bytes], int]
2662 if hasattr(write, "write"):
2663 write_fn = write.write
2664 warnings.warn(
2665 "write_pack_header() now takes a write rather than file argument",
2666 DeprecationWarning,
2667 stacklevel=2,
2668 )
2669 else:
2670 write_fn = write
2671 for chunk in pack_header_chunks(num_objects):
2672 write_fn(chunk)
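# Worked example: the header for a two-object pack is the magic, the version
# and the object count, each as a big-endian 32-bit word:
#     b"".join(pack_header_chunks(2)) == b"PACK\x00\x00\x00\x02\x00\x00\x00\x02"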
2675def find_reusable_deltas(
2676 container: PackedObjectContainer,
2677 object_ids: Set[ObjectID],
2678 *,
2679 other_haves: Set[ObjectID] | None = None,
2680 progress: Callable[..., None] | None = None,
2681) -> Iterator[UnpackedObject]:
2682 """Find deltas in a pack that can be reused.
2684 Args:
2685 container: Pack container to search for deltas
2686 object_ids: Set of object IDs to find deltas for
2687 other_haves: Set of other object IDs we have
2688 progress: Optional progress reporting callback
2690 Returns:
2691 Iterator of UnpackedObject entries that can be reused
2692 """
2693 if other_haves is None:
2694 other_haves = set()
2695 reused = 0
2696 for i, unpacked in enumerate(
2697 container.iter_unpacked_subset(
2698 object_ids, allow_missing=True, convert_ofs_delta=True
2699 )
2700 ):
2701 if progress is not None and i % 1000 == 0:
2702 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2703 if unpacked.pack_type_num == REF_DELTA:
2704 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2705 if hexsha in object_ids or hexsha in other_haves:
2706 yield unpacked
2707 reused += 1
2708 if progress is not None:
2709 progress((f"found {reused} deltas to reuse\n").encode())
2712def deltify_pack_objects(
2713 objects: Iterator[ShaFile] | Iterator[tuple[ShaFile, bytes | None]],
2714 *,
2715 window_size: int | None = None,
2716 progress: Callable[..., None] | None = None,
2717) -> Iterator[UnpackedObject]:
2718 """Generate deltas for pack objects.
2720 Args:
2721 objects: An iterable of ShaFile objects or (object, path) tuples to deltify.
2722 window_size: Window size; None for default
2723 progress: Optional progress reporting callback
2724 Returns: Iterator of UnpackedObject entries;
2725 delta_base is None for full-text entries
2726 """
2728 def objects_with_hints() -> Iterator[tuple[ShaFile, tuple[int, bytes | None]]]:
2729 for e in objects:
2730 if isinstance(e, ShaFile):
2731 yield (e, (e.type_num, None))
2732 else:
2733 yield (e[0], (e[0].type_num, e[1]))
2735 sorted_objs = sort_objects_for_delta(objects_with_hints())
2736 yield from deltas_from_sorted_objects(
2737 sorted_objs,
2738 window_size=window_size,
2739 progress=progress,
2740 )
2743def sort_objects_for_delta(
2744 objects: Iterator[ShaFile] | Iterator[tuple[ShaFile, PackHint | None]],
2745) -> Iterator[tuple[ShaFile, bytes | None]]:
2746 """Sort objects for optimal delta compression.
2748 Args:
2749 objects: Iterator of objects or (object, hint) tuples
2751 Returns:
2752 Iterator of sorted (ShaFile, path) tuples
2753 """
2754 magic = []
2755 for entry in objects:
2756 if isinstance(entry, tuple):
2757 obj, hint = entry
2758 if hint is None:
2759 type_num = None
2760 path = None
2761 else:
2762 (type_num, path) = hint
2763 else:
2764 obj = entry
2765 type_num = None
2766 path = None
2767 magic.append((type_num, path, -obj.raw_length(), obj))
2768 # Build a list of objects ordered by the magic Linus heuristic
2769 # This helps us find good objects to diff against us
2770 magic.sort()
2771 return ((x[3], x[1]) for x in magic)
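# Ordering sketch (hypothetical blobs): two objects hinted with the same type
# and path are grouped together, with the larger one first, so it becomes a
# candidate delta base for the smaller one in deltas_from_sorted_objects() below.
#     sort_objects_for_delta(iter([(small_blob, (3, b"a.txt")),
#                                  (big_blob, (3, b"a.txt"))]))
#     # yields (big_blob, b"a.txt") before (small_blob, b"a.txt")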
2774def deltas_from_sorted_objects(
2775 objects: Iterator[tuple[ShaFile, bytes | None]],
2776 window_size: int | None = None,
2777 progress: Callable[..., None] | None = None,
2778) -> Iterator[UnpackedObject]:
2779 """Create deltas from sorted objects.
2781 Args:
2782 objects: Iterator of sorted (ShaFile, path) tuples to deltify
2783 window_size: Delta window size; None for default
2784 progress: Optional progress reporting callback
2786 Returns:
2787 Iterator of UnpackedObject entries
2788 """
2789 # TODO(jelmer): Use threads
2790 if window_size is None:
2791 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2793 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2794 for i, (o, path) in enumerate(objects):
2795 if progress is not None and i % 1000 == 0:
2796 progress((f"generating deltas: {i}\r").encode())
2797 raw = o.as_raw_chunks()
2798 winner = raw
2799 winner_len = sum(map(len, winner))
2800 winner_base = None
2801 for base_id, base_type_num, base in possible_bases:
2802 if base_type_num != o.type_num:
2803 continue
2804 delta_len = 0
2805 delta = []
2806 for chunk in create_delta(b"".join(base), b"".join(raw)):
2807 delta_len += len(chunk)
2808 if delta_len >= winner_len:
2809 break
2810 delta.append(chunk)
2811 else:
2812 winner_base = base_id
2813 winner = delta
2814 winner_len = sum(map(len, winner))
2815 yield UnpackedObject(
2816 o.type_num,
2817 sha=o.sha().digest(),
2818 delta_base=winner_base,
2819 decomp_len=winner_len,
2820 decomp_chunks=winner,
2821 )
2822 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2823 while len(possible_bases) > window_size:
2824 possible_bases.pop()
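# Pipeline sketch (hypothetical ShaFile objects): deltify_pack_objects() sorts
# its input with sort_objects_for_delta() and then runs the window search above.
#     unpacked = list(deltify_pack_objects(iter([blob_a, blob_b])))
#     # each UnpackedObject carries delta_base=None (full text) or the SHA of its base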
2827def pack_objects_to_data(
2828 objects: Sequence[ShaFile]
2829 | Sequence[tuple[ShaFile, bytes | None]]
2830 | Sequence[tuple[ShaFile, PackHint | None]],
2831 *,
2832 deltify: bool | None = None,
2833 delta_window_size: int | None = None,
2834 ofs_delta: bool = True,
2835 progress: Callable[..., None] | None = None,
2836) -> tuple[int, Iterator[UnpackedObject]]:
2837 """Create pack data from objects.
2839 Args:
2840 objects: Pack objects
2841 deltify: Whether to deltify pack objects
2842 delta_window_size: Delta window size
2843 ofs_delta: Whether to use offset deltas
2844 progress: Optional progress reporting callback
2845 Returns: Tuple of (number of objects, iterator of UnpackedObject entries)
2846 """
2847 count = len(objects)
2848 if deltify is None:
2849 # PERFORMANCE/TODO(jelmer): This should be enabled but the python
2850 # implementation is *much* too slow at the moment.
2851 # Maybe consider enabling it just if the rust extension is available?
2852 deltify = False
2853 if deltify:
2854 return (
2855 count,
2856 deltify_pack_objects(
2857 iter(objects), # type: ignore
2858 window_size=delta_window_size,
2859 progress=progress,
2860 ),
2861 )
2862 else:
2864 def iter_without_path() -> Iterator[UnpackedObject]:
2865 for o in objects:
2866 if isinstance(o, tuple):
2867 yield full_unpacked_object(o[0])
2868 else:
2869 yield full_unpacked_object(o)
2871 return (count, iter_without_path())
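# Usage sketch (hypothetical objects): without deltification every object is
# emitted as a full-text UnpackedObject, ready for write_pack_data().
#     count, records = pack_objects_to_data([blob_a, blob_b], deltify=False)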
2874def generate_unpacked_objects(
2875 container: PackedObjectContainer,
2876 object_ids: Sequence[tuple[ObjectID, PackHint | None]],
2877 delta_window_size: int | None = None,
2878 deltify: bool | None = None,
2879 reuse_deltas: bool = True,
2880 ofs_delta: bool = True,
2881 other_haves: set[ObjectID] | None = None,
2882 progress: Callable[..., None] | None = None,
2883) -> Iterator[UnpackedObject]:
2884 """Create pack data from objects.
2886 Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
2887 """
2888 todo = dict(object_ids)
2889 if reuse_deltas:
2890 for unpack in find_reusable_deltas(
2891 container, set(todo), other_haves=other_haves, progress=progress
2892 ):
2893 del todo[sha_to_hex(RawObjectID(unpack.sha()))]
2894 yield unpack
2895 if deltify is None:
2896 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2897 # slow at the moment.
2898 deltify = False
2899 if deltify:
2900 objects_to_delta = container.iterobjects_subset(
2901 todo.keys(), allow_missing=False
2902 )
2903 sorted_objs = sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta)
2904 yield from deltas_from_sorted_objects(
2905 sorted_objs,
2906 window_size=delta_window_size,
2907 progress=progress,
2908 )
2909 else:
2910 for oid in todo:
2911 yield full_unpacked_object(container[oid])
2914def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2915 """Create an UnpackedObject from a ShaFile.
2917 Args:
2918 o: ShaFile object to convert
2920 Returns:
2921 UnpackedObject with full object data
2922 """
2923 return UnpackedObject(
2924 o.type_num,
2925 delta_base=None,
2926 crc32=None,
2927 decomp_chunks=o.as_raw_chunks(),
2928 sha=o.sha().digest(),
2929 )
2932def write_pack_from_container(
2933 write: Callable[[bytes], None]
2934 | Callable[[bytes | bytearray | memoryview], int]
2935 | IO[bytes],
2936 container: PackedObjectContainer,
2937 object_ids: Sequence[tuple[ObjectID, PackHint | None]],
2938 delta_window_size: int | None = None,
2939 deltify: bool | None = None,
2940 reuse_deltas: bool = True,
2941 compression_level: int = -1,
2942 other_haves: set[ObjectID] | None = None,
2943) -> tuple[dict[bytes, tuple[int, int]], bytes]:
2944 """Write a new pack data file.
2946 Args:
2947 write: write function to use
2948 container: PackedObjectContainer
2949 object_ids: Sequence of (object_id, hint) tuples to write
2950 delta_window_size: Sliding window size for searching for deltas;
2951 Set to None for default window size.
2952 deltify: Whether to deltify objects
2953 reuse_deltas: Whether to reuse existing deltas
2954 compression_level: the zlib compression level to use
2955 other_haves: Set of additional object IDs the receiver has
2956 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2957 """
2958 pack_contents_count = len(object_ids)
2959 pack_contents = generate_unpacked_objects(
2960 container,
2961 object_ids,
2962 delta_window_size=delta_window_size,
2963 deltify=deltify,
2964 reuse_deltas=reuse_deltas,
2965 other_haves=other_haves,
2966 )
2968 return write_pack_data(
2969 write,
2970 pack_contents,
2971 num_records=pack_contents_count,
2972 compression_level=compression_level,
2973 )
2976def write_pack_objects(
2977 write: Callable[[bytes], None] | IO[bytes],
2978 objects: Sequence[ShaFile] | Sequence[tuple[ShaFile, bytes | None]],
2979 *,
2980 delta_window_size: int | None = None,
2981 deltify: bool | None = None,
2982 compression_level: int = -1,
2983) -> tuple[dict[bytes, tuple[int, int]], bytes]:
2984 """Write a new pack data file.
2986 Args:
2987 write: write function to use
2988 objects: Sequence of ShaFile objects or (object, path) tuples to write
2989 delta_window_size: Sliding window size for searching for deltas;
2990 Set to None for default window size.
2991 deltify: Whether to deltify objects
2992 compression_level: the zlib compression level to use
2993 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2994 """
2995 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2997 return write_pack_data(
2998 write,
2999 pack_contents,
3000 num_records=pack_contents_count,
3001 compression_level=compression_level,
3002 )
3005class PackChunkGenerator:
3006 """Generator for pack data chunks."""
3008 def __init__(
3009 self,
3010 num_records: int | None = None,
3011 records: Iterator[UnpackedObject] | None = None,
3012 progress: Callable[..., None] | None = None,
3013 compression_level: int = -1,
3014 reuse_compressed: bool = True,
3015 ) -> None:
3016 """Initialize PackChunkGenerator.
3018 Args:
3019 num_records: Expected number of records
3020 records: Iterator of pack records
3021 progress: Optional progress callback
3022 compression_level: Compression level (-1 for default)
3023 reuse_compressed: Whether to reuse compressed chunks
3024 """
3025 self.cs = sha1(b"")
3026 self.entries: dict[bytes, tuple[int, int]] = {}
3027 if records is None:
3028 records = iter([]) # Empty iterator if None
3029 self._it = self._pack_data_chunks(
3030 records=records,
3031 num_records=num_records,
3032 progress=progress,
3033 compression_level=compression_level,
3034 reuse_compressed=reuse_compressed,
3035 )
3037 def sha1digest(self) -> bytes:
3038 """Return the SHA1 digest of the pack data."""
3039 return self.cs.digest()
3041 def __iter__(self) -> Iterator[bytes]:
3042 """Iterate over pack data chunks."""
3043 return self._it
3045 def _pack_data_chunks(
3046 self,
3047 records: Iterator[UnpackedObject],
3048 *,
3049 num_records: int | None = None,
3050 progress: Callable[..., None] | None = None,
3051 compression_level: int = -1,
3052 reuse_compressed: bool = True,
3053 ) -> Iterator[bytes]:
3054 """Iterate pack data file chunks.
3056 Args:
3057 records: Iterator over UnpackedObject
3058 num_records: Number of records (defaults to len(records) if not specified)
3059 progress: Function to report progress to
3060 compression_level: the zlib compression level
3061 reuse_compressed: Whether to reuse compressed chunks
3062 Yields: Pack data chunks, ending with the pack checksum; per-object entries are recorded in self.entries
3063 """
3064 # Write the pack
3065 if num_records is None:
3066 num_records = len(records) # type: ignore
3067 offset = 0
3068 for chunk in pack_header_chunks(num_records):
3069 yield chunk
3070 self.cs.update(chunk)
3071 offset += len(chunk)
3072 actual_num_records = 0
3073 for i, unpacked in enumerate(records):
3074 type_num = unpacked.pack_type_num
3075 if progress is not None and i % 1000 == 0:
3076 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
3077 raw: list[bytes] | tuple[int, list[bytes]] | tuple[bytes, list[bytes]]
3078 if unpacked.delta_base is not None:
3079 assert isinstance(unpacked.delta_base, bytes), (
3080 f"Expected bytes, got {type(unpacked.delta_base)}"
3081 )
3082 try:
3083 base_offset, _base_crc32 = self.entries[unpacked.delta_base]
3084 except KeyError:
3085 type_num = REF_DELTA
3086 assert isinstance(unpacked.delta_base, bytes)
3087 raw = (unpacked.delta_base, unpacked.decomp_chunks)
3088 else:
3089 type_num = OFS_DELTA
3090 raw = (offset - base_offset, unpacked.decomp_chunks)
3091 else:
3092 raw = unpacked.decomp_chunks
3093 chunks: list[bytes] | Iterator[bytes]
3094 if unpacked.comp_chunks is not None and reuse_compressed:
3095 chunks = unpacked.comp_chunks
3096 else:
3097 chunks = pack_object_chunks(
3098 type_num, raw, compression_level=compression_level
3099 )
3100 crc32 = 0
3101 object_size = 0
3102 for chunk in chunks:
3103 yield chunk
3104 crc32 = binascii.crc32(chunk, crc32)
3105 self.cs.update(chunk)
3106 object_size += len(chunk)
3107 actual_num_records += 1
3108 self.entries[unpacked.sha()] = (offset, crc32)
3109 offset += object_size
3110 if actual_num_records != num_records:
3111 raise AssertionError(
3112 f"actual records written differs: {actual_num_records} != {num_records}"
3113 )
3115 yield self.cs.digest()
3118def write_pack_data(
3119 write: Callable[[bytes], None]
3120 | Callable[[bytes | bytearray | memoryview], int]
3121 | IO[bytes],
3122 records: Iterator[UnpackedObject],
3123 *,
3124 num_records: int | None = None,
3125 progress: Callable[..., None] | None = None,
3126 compression_level: int = -1,
3127) -> tuple[dict[bytes, tuple[int, int]], bytes]:
3128 """Write a new pack data file.
3130 Args:
3131 write: Write function to use
3132 num_records: Number of records (defaults to len(records) if None)
3133 records: Iterator of UnpackedObject entries to write
3134 progress: Function to report progress to
3135 compression_level: the zlib compression level
3136 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
3137 """
3138 chunk_generator = PackChunkGenerator(
3139 num_records=num_records,
3140 records=records,
3141 progress=progress,
3142 compression_level=compression_level,
3143 )
3144 for chunk in chunk_generator:
3145 if callable(write):
3146 write(chunk)
3147 else:
3148 write.write(chunk)
3149 return chunk_generator.entries, chunk_generator.sha1digest()
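# Usage sketch (in-memory buffer, hypothetical objects): stream a small pack
# into a BytesIO and collect the per-object index entries and pack checksum.
#     buf = BytesIO()
#     count, records = pack_objects_to_data([blob], deltify=False)
#     entries, pack_sha = write_pack_data(buf.write, records, num_records=count)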
3152def write_pack_index_v1(
3153 f: IO[bytes],
3154 entries: Iterable[tuple[bytes, int, int | None]],
3155 pack_checksum: bytes,
3156) -> bytes:
3157 """Write a new pack index file.
3159 Args:
3160 f: A file-like object to write to
3161 entries: List of tuples with object name (sha), offset_in_pack,
3162 and crc32_checksum.
3163 pack_checksum: Checksum of the pack file.
3164 Returns: The SHA of the written index file
3165 """
3166 f = SHA1Writer(f)
3167 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3168 for name, _offset, _entry_checksum in entries:
3169 fan_out_table[ord(name[:1])] += 1
3170 # Fan-out table
3171 for i in range(0x100):
3172 f.write(struct.pack(">L", fan_out_table[i]))
3173 fan_out_table[i + 1] += fan_out_table[i]
3174 for name, offset, _entry_checksum in entries:
3175 if not (offset <= 0xFFFFFFFF):
3176 raise TypeError("pack format 1 only supports offsets < 2Gb")
3177 f.write(struct.pack(">L20s", offset, name))
3178 assert len(pack_checksum) == 20
3179 f.write(pack_checksum)
3180 return f.write_sha()
3183def _delta_encode_size(size: int) -> bytes:
3184 ret = bytearray()
3185 c = size & 0x7F
3186 size >>= 7
3187 while size:
3188 ret.append(c | 0x80)
3189 c = size & 0x7F
3190 size >>= 7
3191 ret.append(c)
3192 return bytes(ret)
3195# The length of delta compression copy operations in version 2 packs is limited
3196# to 64K. To copy more, we use several copy operations. Version 3 packs allow
3197# 24-bit lengths in copy operations, but we always make version 2 packs.
3198_MAX_COPY_LEN = 0xFFFF
3201def _encode_copy_operation(start: int, length: int) -> bytes:
3202 scratch = bytearray([0x80])
3203 for i in range(4):
3204 if start & 0xFF << i * 8:
3205 scratch.append((start >> i * 8) & 0xFF)
3206 scratch[0] |= 1 << i
3207 for i in range(2):
3208 if length & 0xFF << i * 8:
3209 scratch.append((length >> i * 8) & 0xFF)
3210 scratch[0] |= 1 << (4 + i)
3211 return bytes(scratch)
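# Worked example (illustrative): copying 0x20 bytes starting at offset 0x1000
# needs one offset byte and one length byte; the leading command byte records
# which of them are present (bit 1 for the offset byte, bit 4 for the length):
#     _encode_copy_operation(0x1000, 0x20) == b"\x92\x10\x20"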
3214def _create_delta_py(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
3215 """Use python difflib to work out how to transform base_buf to target_buf.
3217 Args:
3218 base_buf: Base buffer
3219 target_buf: Target buffer
3220 """
3221 if isinstance(base_buf, list):
3222 base_buf = b"".join(base_buf)
3223 if isinstance(target_buf, list):
3224 target_buf = b"".join(target_buf)
3225 assert isinstance(base_buf, bytes)
3226 assert isinstance(target_buf, bytes)
3227 # write delta header
3228 yield _delta_encode_size(len(base_buf))
3229 yield _delta_encode_size(len(target_buf))
3230 # write out delta opcodes
3231 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
3232 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
3233 # Git patch opcodes don't care about deletes!
3234 # if opcode == 'replace' or opcode == 'delete':
3235 # pass
3236 if opcode == "equal":
3237 # If they are equal, unpacker will use data from base_buf
3238 # Write out an opcode that says what range to use
3239 copy_start = i1
3240 copy_len = i2 - i1
3241 while copy_len > 0:
3242 to_copy = min(copy_len, _MAX_COPY_LEN)
3243 yield _encode_copy_operation(copy_start, to_copy)
3244 copy_start += to_copy
3245 copy_len -= to_copy
3246 if opcode == "replace" or opcode == "insert":
3247 # If we are replacing a range or adding one, then we just
3248 # output it to the stream (prefixed by its size)
3249 s = j2 - j1
3250 o = j1
3251 while s > 127:
3252 yield bytes([127])
3253 yield bytes(memoryview(target_buf)[o : o + 127])
3254 s -= 127
3255 o += 127
3256 yield bytes([s])
3257 yield bytes(memoryview(target_buf)[o : o + s])
3260# Default to pure Python implementation
3261create_delta = _create_delta_py
3264def apply_delta(
3265 src_buf: bytes | list[bytes], delta: bytes | list[bytes]
3266) -> list[bytes]:
3267 """Based on the similar function in git's patch-delta.c.
3269 Args:
3270 src_buf: Source buffer
3271 delta: Delta instructions
3272 """
3273 if not isinstance(src_buf, bytes):
3274 src_buf = b"".join(src_buf)
3275 if not isinstance(delta, bytes):
3276 delta = b"".join(delta)
3277 out = []
3278 index = 0
3279 delta_length = len(delta)
3281 def get_delta_header_size(delta: bytes, index: int) -> tuple[int, int]:
3282 size = 0
3283 i = 0
3284 while delta:
3285 cmd = ord(delta[index : index + 1])
3286 index += 1
3287 size |= (cmd & ~0x80) << i
3288 i += 7
3289 if not cmd & 0x80:
3290 break
3291 return size, index
3293 src_size, index = get_delta_header_size(delta, index)
3294 dest_size, index = get_delta_header_size(delta, index)
3295 if src_size != len(src_buf):
3296 raise ApplyDeltaError(
3297 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
3298 )
3299 while index < delta_length:
3300 cmd = ord(delta[index : index + 1])
3301 index += 1
3302 if cmd & 0x80:
3303 cp_off = 0
3304 for i in range(4):
3305 if cmd & (1 << i):
3306 x = ord(delta[index : index + 1])
3307 index += 1
3308 cp_off |= x << (i * 8)
3309 cp_size = 0
3310 # Version 3 packs can contain copy sizes larger than 64K.
3311 for i in range(3):
3312 if cmd & (1 << (4 + i)):
3313 x = ord(delta[index : index + 1])
3314 index += 1
3315 cp_size |= x << (i * 8)
3316 if cp_size == 0:
3317 cp_size = 0x10000
3318 if (
3319 cp_off + cp_size < cp_size
3320 or cp_off + cp_size > src_size
3321 or cp_size > dest_size
3322 ):
3323 break
3324 out.append(src_buf[cp_off : cp_off + cp_size])
3325 elif cmd != 0:
3326 out.append(delta[index : index + cmd])
3327 index += cmd
3328 else:
3329 raise ApplyDeltaError("Invalid opcode 0")
3331 if index != delta_length:
3332 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
3334 if dest_size != chunks_length(out):
3335 raise ApplyDeltaError("dest size incorrect")
3337 return out
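# Round-trip sketch (illustrative buffers): a delta produced by create_delta()
# reconstructs the target when applied to the same base.
#     base, target = b"the quick brown fox", b"the quick red fox"
#     delta = b"".join(create_delta(base, target))
#     assert b"".join(apply_delta(base, delta)) == target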
3340def write_pack_index_v2(
3341 f: IO[bytes],
3342 entries: Iterable[tuple[bytes, int, int | None]],
3343 pack_checksum: bytes,
3344) -> bytes:
3345 """Write a new pack index file.
3347 Args:
3348 f: File-like object to write to
3349 entries: List of tuples with object name (sha), offset_in_pack, and
3350 crc32_checksum.
3351 pack_checksum: Checksum of the pack file.
3352 Returns: The SHA of the index file written
3353 """
3354 f = SHA1Writer(f)
3355 f.write(b"\377tOc") # Magic!
3356 f.write(struct.pack(">L", 2))
3357 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3358 for name, offset, entry_checksum in entries:
3359 fan_out_table[ord(name[:1])] += 1
3360 # Fan-out table
3361 largetable: list[int] = []
3362 for i in range(0x100):
3363 f.write(struct.pack(b">L", fan_out_table[i]))
3364 fan_out_table[i + 1] += fan_out_table[i]
3365 for name, offset, entry_checksum in entries:
3366 f.write(name)
3367 for name, offset, entry_checksum in entries:
3368 f.write(struct.pack(b">L", entry_checksum))
3369 for name, offset, entry_checksum in entries:
3370 if offset < 2**31:
3371 f.write(struct.pack(b">L", offset))
3372 else:
3373 f.write(struct.pack(b">L", 2**31 + len(largetable)))
3374 largetable.append(offset)
3375 for offset in largetable:
3376 f.write(struct.pack(b">Q", offset))
3377 assert len(pack_checksum) == 20
3378 f.write(pack_checksum)
3379 return f.write_sha()
3382def write_pack_index_v3(
3383 f: IO[bytes],
3384 entries: Iterable[tuple[bytes, int, int | None]],
3385 pack_checksum: bytes,
3386 hash_algorithm: int = 1,
3387) -> bytes:
3388 """Write a new pack index file in v3 format.
3390 Args:
3391 f: File-like object to write to
3392 entries: List of tuples with object name (sha), offset_in_pack, and
3393 crc32_checksum.
3394 pack_checksum: Checksum of the pack file.
3395 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
3396 Returns: The SHA of the index file written
3397 """
3398 if hash_algorithm == 1:
3399 hash_size = 20 # SHA-1
3400 writer_cls = SHA1Writer
3401 elif hash_algorithm == 2:
3402 hash_size = 32 # SHA-256
3403 # TODO: Add SHA256Writer when SHA-256 support is implemented
3404 raise NotImplementedError("SHA-256 support not yet implemented")
3405 else:
3406 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
3408 # Convert entries to list to allow multiple iterations
3409 entries_list = list(entries)
3411 # Calculate shortest unambiguous prefix length for object names
3412 # For now, use full hash size (this could be optimized)
3413 shortened_oid_len = hash_size
3415 f = writer_cls(f)
3416 f.write(b"\377tOc") # Magic!
3417 f.write(struct.pack(">L", 3)) # Version 3
3418 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
3419 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
3421 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3422 for name, offset, entry_checksum in entries_list:
3423 if len(name) != hash_size:
3424 raise ValueError(
3425 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
3426 )
3427 fan_out_table[ord(name[:1])] += 1
3429 # Fan-out table
3430 largetable: list[int] = []
3431 for i in range(0x100):
3432 f.write(struct.pack(b">L", fan_out_table[i]))
3433 fan_out_table[i + 1] += fan_out_table[i]
3435 # Object names table
3436 for name, offset, entry_checksum in entries_list:
3437 f.write(name)
3439 # CRC32 checksums table
3440 for name, offset, entry_checksum in entries_list:
3441 f.write(struct.pack(b">L", entry_checksum))
3443 # Offset table
3444 for name, offset, entry_checksum in entries_list:
3445 if offset < 2**31:
3446 f.write(struct.pack(b">L", offset))
3447 else:
3448 f.write(struct.pack(b">L", 2**31 + len(largetable)))
3449 largetable.append(offset)
3451 # Large offset table
3452 for offset in largetable:
3453 f.write(struct.pack(b">Q", offset))
3455 assert len(pack_checksum) == hash_size, (
3456 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
3457 )
3458 f.write(pack_checksum)
3459 return f.write_sha()
3462def write_pack_index(
3463 f: IO[bytes],
3464 entries: Iterable[tuple[bytes, int, int | None]],
3465 pack_checksum: bytes,
3466 progress: Callable[..., None] | None = None,
3467 version: int | None = None,
3468) -> bytes:
3469 """Write a pack index file.
3471 Args:
3472 f: File-like object to write to.
3473 entries: List of (checksum, offset, crc32) tuples
3474 pack_checksum: Checksum of the pack file.
3475 progress: Progress function (not currently used)
3476 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
3478 Returns:
3479 SHA of the written index file
3480 """
3481 if version is None:
3482 version = DEFAULT_PACK_INDEX_VERSION
3484 if version == 1:
3485 return write_pack_index_v1(f, entries, pack_checksum)
3486 elif version == 2:
3487 return write_pack_index_v2(f, entries, pack_checksum)
3488 elif version == 3:
3489 return write_pack_index_v3(f, entries, pack_checksum)
3490 else:
3491 raise ValueError(f"Unsupported pack index version: {version}")
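# Usage sketch (hypothetical filename): index an existing data file with the
# default index version, using the entries and checksum computed from the pack.
#     data = PackData("pack-1234.pack")
#     with open("pack-1234.idx", "wb") as f:
#         write_pack_index(f, data.sorted_entries(), data.calculate_checksum())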
3494class Pack:
3495 """A Git pack object."""
3497 _data_load: Callable[[], PackData] | None
3498 _idx_load: Callable[[], PackIndex] | None
3500 _data: PackData | None
3501 _idx: PackIndex | None
3502 _bitmap: "PackBitmap | None"
3504 def __init__(
3505 self,
3506 basename: str,
3507 resolve_ext_ref: ResolveExtRefFn | None = None,
3508 *,
3509 delta_window_size: int | None = None,
3510 window_memory: int | None = None,
3511 delta_cache_size: int | None = None,
3512 depth: int | None = None,
3513 threads: int | None = None,
3514 big_file_threshold: int | None = None,
3515 ) -> None:
3516 """Initialize a Pack object.
3518 Args:
3519 basename: Base path for pack files (without .pack/.idx extension)
3520 resolve_ext_ref: Optional function to resolve external references
3521 delta_window_size: Size of the delta compression window
3522 window_memory: Memory limit for delta compression window
3523 delta_cache_size: Size of the delta cache
3524 depth: Maximum depth for delta chains
3525 threads: Number of threads to use for operations
3526 big_file_threshold: Size threshold for big file handling
3527 """
3528 self._basename = basename
3529 self._data = None
3530 self._idx = None
3531 self._bitmap = None
3532 self._idx_path = self._basename + ".idx"
3533 self._data_path = self._basename + ".pack"
3534 self._bitmap_path = self._basename + ".bitmap"
3535 self.delta_window_size = delta_window_size
3536 self.window_memory = window_memory
3537 self.delta_cache_size = delta_cache_size
3538 self.depth = depth
3539 self.threads = threads
3540 self.big_file_threshold = big_file_threshold
3541 self._data_load = lambda: PackData(
3542 self._data_path,
3543 delta_window_size=delta_window_size,
3544 window_memory=window_memory,
3545 delta_cache_size=delta_cache_size,
3546 depth=depth,
3547 threads=threads,
3548 big_file_threshold=big_file_threshold,
3549 )
3550 self._idx_load = lambda: load_pack_index(self._idx_path)
3551 self.resolve_ext_ref = resolve_ext_ref
3553 @classmethod
3554 def from_lazy_objects(
3555 cls, data_fn: Callable[[], PackData], idx_fn: Callable[[], PackIndex]
3556 ) -> "Pack":
3557 """Create a new pack object from callables to load pack data and index objects."""
3558 ret = cls("")
3559 ret._data_load = data_fn
3560 ret._idx_load = idx_fn
3561 return ret
3563 @classmethod
3564 def from_objects(cls, data: PackData, idx: PackIndex) -> "Pack":
3565 """Create a new pack object from pack data and index objects."""
3566 ret = cls("")
3567 ret._data = data
3568 ret._data_load = None
3569 ret._idx = idx
3570 ret._idx_load = None
3571 ret.check_length_and_checksum()
3572 return ret
3574 def name(self) -> bytes:
3575 """The SHA over the SHAs of the objects in this pack."""
3576 return self.index.objects_sha1()
3578 @property
3579 def data(self) -> PackData:
3580 """The pack data object being used."""
3581 if self._data is None:
3582 assert self._data_load
3583 self._data = self._data_load()
3584 self.check_length_and_checksum()
3585 return self._data
3587 @property
3588 def index(self) -> PackIndex:
3589 """The index being used.
3591 Note: This may be an in-memory index
3592 """
3593 if self._idx is None:
3594 assert self._idx_load
3595 self._idx = self._idx_load()
3596 return self._idx
3598 @property
3599 def bitmap(self) -> "PackBitmap | None":
3600 """The bitmap being used, if available.
3602 Returns:
3603 PackBitmap instance or None if no bitmap exists
3605 Raises:
3606 ValueError: If bitmap file is invalid or corrupt
3607 """
3608 if self._bitmap is None:
3609 from .bitmap import read_bitmap
3611 self._bitmap = read_bitmap(self._bitmap_path, pack_index=self.index)
3612 return self._bitmap
3614 def ensure_bitmap(
3615 self,
3616 object_store: "BaseObjectStore",
3617 refs: dict["Ref", "ObjectID"],
3618 commit_interval: int | None = None,
3619 progress: Callable[[str], None] | None = None,
3620 ) -> "PackBitmap":
3621 """Ensure a bitmap exists for this pack, generating one if needed.
3623 Args:
3624 object_store: Object store to read objects from
3625 refs: Dictionary of ref names to commit SHAs
3626 commit_interval: Include every Nth commit in bitmap index
3627 progress: Optional progress reporting callback
3629 Returns:
3630 PackBitmap instance (either existing or newly generated)
3631 """
3632 from .bitmap import generate_bitmap, write_bitmap
3634 # Check if bitmap already exists
3635 try:
3636 existing = self.bitmap
3637 if existing is not None:
3638 return existing
3639 except FileNotFoundError:
3640 pass # No bitmap, we'll generate one
3642 # Generate new bitmap
3643 if progress:
3644 progress(f"Generating bitmap for {self.name().decode('utf-8')}...\n")
3646 pack_bitmap = generate_bitmap(
3647 self.index,
3648 object_store,
3649 refs,
3650 self.get_stored_checksum(),
3651 commit_interval=commit_interval,
3652 progress=progress,
3653 )
3655 # Write bitmap file
3656 write_bitmap(self._bitmap_path, pack_bitmap)
3658 if progress:
3659 progress(f"Wrote {self._bitmap_path}\n")
3661 # Update cached bitmap
3662 self._bitmap = pack_bitmap
3664 return pack_bitmap
3666 def close(self) -> None:
3667 """Close the pack file and index."""
3668 if self._data is not None:
3669 self._data.close()
3670 if self._idx is not None:
3671 self._idx.close()
3673 def __enter__(self) -> "Pack":
3674 """Enter context manager."""
3675 return self
3677 def __exit__(
3678 self,
3679 exc_type: type | None,
3680 exc_val: BaseException | None,
3681 exc_tb: TracebackType | None,
3682 ) -> None:
3683 """Exit context manager."""
3684 self.close()
3686 def __eq__(self, other: object) -> bool:
3687 """Check equality with another pack."""
3688 if not isinstance(other, Pack):
3689 return False
3690 return self.index == other.index
3692 def __len__(self) -> int:
3693 """Number of entries in this pack."""
3694 return len(self.index)
3696 def __repr__(self) -> str:
3697 """Return string representation of this pack."""
3698 return f"{self.__class__.__name__}({self._basename!r})"
3700 def __iter__(self) -> Iterator[ObjectID]:
3701 """Iterate over all the sha1s of the objects in this pack."""
3702 return iter(self.index)
3704 def check_length_and_checksum(self) -> None:
3705 """Sanity check the length and checksum of the pack index and data."""
3706 assert len(self.index) == len(self.data), (
3707 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
3708 )
3709 idx_stored_checksum = self.index.get_pack_checksum()
3710 data_stored_checksum = self.data.get_stored_checksum()
3711 if (
3712 idx_stored_checksum is not None
3713 and idx_stored_checksum != data_stored_checksum
3714 ):
3715 raise ChecksumMismatch(
3716 sha_to_hex(RawObjectID(idx_stored_checksum)),
3717 sha_to_hex(RawObjectID(data_stored_checksum)),
3718 )
3720 def check(self) -> None:
3721 """Check the integrity of this pack.
3723 Raises:
3724 ChecksumMismatch: if a checksum for the index or data is wrong
3725 """
3726 self.index.check()
3727 self.data.check()
3728 for obj in self.iterobjects():
3729 obj.check()
3730 # TODO: object connectivity checks
3732 def get_stored_checksum(self) -> bytes:
3733 """Return the stored checksum of the pack data."""
3734 return self.data.get_stored_checksum()
3736 def pack_tuples(self) -> list[tuple[ShaFile, None]]:
3737 """Return pack tuples for all objects in pack."""
3738 return [(o, None) for o in self.iterobjects()]
3740 def __contains__(self, sha1: ObjectID | RawObjectID) -> bool:
3741 """Check whether this pack contains a particular SHA1."""
3742 try:
3743 self.index.object_offset(sha1)
3744 return True
3745 except KeyError:
3746 return False
3748 def get_raw(self, sha1: RawObjectID | ObjectID) -> tuple[int, bytes]:
3749 """Get raw object data by SHA1."""
3750 offset = self.index.object_offset(sha1)
3751 obj_type, obj = self.data.get_object_at(offset)
3752 type_num, chunks = self.resolve_object(offset, obj_type, obj)
3753 return type_num, b"".join(chunks) # type: ignore[arg-type]
3755 def __getitem__(self, sha1: "ObjectID | RawObjectID") -> ShaFile:
3756 """Retrieve the specified SHA1."""
3757 type, uncomp = self.get_raw(sha1)
3758 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
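# Illustrative lookup sketch (the SHA value is a placeholder):
#
#     sha = b"0123456789" * 4                  # 40-byte hex ObjectID
#     if sha in pack:                          # consults the index only
#         type_num, raw = pack.get_raw(sha)    # resolved, undeltified bytes
#         obj = pack[sha]                      # the same data as a ShaFile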
3760 def iterobjects(self) -> Iterator[ShaFile]:
3761 """Iterate over the objects in this pack."""
3762 return iter(
3763 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
3764 )
3766 def iterobjects_subset(
3767 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
3768 ) -> Iterator[ShaFile]:
3769 """Iterate over a subset of objects in this pack."""
3770 return (
3771 uo
3772 for uo in PackInflater.for_pack_subset(
3773 self,
3774 shas,
3775 allow_missing=allow_missing,
3776 resolve_ext_ref=self.resolve_ext_ref,
3777 )
3778 if uo.id in shas
3779 )
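# Illustrative iteration sketch (wanted_shas is a placeholder set of hex
# ObjectIDs):
#
#     for obj in pack.iterobjects():           # every object, deltas resolved
#         ...
#     for obj in pack.iterobjects_subset(wanted_shas, allow_missing=True):
#         ...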
3781 def iter_unpacked_subset(
3782 self,
3783 shas: Iterable[ObjectID | RawObjectID],
3784 *,
3785 include_comp: bool = False,
3786 allow_missing: bool = False,
3787 convert_ofs_delta: bool = False,
3788 ) -> Iterator[UnpackedObject]:
3789 """Iterate over unpacked objects in subset."""
3790 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
3791 ofs: dict[int, bytes] = {}
3792 todo: set[ObjectID | RawObjectID] = set(shas)
3793 for unpacked in self.iter_unpacked(include_comp=include_comp):
3794 sha = unpacked.sha()
3795 if unpacked.offset is not None:
3796 ofs[unpacked.offset] = sha
3797 hexsha = sha_to_hex(RawObjectID(sha))
3798 if hexsha in todo:
3799 if unpacked.pack_type_num == OFS_DELTA:
3800 assert isinstance(unpacked.delta_base, int)
3801 assert unpacked.offset is not None
3802 base_offset = unpacked.offset - unpacked.delta_base
3803 try:
3804 unpacked.delta_base = ofs[base_offset]
3805 except KeyError:
3806 ofs_pending[base_offset].append(unpacked)
3807 continue
3808 else:
3809 unpacked.pack_type_num = REF_DELTA
3810 yield unpacked
3811 todo.remove(hexsha)
3812 if unpacked.offset is not None:
3813 for child in ofs_pending.pop(unpacked.offset, []):
3814 child.pack_type_num = REF_DELTA
3815 child.delta_base = sha
3816 yield child
3817 assert not ofs_pending
3818 if not allow_missing and todo:
3819 raise UnresolvedDeltas(list(todo))
3821 def iter_unpacked(self, include_comp: bool = False) -> Iterator[UnpackedObject]:
3822 """Iterate over all unpacked objects in this pack."""
3823 ofs_to_entries = {
3824 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
3825 }
3826 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
3827 assert unpacked.offset is not None
3828 (sha, crc32) = ofs_to_entries[unpacked.offset]
3829 unpacked._sha = sha
3830 unpacked.crc32 = crc32
3831 yield unpacked
3833 def keep(self, msg: bytes | None = None) -> str:
3834 """Add a .keep file for the pack, preventing git from garbage collecting it.
3836 Args:
3837 msg: A message written inside the .keep file; can be used later
3838 to determine whether or not a .keep file is obsolete.
3839 Returns: The path of the .keep file, as a string.
3840 """
3841 keepfile_name = f"{self._basename}.keep"
3842 with GitFile(keepfile_name, "wb") as keepfile:
3843 if msg:
3844 keepfile.write(msg)
3845 keepfile.write(b"\n")
3846 return keepfile_name
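# Illustrative sketch (the message content is a placeholder):
#
#     keep_path = pack.keep(b"kept during fetch")
#     # removing the file later (os.remove(keep_path)) makes the pack
#     # eligible for garbage collection again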
3848 def get_ref(
3849 self, sha: RawObjectID | ObjectID
3850 ) -> tuple[int | None, int, OldUnpackedObject]:
3851 """Get the object for a ref SHA, only looking in this pack."""
3852 # TODO: cache these results
3853 try:
3854 offset = self.index.object_offset(sha)
3855 except KeyError:
3856 offset = None
3857 if offset:
3858 type, obj = self.data.get_object_at(offset)
3859 elif self.resolve_ext_ref:
3860 type, obj = self.resolve_ext_ref(sha)
3861 else:
3862 raise KeyError(sha)
3863 return offset, type, obj
3865 def resolve_object(
3866 self,
3867 offset: int,
3868 type: int,
3869 obj: OldUnpackedObject,
3870 get_ref: Callable[
3871 [RawObjectID | ObjectID], tuple[int | None, int, OldUnpackedObject]
3872 ]
3873 | None = None,
3874 ) -> tuple[int, OldUnpackedObject]:
3875 """Resolve an object, possibly resolving deltas when necessary.
3877 Returns: Tuple with object type and contents.
3878 """
3879 # Walk down the delta chain, building a stack of deltas to reach
3880 # the requested object.
3881 base_offset: int | None = offset
3882 base_type = type
3883 base_obj = obj
3884 delta_stack = []
3885 while base_type in DELTA_TYPES:
3886 prev_offset = base_offset
3887 if get_ref is None:
3888 get_ref = self.get_ref
3889 if base_type == OFS_DELTA:
3890 (delta_offset, delta) = base_obj
3891 # TODO: clean up asserts and replace with nicer error messages
3892 assert isinstance(delta_offset, int), (
3893 f"Expected int, got {delta_offset.__class__}"
3894 )
3895 assert base_offset is not None
3896 base_offset = base_offset - delta_offset
3897 base_type, base_obj = self.data.get_object_at(base_offset)
3898 assert isinstance(base_type, int)
3899 elif base_type == REF_DELTA:
3900 (basename, delta) = base_obj
3901 assert isinstance(basename, bytes) and len(basename) == 20
3902 base_offset, base_type, base_obj = get_ref(cast(RawObjectID, basename))
3903 assert isinstance(base_type, int)
3904 if base_offset == prev_offset: # object is based on itself
3905 raise UnresolvedDeltas([basename])
3906 delta_stack.append((prev_offset, base_type, delta))
3908 # Now grab the base object (mustn't be a delta) and apply the
3909 # deltas all the way up the stack.
3910 chunks = base_obj
3911 for prev_offset, _delta_type, delta in reversed(delta_stack):
3912 # Convert chunks to bytes for apply_delta if needed
3913 if isinstance(chunks, list):
3914 chunks_bytes = b"".join(chunks)
3915 elif isinstance(chunks, tuple):
3916 # For tuple type, second element is the actual data
3917 _, chunk_data = chunks
3918 if isinstance(chunk_data, list):
3919 chunks_bytes = b"".join(chunk_data)
3920 else:
3921 chunks_bytes = chunk_data
3922 else:
3923 chunks_bytes = chunks
3925 # Apply delta and get result as list
3926 chunks = apply_delta(chunks_bytes, delta)
3928 if prev_offset is not None:
3929 self.data._offset_cache[prev_offset] = base_type, chunks
3930 return base_type, chunks
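# Illustrative sketch of a single delta application using the module-level
# apply_delta helper used above (base_bytes/delta_bytes are placeholders):
#
#     chunks = apply_delta(base_bytes, delta_bytes)   # list of bytes chunks
#     restored = b"".join(chunks)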
3932 def entries(
3933 self, progress: Callable[[int, int], None] | None = None
3934 ) -> Iterator[PackIndexEntry]:
3935 """Yield entries summarizing the contents of this pack.
3937 Args:
3938 progress: Progress function, called with current and total
3939 object count.
3940 Returns: Iterator of tuples with (sha, offset, crc32)
3941 """
3942 return self.data.iterentries(
3943 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3944 )
3946 def sorted_entries(
3947 self, progress: ProgressFn | None = None
3948 ) -> Iterator[PackIndexEntry]:
3949 """Return entries in this pack, sorted by SHA.
3951 Args:
3952 progress: Progress function, called with current and total
3953 object count
3954 Returns: Iterator of tuples with (sha, offset, crc32)
3955 """
3956 return iter(
3957 self.data.sorted_entries(
3958 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3959 )
3960 )
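# Illustrative sketch (entry SHAs are raw 20-byte digests, as elsewhere in
# this module):
#
#     for sha, offset, crc32 in pack.sorted_entries():
#         print(sha_to_hex(RawObjectID(sha)), offset)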
3962 def get_unpacked_object(
3963 self,
3964 sha: ObjectID | RawObjectID,
3965 *,
3966 include_comp: bool = False,
3967 convert_ofs_delta: bool = True,
3968 ) -> UnpackedObject:
3969 """Get the unpacked object for a sha.
3971 Args:
3972 sha: SHA of object to fetch
3973 include_comp: Whether to include compression data in UnpackedObject
3974 convert_ofs_delta: Whether to convert offset deltas to ref deltas
3975 """
3976 offset = self.index.object_offset(sha)
3977 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3978 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3979 assert isinstance(unpacked.delta_base, int)
3980 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3981 unpacked.pack_type_num = REF_DELTA
3982 return unpacked
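# Illustrative sketch (sha is a placeholder ObjectID):
#
#     uo = pack.get_unpacked_object(sha)       # convert_ofs_delta defaults to True
#     if uo.pack_type_num == REF_DELTA:
#         base = uo.delta_base                 # SHA of the delta base object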
3985def extend_pack(
3986 f: BinaryIO,
3987 object_ids: Set["RawObjectID"],
3988 get_raw: Callable[["RawObjectID | ObjectID"], tuple[int, bytes]],
3989 *,
3990 compression_level: int = -1,
3991 progress: Callable[[bytes], None] | None = None,
3992) -> tuple[bytes, list[tuple["RawObjectID", int, int]]]:
3993 """Extend a pack file with more objects.
3995 The caller should make sure that object_ids does not contain any objects
3996 that are already in the pack.
Returns: Tuple of the new pack checksum and a list of (sha, offset, crc32)
entries for the appended objects.
3997 """
3998 # Update the header with the new number of objects.
3999 f.seek(0)
4000 _version, num_objects = read_pack_header(f.read)
4002 if object_ids:
4003 f.seek(0)
4004 write_pack_header(f.write, num_objects + len(object_ids))
4006 # Must flush before reading (http://bugs.python.org/issue3207)
4007 f.flush()
4009 # Rescan the rest of the pack, computing the SHA with the new header.
4010 new_sha = compute_file_sha(f, end_ofs=-20)
4012 # Must reposition before writing (http://bugs.python.org/issue3207)
4013 f.seek(0, os.SEEK_CUR)
4015 extra_entries = []
4017 # Complete the pack.
4018 for i, object_id in enumerate(object_ids):
4019 if progress is not None:
4020 progress(
4021 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
4022 )
4023 assert len(object_id) == 20
4024 type_num, data = get_raw(object_id)
4025 offset = f.tell()
4026 crc32 = write_pack_object(
4027 f.write,
4028 type_num,
4029 [data], # write_pack_object expects a list of chunks
4030 sha=new_sha,
4031 compression_level=compression_level,
4032 )
4033 extra_entries.append((object_id, offset, crc32))
4034 pack_sha = new_sha.digest()
4035 f.write(pack_sha)
4036 return pack_sha, extra_entries
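# Illustrative sketch of extend_pack (pack_path, missing_ids and object_store
# are assumptions; get_raw must return (type_num, bytes) as in the signature
# above, and progress receives bytes):
#
#     with open(pack_path, "r+b") as f:
#         pack_sha, new_entries = extend_pack(
#             f,
#             missing_ids,                     # raw 20-byte SHAs not yet in the pack
#             object_store.get_raw,
#             progress=lambda msg: sys.stderr.buffer.write(msg),
#         )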
4039try:
4040 from dulwich._pack import ( # type: ignore
4041 apply_delta,
4042 bisect_find_sha,
4043 )
4044except ImportError:
4045 pass
4047# Try to import the Rust version of create_delta
4048try:
4049 from dulwich._pack import create_delta as _create_delta_rs
4050except ImportError:
4051 pass
4052else:
4053 # The Rust create_delta returns a single bytes object; wrap it so callers
4053 # get an Iterator[bytes] of chunks, matching the pure-Python API.
4054 def _create_delta_rs_wrapper(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
4055 """Wrapper for Rust create_delta to match Python API."""
4056 yield _create_delta_rs(base_buf, target_buf)
4058 create_delta = _create_delta_rs_wrapper
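# Illustrative sketch: with either backend, create_delta yields delta chunks,
# so callers can simply join the result (base_buf/target_buf are placeholders):
#
#     delta = b"".join(create_delta(base_buf, target_buf))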