Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 23%
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28A pack has two parts: the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in each of the index files until you find a
32match for the object name. The offset obtained from the index is then used
33to locate the object in the corresponding pack file.
34"""
36__all__ = [
37 "DEFAULT_PACK_DELTA_WINDOW_SIZE",
38 "DEFAULT_PACK_INDEX_VERSION",
39 "DELTA_TYPES",
40 "OFS_DELTA",
41 "PACK_SPOOL_FILE_MAX_SIZE",
42 "REF_DELTA",
43 "DeltaChainIterator",
44 "FilePackIndex",
45 "MemoryPackIndex",
46 "ObjectContainer",
47 "Pack",
48 "PackChunkGenerator",
49 "PackData",
50 "PackFileDisappeared",
51 "PackHint",
52 "PackIndex",
53 "PackIndex1",
54 "PackIndex2",
55 "PackIndex3",
56 "PackIndexEntry",
57 "PackIndexer",
58 "PackInflater",
59 "PackStreamCopier",
60 "PackStreamReader",
61 "PackedObjectContainer",
62 "SHA1Reader",
63 "SHA1Writer",
64 "UnpackedObject",
65 "UnpackedObjectIterator",
66 "UnpackedObjectStream",
67 "UnresolvedDeltas",
68 "apply_delta",
69 "bisect_find_sha",
70 "chunks_length",
71 "compute_file_sha",
72 "deltas_from_sorted_objects",
73 "deltify_pack_objects",
74 "extend_pack",
75 "find_reusable_deltas",
76 "full_unpacked_object",
77 "generate_unpacked_objects",
78 "iter_sha1",
79 "load_pack_index",
80 "load_pack_index_file",
81 "obj_sha",
82 "pack_header_chunks",
83 "pack_object_chunks",
84 "pack_object_header",
85 "pack_objects_to_data",
86 "read_pack_header",
87 "read_zlib_chunks",
88 "sort_objects_for_delta",
89 "take_msb_bytes",
90 "unpack_object",
91 "write_pack",
92 "write_pack_data",
93 "write_pack_from_container",
94 "write_pack_header",
95 "write_pack_index",
96 "write_pack_object",
97 "write_pack_objects",
98]
100import binascii
101from collections import defaultdict, deque
102from contextlib import suppress
103from io import BytesIO, UnsupportedOperation
105try:
106 from cdifflib import CSequenceMatcher as SequenceMatcher
107except ModuleNotFoundError:
108 from difflib import SequenceMatcher
110import os
111import struct
112import sys
113import warnings
114import zlib
115from collections.abc import Callable, Iterable, Iterator, Sequence, Set
116from hashlib import sha1
117from itertools import chain
118from os import SEEK_CUR, SEEK_END
119from struct import unpack_from
120from types import TracebackType
121from typing import (
122 IO,
123 TYPE_CHECKING,
124 Any,
125 BinaryIO,
126 Generic,
127 Protocol,
128 TypeVar,
129 cast,
130)
132try:
133 import mmap
134except ImportError:
135 has_mmap = False
136else:
137 has_mmap = True
139if TYPE_CHECKING:
140 from _hashlib import HASH as HashObject
142 from .bitmap import PackBitmap
143 from .commit_graph import CommitGraph
144 from .object_store import BaseObjectStore
145 from .refs import Ref
147# For some reason the above try, except fails to set has_mmap = False for plan9
148if sys.platform == "Plan9":
149 has_mmap = False
151from . import replace_me
152from .errors import ApplyDeltaError, ChecksumMismatch
153from .file import GitFile, _GitFile
154from .lru_cache import LRUSizeCache
155from .objects import (
156 ObjectID,
157 RawObjectID,
158 ShaFile,
159 hex_to_sha,
160 object_header,
161 sha_to_hex,
162)
164OFS_DELTA = 6
165REF_DELTA = 7
167DELTA_TYPES = (OFS_DELTA, REF_DELTA)
170DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
172# Keep pack files under 16Mb in memory, otherwise write them out to disk
173PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
175# Default pack index version to use when none is specified
176DEFAULT_PACK_INDEX_VERSION = 2
179OldUnpackedObject = tuple[bytes | int, list[bytes]] | list[bytes]
180ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
181ProgressFn = Callable[[int, str], None]
182PackHint = tuple[int, bytes | None]
185class UnresolvedDeltas(Exception):
186 """Delta objects could not be resolved."""
188 def __init__(self, shas: list[bytes]) -> None:
189 """Initialize UnresolvedDeltas exception.
191 Args:
192 shas: List of SHA hashes for unresolved delta objects
193 """
194 self.shas = shas
197class ObjectContainer(Protocol):
198 """Protocol for objects that can contain git objects."""
200 def add_object(self, obj: ShaFile) -> None:
201 """Add a single object to this object store."""
203 def add_objects(
204 self,
205 objects: Sequence[tuple[ShaFile, str | None]],
206 progress: Callable[..., None] | None = None,
207 ) -> "Pack | None":
208 """Add a set of objects to this object store.
210 Args:
211 objects: Iterable over a list of (object, path) tuples
212 progress: Progress callback for object insertion
213 Returns: Optional Pack object of the objects written.
214 """
216 def __contains__(self, sha1: "ObjectID") -> bool:
217 """Check if a hex sha is present."""
219 def __getitem__(self, sha1: "ObjectID | RawObjectID") -> ShaFile:
220 """Retrieve an object."""
222 def get_commit_graph(self) -> "CommitGraph | None":
223 """Get the commit graph for this object store.
225 Returns:
226 CommitGraph object if available, None otherwise
227 """
228 return None
231class PackedObjectContainer(ObjectContainer):
232 """Container for objects packed in a pack file."""
234 def get_unpacked_object(
235 self, sha1: "ObjectID | RawObjectID", *, include_comp: bool = False
236 ) -> "UnpackedObject":
237 """Get a raw unresolved object.
239 Args:
240 sha1: SHA-1 hash of the object
241 include_comp: Whether to include compressed data
243 Returns:
244 UnpackedObject instance
245 """
246 raise NotImplementedError(self.get_unpacked_object)
248 def iterobjects_subset(
249 self, shas: Iterable["ObjectID"], *, allow_missing: bool = False
250 ) -> Iterator[ShaFile]:
251 """Iterate over a subset of objects.
253 Args:
254 shas: Iterable of object SHAs to retrieve
255 allow_missing: If True, skip missing objects
257 Returns:
258 Iterator of ShaFile objects
259 """
260 raise NotImplementedError(self.iterobjects_subset)
262 def iter_unpacked_subset(
263 self,
264 shas: Iterable["ObjectID | RawObjectID"],
265 *,
266 include_comp: bool = False,
267 allow_missing: bool = False,
268 convert_ofs_delta: bool = True,
269 ) -> Iterator["UnpackedObject"]:
270 """Iterate over unpacked objects from a subset of SHAs.
272 Args:
273 shas: Set of object SHAs to retrieve
274 include_comp: Include compressed data if True
275 allow_missing: If True, skip missing objects
276 convert_ofs_delta: If True, convert offset deltas to ref deltas
278 Returns:
279 Iterator of UnpackedObject instances
280 """
281 raise NotImplementedError(self.iter_unpacked_subset)
284class UnpackedObjectStream:
285 """Abstract base class for a stream of unpacked objects."""
287 def __iter__(self) -> Iterator["UnpackedObject"]:
288 """Iterate over unpacked objects."""
289 raise NotImplementedError(self.__iter__)
291 def __len__(self) -> int:
292 """Return the number of objects in the stream."""
293 raise NotImplementedError(self.__len__)
296def take_msb_bytes(
297 read: Callable[[int], bytes], crc32: int | None = None
298) -> tuple[list[int], int | None]:
299 """Read bytes marked with most significant bit.
301 Args:
302 read: Read function
303 crc32: Optional CRC32 checksum to update
305 Returns:
306 Tuple of (list of bytes read, updated CRC32 or None)
307 """
308 ret: list[int] = []
309 while len(ret) == 0 or ret[-1] & 0x80:
310 b = read(1)
311 if crc32 is not None:
312 crc32 = binascii.crc32(b, crc32)
313 ret.append(ord(b[:1]))
314 return ret, crc32
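# Editor's illustration (not part of dulwich): take_msb_bytes keeps reading
# single bytes while the most significant bit is set, which is how pack
# headers encode variable-length integers.
def _example_take_msb_bytes():
    from io import BytesIO
    # 0x91 has its MSB set, so reading continues; 0x2e does not, so it stops.
    read = BytesIO(bytes([0x91, 0x2e]) + b"rest of the pack").read
    parts, crc = take_msb_bytes(read)
    assert parts == [0x91, 0x2e] and crc is None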
317class PackFileDisappeared(Exception):
318 """Raised when a pack file unexpectedly disappears."""
320 def __init__(self, obj: object) -> None:
321 """Initialize PackFileDisappeared exception.
323 Args:
324 obj: The object that triggered the exception
325 """
326 self.obj = obj
329class UnpackedObject:
330 """Class encapsulating an object unpacked from a pack file.
332 These objects should only be created from within unpack_object. Most
333 members start out as empty and are filled in at various points by
334 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
336 End users of this object should take care that the function they're getting
337 this object from is guaranteed to set the members they need.
338 """
340 __slots__ = [
341 "_sha", # Cached binary SHA.
342 "comp_chunks", # Compressed object chunks.
343 "crc32", # CRC32.
344 "decomp_chunks", # Decompressed object chunks.
345 "decomp_len", # Decompressed length of this object.
346 "delta_base", # Delta base offset or SHA.
347 "obj_chunks", # Decompressed and delta-resolved chunks.
348 "obj_type_num", # Type of this object.
349 "offset", # Offset in its pack.
350 "pack_type_num", # Type of this object in the pack (may be a delta).
351 ]
353 obj_type_num: int | None
354 obj_chunks: list[bytes] | None
355 delta_base: None | bytes | int
356 decomp_chunks: list[bytes]
357 comp_chunks: list[bytes] | None
358 decomp_len: int | None
359 crc32: int | None
360 offset: int | None
361 pack_type_num: int
362 _sha: bytes | None
364 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
365 # methods of this object.
366 def __init__(
367 self,
368 pack_type_num: int,
369 *,
370 delta_base: None | bytes | int = None,
371 decomp_len: int | None = None,
372 crc32: int | None = None,
373 sha: bytes | None = None,
374 decomp_chunks: list[bytes] | None = None,
375 offset: int | None = None,
376 ) -> None:
377 """Initialize an UnpackedObject.
379 Args:
380 pack_type_num: Type number of this object in the pack
381 delta_base: Delta base (offset or SHA) if this is a delta object
382 decomp_len: Decompressed length of this object
383 crc32: CRC32 checksum
384 sha: SHA-1 hash of the object
385 decomp_chunks: Decompressed chunks
386 offset: Offset in the pack file
387 """
388 self.offset = offset
389 self._sha = sha
390 self.pack_type_num = pack_type_num
391 self.delta_base = delta_base
392 self.comp_chunks = None
393 self.decomp_chunks: list[bytes] = decomp_chunks or []
394 if decomp_chunks is not None and decomp_len is None:
395 self.decomp_len = sum(map(len, decomp_chunks))
396 else:
397 self.decomp_len = decomp_len
398 self.crc32 = crc32
400 if pack_type_num in DELTA_TYPES:
401 self.obj_type_num = None
402 self.obj_chunks = None
403 else:
404 self.obj_type_num = pack_type_num
405 self.obj_chunks = self.decomp_chunks
406 self.delta_base = delta_base
408 def sha(self) -> RawObjectID:
409 """Return the binary SHA of this object."""
410 if self._sha is None:
411 assert self.obj_type_num is not None and self.obj_chunks is not None
412 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
413 return RawObjectID(self._sha)
415 def sha_file(self) -> ShaFile:
416 """Return a ShaFile from this object."""
417 assert self.obj_type_num is not None and self.obj_chunks is not None
418 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
420 # Only provided for backwards compatibility with code that expects either
421 # chunks or a delta tuple.
422 def _obj(self) -> OldUnpackedObject:
423 """Return the decompressed chunks, or (delta base, delta chunks)."""
424 if self.pack_type_num in DELTA_TYPES:
425 assert isinstance(self.delta_base, (bytes, int))
426 return (self.delta_base, self.decomp_chunks)
427 else:
428 return self.decomp_chunks
430 def __eq__(self, other: object) -> bool:
431 """Check equality with another UnpackedObject."""
432 if not isinstance(other, UnpackedObject):
433 return False
434 for slot in self.__slots__:
435 if getattr(self, slot) != getattr(other, slot):
436 return False
437 return True
439 def __ne__(self, other: object) -> bool:
440 """Check inequality with another UnpackedObject."""
441 return not (self == other)
443 def __repr__(self) -> str:
444 """Return string representation of this UnpackedObject."""
445 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
446 return "{}({})".format(self.__class__.__name__, ", ".join(data))
449_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
452def read_zlib_chunks(
453 read_some: Callable[[int], bytes],
454 unpacked: UnpackedObject,
455 include_comp: bool = False,
456 buffer_size: int = _ZLIB_BUFSIZE,
457) -> bytes:
458 """Read zlib data from a buffer.
460 This function requires that the buffer have additional data following the
461 compressed data, which is guaranteed to be the case for git pack files.
463 Args:
464 read_some: Read function that returns at least one byte, but may
465 return less than the requested size.
466 unpacked: An UnpackedObject to write result data to. If its crc32
467 attr is not None, the CRC32 of the compressed bytes will be computed
468 using this starting CRC32.
469 After this function, will have the following attrs set:
470 * comp_chunks (if include_comp is True)
471 * decomp_chunks
472 * decomp_len
473 * crc32
474 include_comp: If True, include compressed data in the result.
475 buffer_size: Size of the read buffer.
476 Returns: Leftover unused data from the decompression.
478 Raises:
479 zlib.error: if a decompression error occurred.
480 """
481 if unpacked.decomp_len is None or unpacked.decomp_len <= -1:
482 raise ValueError("non-negative zlib data stream size expected")
483 decomp_obj = zlib.decompressobj()
485 comp_chunks = []
486 decomp_chunks = unpacked.decomp_chunks
487 decomp_len = 0
488 crc32 = unpacked.crc32
490 while True:
491 add = read_some(buffer_size)
492 if not add:
493 raise zlib.error("EOF before end of zlib stream")
494 comp_chunks.append(add)
495 decomp = decomp_obj.decompress(add)
496 decomp_len += len(decomp)
497 decomp_chunks.append(decomp)
498 unused = decomp_obj.unused_data
499 if unused:
500 left = len(unused)
501 if crc32 is not None:
502 crc32 = binascii.crc32(add[:-left], crc32)
503 if include_comp:
504 comp_chunks[-1] = add[:-left]
505 break
506 elif crc32 is not None:
507 crc32 = binascii.crc32(add, crc32)
508 if crc32 is not None:
509 crc32 &= 0xFFFFFFFF
511 if decomp_len != unpacked.decomp_len:
512 raise zlib.error("decompressed data does not match expected size")
514 unpacked.crc32 = crc32
515 if include_comp:
516 unpacked.comp_chunks = comp_chunks
517 return unused
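# Editor's illustration (not part of dulwich): read_zlib_chunks inflates into
# an UnpackedObject and hands back whatever trailed the zlib stream. 3 is the
# pack type number for a blob; the trailing bytes stand in for the data that
# always follows an entry in a real pack.
def _example_read_zlib_chunks():
    from io import BytesIO
    payload = b"hello"
    stream = zlib.compress(payload) + b"next entry"
    unpacked = UnpackedObject(3, decomp_len=len(payload))
    leftover = read_zlib_chunks(BytesIO(stream).read, unpacked)
    assert b"".join(unpacked.decomp_chunks) == payload
    assert leftover == b"next entry"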
520def iter_sha1(iter: Iterable[bytes]) -> bytes:
521 """Return the hexdigest of the SHA1 over a set of names.
523 Args:
524 iter: Iterator over string objects
525 Returns: 40-byte hex sha1 digest
526 """
527 sha = sha1()
528 for name in iter:
529 sha.update(name)
530 return sha.hexdigest().encode("ascii")
533def load_pack_index(path: str | os.PathLike[str]) -> "PackIndex":
534 """Load an index file by path.
536 Args:
537 path: Path to the index file
538 Returns: A PackIndex loaded from the given path
539 """
540 with GitFile(path, "rb") as f:
541 return load_pack_index_file(path, f)
544def _load_file_contents(
545 f: IO[bytes] | _GitFile, size: int | None = None
546) -> tuple[bytes | Any, int]:
547 """Load contents from a file, preferring mmap when possible.
549 Args:
550 f: File-like object to load
551 size: Expected size, or None to determine from file
552 Returns: Tuple of (contents, size)
553 """
554 try:
555 fd = f.fileno()
556 except (UnsupportedOperation, AttributeError):
557 fd = None
558 # Attempt to use mmap if possible
559 if fd is not None:
560 if size is None:
561 size = os.fstat(fd).st_size
562 if has_mmap:
563 try:
564 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
565 except (OSError, ValueError):
566 # Can't mmap - perhaps a socket or invalid file descriptor
567 pass
568 else:
569 return contents, size
570 contents_bytes = f.read()
571 size = len(contents_bytes)
572 return contents_bytes, size
575def load_pack_index_file(
576 path: str | os.PathLike[str], f: IO[bytes] | _GitFile
577) -> "PackIndex":
578 """Load an index file from a file-like object.
580 Args:
581 path: Path for the index file
582 f: File-like object
583 Returns: A PackIndex loaded from the given file
584 """
585 contents, size = _load_file_contents(f)
586 if contents[:4] == b"\377tOc":
587 version = struct.unpack(b">L", contents[4:8])[0]
588 if version == 2:
589 return PackIndex2(path, file=f, contents=contents, size=size)
590 elif version == 3:
591 return PackIndex3(path, file=f, contents=contents, size=size)
592 else:
593 raise KeyError(f"Unknown pack index format {version}")
594 else:
595 return PackIndex1(path, file=f, contents=contents, size=size)
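# Editor's illustration (not part of dulwich): the dispatch above keys off the
# b"\377tOc" magic; anything without it is treated as a v1 index. This builds
# the smallest possible v1 index -- an all-zero fan-out table plus two 20-byte
# checksums -- describing a pack with no objects.
def _example_load_empty_v1_index():
    from io import BytesIO
    contents = b"\x00" * (0x100 * 4) + b"\x00" * 20 + b"\x00" * 20
    index = load_pack_index_file("empty.idx", BytesIO(contents))
    assert isinstance(index, PackIndex1) and len(index) == 0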
598def bisect_find_sha(
599 start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes]
600) -> int | None:
601 """Find a SHA in a data blob with sorted SHAs.
603 Args:
604 start: Start index of range to search
605 end: End index of range to search
606 sha: Sha to find
607 unpack_name: Callback to retrieve SHA by index
608 Returns: Index of the SHA, or None if it wasn't found
609 """
610 assert start <= end
611 while start <= end:
612 i = (start + end) // 2
613 file_sha = unpack_name(i)
614 if file_sha < sha:
615 start = i + 1
616 elif file_sha > sha:
617 end = i - 1
618 else:
619 return i
620 return None
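# Editor's illustration (not part of dulwich): bisect_find_sha only needs a
# sorted sequence of binary names and a callback that fetches the i-th one.
def _example_bisect_find_sha():
    names = sorted(sha1(bytes([i])).digest() for i in range(8))
    assert bisect_find_sha(0, len(names) - 1, names[5], lambda i: names[i]) == 5
    assert bisect_find_sha(0, len(names) - 1, b"\x00" * 20, lambda i: names[i]) is None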
623PackIndexEntry = tuple[RawObjectID, int, int | None]
626class PackIndex:
627 """An index in to a packfile.
629 Given a sha id of an object a pack index can tell you the location in the
630 packfile of that object if it has it.
631 """
633 # Default to SHA-1 for backward compatibility
634 hash_algorithm = 1
635 hash_size = 20
637 def __eq__(self, other: object) -> bool:
638 """Check equality with another PackIndex."""
639 if not isinstance(other, PackIndex):
640 return False
642 for (name1, _, _), (name2, _, _) in zip(
643 self.iterentries(), other.iterentries()
644 ):
645 if name1 != name2:
646 return False
647 return True
649 def __ne__(self, other: object) -> bool:
650 """Check if this pack index is not equal to another."""
651 return not self.__eq__(other)
653 def __len__(self) -> int:
654 """Return the number of entries in this pack index."""
655 raise NotImplementedError(self.__len__)
657 def __iter__(self) -> Iterator[ObjectID]:
658 """Iterate over the SHAs in this pack."""
659 return map(lambda sha: sha_to_hex(RawObjectID(sha)), self._itersha())
661 def iterentries(self) -> Iterator[PackIndexEntry]:
662 """Iterate over the entries in this pack index.
664 Returns: iterator over tuples with object name, offset in packfile and
665 crc32 checksum.
666 """
667 raise NotImplementedError(self.iterentries)
669 def get_pack_checksum(self) -> bytes | None:
670 """Return the SHA1 checksum stored for the corresponding packfile.
672 Returns: 20-byte binary digest, or None if not available
673 """
674 raise NotImplementedError(self.get_pack_checksum)
676 @replace_me(since="0.21.0", remove_in="0.23.0")
677 def object_index(self, sha: ObjectID | RawObjectID) -> int:
678 """Return the index for the given SHA.
680 Args:
681 sha: SHA-1 hash
683 Returns:
 684 Offset into the corresponding pack file (deprecated alias for object_offset)
685 """
686 return self.object_offset(sha)
688 def object_offset(self, sha: ObjectID | RawObjectID) -> int:
689 """Return the offset in to the corresponding packfile for the object.
691 Given the name of an object it will return the offset that object
692 lives at within the corresponding pack file. If the pack file doesn't
693 have the object then None will be returned.
694 """
695 raise NotImplementedError(self.object_offset)
697 def object_sha1(self, index: int) -> bytes:
698 """Return the SHA1 corresponding to the index in the pack file."""
699 for name, offset, _crc32 in self.iterentries():
700 if offset == index:
701 return name
702 else:
703 raise KeyError(index)
705 def _object_offset(self, sha: bytes) -> int:
706 """See object_offset.
708 Args:
 709 sha: A *binary* SHA string (20 bytes long).
710 """
711 raise NotImplementedError(self._object_offset)
713 def objects_sha1(self) -> bytes:
714 """Return the hex SHA1 over all the shas of all objects in this pack.
716 Note: This is used for the filename of the pack.
717 """
718 return iter_sha1(self._itersha())
720 def _itersha(self) -> Iterator[bytes]:
721 """Yield all the SHA1's of the objects in the index, sorted."""
722 raise NotImplementedError(self._itersha)
724 def iter_prefix(self, prefix: bytes) -> Iterator[RawObjectID]:
725 """Iterate over all SHA1s with the given prefix.
727 Args:
728 prefix: Binary prefix to match
729 Returns: Iterator of matching SHA1s
730 """
731 # Default implementation for PackIndex classes that don't override
732 for sha, _, _ in self.iterentries():
733 if sha.startswith(prefix):
734 yield RawObjectID(sha)
736 def close(self) -> None:
737 """Close any open files."""
739 def check(self) -> None:
740 """Check the consistency of this pack index."""
743class MemoryPackIndex(PackIndex):
744 """Pack index that is stored entirely in memory."""
746 def __init__(
747 self,
748 entries: list[PackIndexEntry],
749 pack_checksum: bytes | None = None,
750 ) -> None:
751 """Create a new MemoryPackIndex.
753 Args:
754 entries: Sequence of name, idx, crc32 (sorted)
755 pack_checksum: Optional pack checksum
756 """
757 self._by_sha = {}
758 self._by_offset = {}
759 for name, offset, _crc32 in entries:
760 self._by_sha[name] = offset
761 self._by_offset[offset] = name
762 self._entries = entries
763 self._pack_checksum = pack_checksum
765 def get_pack_checksum(self) -> bytes | None:
766 """Return the SHA checksum stored for the corresponding packfile."""
767 return self._pack_checksum
769 def __len__(self) -> int:
770 """Return the number of entries in this pack index."""
771 return len(self._entries)
773 def object_offset(self, sha: ObjectID | RawObjectID) -> int:
774 """Return the offset for the given SHA.
776 Args:
777 sha: SHA to look up (binary or hex)
778 Returns: Offset in the pack file
779 """
780 if len(sha) == 40:
781 sha = hex_to_sha(cast(ObjectID, sha))
782 return self._by_sha[cast(RawObjectID, sha)]
784 def object_sha1(self, offset: int) -> bytes:
785 """Return the SHA1 for the object at the given offset."""
786 return self._by_offset[offset]
788 def _itersha(self) -> Iterator[bytes]:
789 """Iterate over all SHA1s in the index."""
790 return iter(self._by_sha)
792 def iterentries(self) -> Iterator[PackIndexEntry]:
793 """Iterate over all index entries."""
794 return iter(self._entries)
796 @classmethod
797 def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex":
798 """Create a MemoryPackIndex from a PackData object."""
799 return MemoryPackIndex(
800 list(pack_data.sorted_entries()), pack_data.get_stored_checksum()
801 )
803 @classmethod
804 def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex":
805 """Create a copy of another PackIndex in memory."""
806 return cls(list(other_index.iterentries()), other_index.get_pack_checksum())
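# Editor's illustration (not part of dulwich): a MemoryPackIndex is just the
# sorted (name, offset, crc32) entries held in dictionaries.
def _example_memory_pack_index():
    name = sha1(b"some object").digest()
    index = MemoryPackIndex([(name, 12, None)])
    assert len(index) == 1
    assert index.object_offset(name) == 12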
809class FilePackIndex(PackIndex):
810 """Pack index that is based on a file.
 812 To do a lookup it opens the file and reads the fan-out table: 256 4-byte
 813 entries indexed by the first byte of the sha id. The value at a given
 814 index is the end of the group of entries whose shas start with that byte;
 815 the value at the previous index gives the start of the group.
 816 Entries are sorted by sha id within a group, so those start and end
 817 offsets bound a bisection search that determines whether the value is
 818 present.
819 """
821 _fan_out_table: list[int]
822 _file: IO[bytes] | _GitFile
824 def __init__(
825 self,
826 filename: str | os.PathLike[str],
827 file: IO[bytes] | _GitFile | None = None,
828 contents: "bytes | mmap.mmap | None" = None,
829 size: int | None = None,
830 ) -> None:
831 """Create a pack index object.
833 Provide it with the name of the index file to consider, and it will map
834 it whenever required.
835 """
836 self._filename = filename
837 # Take the size now, so it can be checked each time we map the file to
838 # ensure that it hasn't changed.
839 if file is None:
840 self._file = GitFile(filename, "rb")
841 else:
842 self._file = file
843 if contents is None:
844 self._contents, self._size = _load_file_contents(self._file, size)
845 else:
846 self._contents = contents
847 self._size = size if size is not None else len(contents)
849 @property
850 def path(self) -> str:
851 """Return the path to this index file."""
852 return os.fspath(self._filename)
854 def __eq__(self, other: object) -> bool:
855 """Check equality with another FilePackIndex."""
856 # Quick optimization:
857 if (
858 isinstance(other, FilePackIndex)
859 and self._fan_out_table != other._fan_out_table
860 ):
861 return False
863 return super().__eq__(other)
865 def close(self) -> None:
866 """Close the underlying file and any mmap."""
867 self._file.close()
868 close_fn = getattr(self._contents, "close", None)
869 if close_fn is not None:
870 close_fn()
872 def __len__(self) -> int:
873 """Return the number of entries in this pack index."""
874 return self._fan_out_table[-1]
876 def _unpack_entry(self, i: int) -> PackIndexEntry:
877 """Unpack the i-th entry in the index file.
879 Returns: Tuple with object name (SHA), offset in pack file and CRC32
880 checksum (if known).
881 """
882 raise NotImplementedError(self._unpack_entry)
884 def _unpack_name(self, i: int) -> bytes:
885 """Unpack the i-th name from the index file."""
886 raise NotImplementedError(self._unpack_name)
888 def _unpack_offset(self, i: int) -> int:
889 """Unpack the i-th object offset from the index file."""
890 raise NotImplementedError(self._unpack_offset)
892 def _unpack_crc32_checksum(self, i: int) -> int | None:
893 """Unpack the crc32 checksum for the ith object from the index file."""
894 raise NotImplementedError(self._unpack_crc32_checksum)
896 def _itersha(self) -> Iterator[bytes]:
897 """Iterate over all SHA1s in the index."""
898 for i in range(len(self)):
899 yield self._unpack_name(i)
901 def iterentries(self) -> Iterator[PackIndexEntry]:
902 """Iterate over the entries in this pack index.
904 Returns: iterator over tuples with object name, offset in packfile and
905 crc32 checksum.
906 """
907 for i in range(len(self)):
908 yield self._unpack_entry(i)
910 def _read_fan_out_table(self, start_offset: int) -> list[int]:
911 """Read the fan-out table from the index.
913 The fan-out table contains 256 entries mapping first byte values
914 to the number of objects with SHA1s less than or equal to that byte.
916 Args:
917 start_offset: Offset in the file where the fan-out table starts
918 Returns: List of 256 integers
919 """
920 ret = []
921 for i in range(0x100):
922 fanout_entry = self._contents[
923 start_offset + i * 4 : start_offset + (i + 1) * 4
924 ]
925 ret.append(struct.unpack(">L", fanout_entry)[0])
926 return ret
928 def check(self) -> None:
929 """Check that the stored checksum matches the actual checksum."""
930 actual = self.calculate_checksum()
931 stored = self.get_stored_checksum()
932 if actual != stored:
933 raise ChecksumMismatch(stored, actual)
935 def calculate_checksum(self) -> bytes:
936 """Calculate the SHA1 checksum over this pack index.
938 Returns: This is a 20-byte binary digest
939 """
940 return sha1(self._contents[:-20]).digest()
942 def get_pack_checksum(self) -> bytes:
943 """Return the SHA1 checksum stored for the corresponding packfile.
945 Returns: 20-byte binary digest
946 """
947 return bytes(self._contents[-40:-20])
949 def get_stored_checksum(self) -> bytes:
950 """Return the SHA1 checksum stored for this index.
952 Returns: 20-byte binary digest
953 """
954 return bytes(self._contents[-20:])
956 def object_offset(self, sha: ObjectID | RawObjectID) -> int:
957 """Return the offset in to the corresponding packfile for the object.
959 Given the name of an object it will return the offset that object
960 lives at within the corresponding pack file. If the pack file doesn't
 961 have the object then a KeyError is raised.
962 """
963 if len(sha) == 40:
964 sha = hex_to_sha(cast(ObjectID, sha))
965 try:
966 return self._object_offset(sha)
967 except ValueError as exc:
968 closed = getattr(self._contents, "closed", None)
969 if closed in (None, True):
970 raise PackFileDisappeared(self) from exc
971 raise
973 def _object_offset(self, sha: bytes) -> int:
974 """See object_offset.
976 Args:
 977 sha: A *binary* SHA string (20 bytes long).
978 """
979 assert len(sha) == 20
980 idx = ord(sha[:1])
981 if idx == 0:
982 start = 0
983 else:
984 start = self._fan_out_table[idx - 1]
985 end = self._fan_out_table[idx]
986 i = bisect_find_sha(start, end, sha, self._unpack_name)
987 if i is None:
988 raise KeyError(sha)
989 return self._unpack_offset(i)
991 def iter_prefix(self, prefix: bytes) -> Iterator[RawObjectID]:
992 """Iterate over all SHA1s with the given prefix."""
993 start = ord(prefix[:1])
994 if start == 0:
995 start = 0
996 else:
997 start = self._fan_out_table[start - 1]
998 end = ord(prefix[:1]) + 1
999 if end == 0x100:
1000 end = len(self)
1001 else:
1002 end = self._fan_out_table[end]
1003 assert start <= end
1004 started = False
1005 for i in range(start, end):
1006 name: bytes = self._unpack_name(i)
1007 if name.startswith(prefix):
1008 yield RawObjectID(name)
1009 started = True
1010 elif started:
1011 break
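# Editor's illustration (not part of dulwich): the fan-out arithmetic used by
# _object_offset above. Entry i of the 256-entry table counts the objects
# whose first SHA byte is <= i, so the group for first byte b occupies
# indices [fan_out[b - 1], fan_out[b]) of the sorted name table.
def _example_fan_out_bounds():
    counts = {0x10: 2, 0xab: 1}  # hypothetical pack with three objects
    fan_out, total = [], 0
    for byte in range(0x100):
        total += counts.get(byte, 0)
        fan_out.append(total)
    assert (fan_out[0x10 - 1], fan_out[0x10]) == (0, 2)  # both 0x10... objects
    assert (fan_out[0xab - 1], fan_out[0xab]) == (2, 3)  # the single 0xab... object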
1014class PackIndex1(FilePackIndex):
1015 """Version 1 Pack Index file."""
1017 def __init__(
1018 self,
1019 filename: str | os.PathLike[str],
1020 file: IO[bytes] | _GitFile | None = None,
1021 contents: bytes | None = None,
1022 size: int | None = None,
1023 ) -> None:
1024 """Initialize a version 1 pack index.
1026 Args:
1027 filename: Path to the index file
1028 file: Optional file object
1029 contents: Optional mmap'd contents
1030 size: Optional size of the index
1031 """
1032 super().__init__(filename, file, contents, size)
1033 self.version = 1
1034 self._fan_out_table = self._read_fan_out_table(0)
1036 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, None]:
1037 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
1038 return (RawObjectID(name), offset, None)
1040 def _unpack_name(self, i: int) -> bytes:
1041 offset = (0x100 * 4) + (i * 24) + 4
1042 return self._contents[offset : offset + 20]
1044 def _unpack_offset(self, i: int) -> int:
1045 offset = (0x100 * 4) + (i * 24)
1046 result = unpack_from(">L", self._contents, offset)[0]
1047 assert isinstance(result, int)
1048 return result
1050 def _unpack_crc32_checksum(self, i: int) -> None:
1051 # Not stored in v1 index files
1052 return None
1055class PackIndex2(FilePackIndex):
1056 """Version 2 Pack Index file."""
1058 def __init__(
1059 self,
1060 filename: str | os.PathLike[str],
1061 file: IO[bytes] | _GitFile | None = None,
1062 contents: bytes | None = None,
1063 size: int | None = None,
1064 ) -> None:
1065 """Initialize a version 2 pack index.
1067 Args:
1068 filename: Path to the index file
1069 file: Optional file object
1070 contents: Optional mmap'd contents
1071 size: Optional size of the index
1072 """
1073 super().__init__(filename, file, contents, size)
1074 if self._contents[:4] != b"\377tOc":
1075 raise AssertionError("Not a v2 pack index file")
1076 (self.version,) = unpack_from(b">L", self._contents, 4)
1077 if self.version != 2:
1078 raise AssertionError(f"Version was {self.version}")
1079 self._fan_out_table = self._read_fan_out_table(8)
1080 self._name_table_offset = 8 + 0x100 * 4
1081 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
1082 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
1083 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
1084 self
1085 )
1087 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, int]:
1088 return (
1089 RawObjectID(self._unpack_name(i)),
1090 self._unpack_offset(i),
1091 self._unpack_crc32_checksum(i),
1092 )
1094 def _unpack_name(self, i: int) -> bytes:
1095 offset = self._name_table_offset + i * 20
1096 return self._contents[offset : offset + 20]
1098 def _unpack_offset(self, i: int) -> int:
1099 offset_pos = self._pack_offset_table_offset + i * 4
1100 offset = unpack_from(">L", self._contents, offset_pos)[0]
1101 assert isinstance(offset, int)
1102 if offset & (2**31):
1103 large_offset_pos = (
1104 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
1105 )
1106 offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
1107 assert isinstance(offset, int)
1108 return offset
1110 def _unpack_crc32_checksum(self, i: int) -> int:
1111 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
1112 assert isinstance(result, int)
1113 return result
1116class PackIndex3(FilePackIndex):
1117 """Version 3 Pack Index file.
1119 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
1120 """
1122 def __init__(
1123 self,
1124 filename: str | os.PathLike[str],
1125 file: IO[bytes] | _GitFile | None = None,
1126 contents: bytes | None = None,
1127 size: int | None = None,
1128 ) -> None:
1129 """Initialize a version 3 pack index.
1131 Args:
1132 filename: Path to the index file
1133 file: Optional file object
1134 contents: Optional mmap'd contents
1135 size: Optional size of the index
1136 """
1137 super().__init__(filename, file, contents, size)
1138 if self._contents[:4] != b"\377tOc":
1139 raise AssertionError("Not a v3 pack index file")
1140 (self.version,) = unpack_from(b">L", self._contents, 4)
1141 if self.version != 3:
1142 raise AssertionError(f"Version was {self.version}")
1144 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1145 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
1146 if self.hash_algorithm == 1:
1147 self.hash_size = 20 # SHA-1
1148 elif self.hash_algorithm == 2:
1149 self.hash_size = 32 # SHA-256
1150 else:
1151 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
1153 # Read length of shortened object names
1154 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
1156 # Calculate offsets based on variable hash size
1157 self._fan_out_table = self._read_fan_out_table(
1158 16
1159 ) # After header (4 + 4 + 4 + 4)
1160 self._name_table_offset = 16 + 0x100 * 4
1161 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
1162 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
1163 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
1164 self
1165 )
1167 def _unpack_entry(self, i: int) -> tuple[RawObjectID, int, int]:
1168 return (
1169 RawObjectID(self._unpack_name(i)),
1170 self._unpack_offset(i),
1171 self._unpack_crc32_checksum(i),
1172 )
1174 def _unpack_name(self, i: int) -> bytes:
1175 offset = self._name_table_offset + i * self.hash_size
1176 return self._contents[offset : offset + self.hash_size]
1178 def _unpack_offset(self, i: int) -> int:
1179 offset_pos = self._pack_offset_table_offset + i * 4
1180 offset = unpack_from(">L", self._contents, offset_pos)[0]
1181 assert isinstance(offset, int)
1182 if offset & (2**31):
1183 large_offset_pos = (
1184 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
1185 )
1186 offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
1187 assert isinstance(offset, int)
1188 return offset
1190 def _unpack_crc32_checksum(self, i: int) -> int:
1191 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
1192 assert isinstance(result, int)
1193 return result
1196def read_pack_header(read: Callable[[int], bytes]) -> tuple[int, int]:
1197 """Read the header of a pack file.
1199 Args:
1200 read: Read function
 1201 Returns: Tuple of (pack version, number of objects).
 1202 Raises: AssertionError if the header is missing or not a valid pack header.
1203 """
1204 header = read(12)
1205 if not header:
1206 raise AssertionError("file too short to contain pack")
1207 if header[:4] != b"PACK":
1208 raise AssertionError(f"Invalid pack header {header!r}")
1209 (version,) = unpack_from(b">L", header, 4)
1210 if version not in (2, 3):
1211 raise AssertionError(f"Version was {version}")
1212 (num_objects,) = unpack_from(b">L", header, 8)
1213 return (version, num_objects)
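# Editor's illustration (not part of dulwich): the 12-byte header is the
# b"PACK" magic followed by a big-endian version and object count.
def _example_read_pack_header():
    from io import BytesIO
    header = b"PACK" + struct.pack(">LL", 2, 0)
    assert read_pack_header(BytesIO(header).read) == (2, 0)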
1216def chunks_length(chunks: bytes | Iterable[bytes]) -> int:
1217 """Get the total length of a sequence of chunks.
1219 Args:
1220 chunks: Either a single bytes object or an iterable of bytes
1221 Returns: Total length in bytes
1222 """
1223 if isinstance(chunks, bytes):
1224 return len(chunks)
1225 else:
1226 return sum(map(len, chunks))
1229def unpack_object(
1230 read_all: Callable[[int], bytes],
1231 read_some: Callable[[int], bytes] | None = None,
1232 compute_crc32: bool = False,
1233 include_comp: bool = False,
1234 zlib_bufsize: int = _ZLIB_BUFSIZE,
1235) -> tuple[UnpackedObject, bytes]:
1236 """Unpack a Git object.
1238 Args:
1239 read_all: Read function that blocks until the number of requested
1240 bytes are read.
1241 read_some: Read function that returns at least one byte, but may not
1242 return the number of bytes requested.
1243 compute_crc32: If True, compute the CRC32 of the compressed data. If
1244 False, the returned CRC32 will be None.
1245 include_comp: If True, include compressed data in the result.
1246 zlib_bufsize: An optional buffer size for zlib operations.
1247 Returns: A tuple of (unpacked, unused), where unused is the unused data
 1248 leftover from decompression, and unpacked is an UnpackedObject with
1249 the following attrs set:
1251 * obj_chunks (for non-delta types)
1252 * pack_type_num
1253 * delta_base (for delta types)
1254 * comp_chunks (if include_comp is True)
1255 * decomp_chunks
1256 * decomp_len
1257 * crc32 (if compute_crc32 is True)
1258 """
1259 if read_some is None:
1260 read_some = read_all
1261 if compute_crc32:
1262 crc32 = 0
1263 else:
1264 crc32 = None
1266 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
1267 type_num = (raw[0] >> 4) & 0x07
1268 size = raw[0] & 0x0F
1269 for i, byte in enumerate(raw[1:]):
1270 size += (byte & 0x7F) << ((i * 7) + 4)
1272 delta_base: int | bytes | None
1273 raw_base = len(raw)
1274 if type_num == OFS_DELTA:
1275 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
1276 raw_base += len(raw)
1277 if raw[-1] & 0x80:
1278 raise AssertionError
1279 delta_base_offset = raw[0] & 0x7F
1280 for byte in raw[1:]:
1281 delta_base_offset += 1
1282 delta_base_offset <<= 7
1283 delta_base_offset += byte & 0x7F
1284 delta_base = delta_base_offset
1285 elif type_num == REF_DELTA:
1286 delta_base_obj = read_all(20)
1287 if crc32 is not None:
1288 crc32 = binascii.crc32(delta_base_obj, crc32)
1289 delta_base = delta_base_obj
1290 raw_base += 20
1291 else:
1292 delta_base = None
1294 unpacked = UnpackedObject(
1295 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1296 )
1297 unused = read_zlib_chunks(
1298 read_some,
1299 unpacked,
1300 buffer_size=zlib_bufsize,
1301 include_comp=include_comp,
1302 )
1303 return unpacked, unused
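# Editor's illustration (not part of dulwich): a single undeltified blob
# entry. The first byte 0x32 packs type 3 (blob) into bits 4-6 and the size 2
# into the low nibble; the 20 zero bytes stand in for the pack trailer that
# always follows the last entry.
def _example_unpack_object():
    from io import BytesIO
    entry = bytes([0x32]) + zlib.compress(b"hi") + b"\x00" * 20
    unpacked, unused = unpack_object(BytesIO(entry).read)
    assert unpacked.pack_type_num == 3
    assert b"".join(unpacked.decomp_chunks) == b"hi"
    assert unused == b"\x00" * 20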
1306def _compute_object_size(value: tuple[int, Any]) -> int:
1307 """Compute the size of a unresolved object for use with LRUSizeCache."""
1308 (num, obj) = value
1309 if num in DELTA_TYPES:
1310 return chunks_length(obj[1])
1311 return chunks_length(obj)
1314class PackStreamReader:
1315 """Class to read a pack stream.
1317 The pack is read from a ReceivableProtocol using read() or recv() as
1318 appropriate.
1319 """
1321 def __init__(
1322 self,
1323 read_all: Callable[[int], bytes],
1324 read_some: Callable[[int], bytes] | None = None,
1325 zlib_bufsize: int = _ZLIB_BUFSIZE,
1326 ) -> None:
1327 """Initialize pack stream reader.
1329 Args:
1330 read_all: Function to read all requested bytes
1331 read_some: Function to read some bytes (optional)
1332 zlib_bufsize: Buffer size for zlib decompression
1333 """
1334 self.read_all = read_all
1335 if read_some is None:
1336 self.read_some = read_all
1337 else:
1338 self.read_some = read_some
1339 self.sha = sha1()
1340 self._offset = 0
1341 self._rbuf = BytesIO()
1342 # trailer is a deque to avoid memory allocation on small reads
1343 self._trailer: deque[int] = deque()
1344 self._zlib_bufsize = zlib_bufsize
1346 def _read(self, read: Callable[[int], bytes], size: int) -> bytes:
1347 """Read up to size bytes using the given callback.
1349 As a side effect, update the verifier's hash (excluding the last 20
1350 bytes read).
1352 Args:
1353 read: The read callback to read from.
1354 size: The maximum number of bytes to read; the particular
1355 behavior is callback-specific.
1356 Returns: Bytes read
1357 """
1358 data = read(size)
1360 # maintain a trailer of the last 20 bytes we've read
1361 n = len(data)
1362 self._offset += n
1363 tn = len(self._trailer)
1364 if n >= 20:
1365 to_pop = tn
1366 to_add = 20
1367 else:
1368 to_pop = max(n + tn - 20, 0)
1369 to_add = n
1370 self.sha.update(
1371 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1372 )
1373 self._trailer.extend(data[-to_add:])
1375 # hash everything but the trailer
1376 self.sha.update(data[:-to_add])
1377 return data
1379 def _buf_len(self) -> int:
1380 buf = self._rbuf
1381 start = buf.tell()
1382 buf.seek(0, SEEK_END)
1383 end = buf.tell()
1384 buf.seek(start)
1385 return end - start
1387 @property
1388 def offset(self) -> int:
1389 """Return current offset in the stream."""
1390 return self._offset - self._buf_len()
1392 def read(self, size: int) -> bytes:
1393 """Read, blocking until size bytes are read."""
1394 buf_len = self._buf_len()
1395 if buf_len >= size:
1396 return self._rbuf.read(size)
1397 buf_data = self._rbuf.read()
1398 self._rbuf = BytesIO()
1399 return buf_data + self._read(self.read_all, size - buf_len)
1401 def recv(self, size: int) -> bytes:
1402 """Read up to size bytes, blocking until one byte is read."""
1403 buf_len = self._buf_len()
1404 if buf_len:
1405 data = self._rbuf.read(size)
1406 if size >= buf_len:
1407 self._rbuf = BytesIO()
1408 return data
1409 return self._read(self.read_some, size)
1411 def __len__(self) -> int:
1412 """Return the number of objects in this pack."""
1413 return self._num_objects
1415 def read_objects(self, compute_crc32: bool = False) -> Iterator[UnpackedObject]:
1416 """Read the objects in this pack file.
1418 Args:
1419 compute_crc32: If True, compute the CRC32 of the compressed
1420 data. If False, the returned CRC32 will be None.
1421 Returns: Iterator over UnpackedObjects with the following members set:
1422 offset
1423 obj_type_num
1424 obj_chunks (for non-delta types)
1425 delta_base (for delta types)
1426 decomp_chunks
1427 decomp_len
1428 crc32 (if compute_crc32 is True)
1430 Raises:
1431 ChecksumMismatch: if the checksum of the pack contents does not
1432 match the checksum in the pack trailer.
1433 zlib.error: if an error occurred during zlib decompression.
1434 IOError: if an error occurred writing to the output file.
1435 """
1436 _pack_version, self._num_objects = read_pack_header(self.read)
1438 for _ in range(self._num_objects):
1439 offset = self.offset
1440 unpacked, unused = unpack_object(
1441 self.read,
1442 read_some=self.recv,
1443 compute_crc32=compute_crc32,
1444 zlib_bufsize=self._zlib_bufsize,
1445 )
1446 unpacked.offset = offset
1448 # prepend any unused data to current read buffer
1449 buf = BytesIO()
1450 buf.write(unused)
1451 buf.write(self._rbuf.read())
1452 buf.seek(0)
1453 self._rbuf = buf
1455 yield unpacked
1457 if self._buf_len() < 20:
1458 # If the read buffer is full, then the last read() got the whole
1459 # trailer off the wire. If not, it means there is still some of the
1460 # trailer to read. We need to read() all 20 bytes; N come from the
1461 # read buffer and (20 - N) come from the wire.
1462 self.read(20)
1464 pack_sha = bytearray(self._trailer)
1465 if pack_sha != self.sha.digest():
1466 raise ChecksumMismatch(
1467 sha_to_hex(RawObjectID(bytes(pack_sha))), self.sha.hexdigest()
1468 )
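# Editor's illustration (not part of dulwich): streaming an empty pack (just a
# header plus the SHA-1 trailer over that header) through PackStreamReader
# verifies the trailer and yields no objects.
def _example_pack_stream_reader():
    from io import BytesIO
    header = b"PACK" + struct.pack(">LL", 2, 0)
    reader = PackStreamReader(BytesIO(header + sha1(header).digest()).read)
    assert list(reader.read_objects()) == []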
1471class PackStreamCopier(PackStreamReader):
1472 """Class to verify a pack stream as it is being read.
1474 The pack is read from a ReceivableProtocol using read() or recv() as
1475 appropriate and written out to the given file-like object.
1476 """
1478 def __init__(
1479 self,
1480 read_all: Callable[[int], bytes],
1481 read_some: Callable[[int], bytes] | None,
1482 outfile: IO[bytes],
1483 delta_iter: "DeltaChainIterator[UnpackedObject] | None" = None,
1484 ) -> None:
1485 """Initialize the copier.
1487 Args:
1488 read_all: Read function that blocks until the number of
1489 requested bytes are read.
1490 read_some: Read function that returns at least one byte, but may
1491 not return the number of bytes requested.
1492 outfile: File-like object to write output through.
1493 delta_iter: Optional DeltaChainIterator to record deltas as we
1494 read them.
1495 """
1496 super().__init__(read_all, read_some=read_some)
1497 self.outfile = outfile
1498 self._delta_iter = delta_iter
1500 def _read(self, read: Callable[[int], bytes], size: int) -> bytes:
1501 """Read data from the read callback and write it to the file."""
1502 data = super()._read(read, size)
1503 self.outfile.write(data)
1504 return data
1506 def verify(self, progress: Callable[..., None] | None = None) -> None:
1507 """Verify a pack stream and write it to the output file.
1509 See PackStreamReader.iterobjects for a list of exceptions this may
1510 throw.
1511 """
1512 i = 0 # default count of entries if read_objects() is empty
1513 for i, unpacked in enumerate(self.read_objects()):
1514 if self._delta_iter:
1515 self._delta_iter.record(unpacked)
1516 if progress is not None:
1517 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1518 if progress is not None:
1519 progress(f"copied {i} pack entries\n".encode("ascii"))
1522def obj_sha(type: int, chunks: bytes | Iterable[bytes]) -> bytes:
1523 """Compute the SHA for a numeric type and object chunks."""
1524 sha = sha1()
1525 sha.update(object_header(type, chunks_length(chunks)))
1526 if isinstance(chunks, bytes):
1527 sha.update(chunks)
1528 else:
1529 for chunk in chunks:
1530 sha.update(chunk)
1531 return sha.digest()
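# Editor's illustration (not part of dulwich): obj_sha hashes the same
# "<type> <length>\0" header that loose objects use, so it matches a manual
# SHA-1 over the canonical form (3 is the blob type number).
def _example_obj_sha():
    assert obj_sha(3, [b"hello"]) == sha1(b"blob 5\x00hello").digest()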
1534def compute_file_sha(
1535 f: IO[bytes], start_ofs: int = 0, end_ofs: int = 0, buffer_size: int = 1 << 16
1536) -> "HashObject":
1537 """Hash a portion of a file into a new SHA.
1539 Args:
1540 f: A file-like object to read from that supports seek().
1541 start_ofs: The offset in the file to start reading at.
1542 end_ofs: The offset in the file to end reading at, relative to the
1543 end of the file.
1544 buffer_size: A buffer size for reading.
1545 Returns: A new SHA object updated with data read from the file.
1546 """
1547 sha = sha1()
1548 f.seek(0, SEEK_END)
1549 length = f.tell()
1550 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1551 raise AssertionError(
1552 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1553 )
1554 todo = length + end_ofs - start_ofs
1555 f.seek(start_ofs)
1556 while todo:
1557 data = f.read(min(todo, buffer_size))
1558 sha.update(data)
1559 todo -= len(data)
1560 return sha
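# Editor's illustration (not part of dulwich): with end_ofs=-20 the hash skips
# a 20-byte trailer, which is exactly how PackData.calculate_checksum below
# checksums a pack without its own trailing digest.
def _example_compute_file_sha():
    from io import BytesIO
    body = b"pack contents"
    digest = compute_file_sha(BytesIO(body + b"\x00" * 20), end_ofs=-20).digest()
    assert digest == sha1(body).digest()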
1563class PackData:
1564 """The data contained in a packfile.
1566 Pack files can be accessed both sequentially for exploding a pack, and
1567 directly with the help of an index to retrieve a specific object.
1569 The objects within are either complete or a delta against another.
1571 The header is variable length. If the MSB of each byte is set then it
1572 indicates that the subsequent byte is still part of the header.
 1573 For the first byte the next three MS bits are the type, which tells you the type
 1574 of object, and whether it is a delta. The low 4 bits are the lowest bits of the
 1575 size. For each subsequent byte the low 7 bits are the next MS bits of the
1576 size, i.e. the last byte of the header contains the MS bits of the size.
1578 For the complete objects the data is stored as zlib deflated data.
1579 The size in the header is the uncompressed object size, so to uncompress
1580 you need to just keep feeding data to zlib until you get an object back,
1581 or it errors on bad data. This is done here by just giving the complete
1582 buffer from the start of the deflated object on. This is bad, but until I
1583 get mmap sorted out it will have to do.
1585 Currently there are no integrity checks done. Also no attempt is made to
1586 try and detect the delta case, or a request for an object at the wrong
1587 position. It will all just throw a zlib or KeyError.
1588 """
1590 def __init__(
1591 self,
1592 filename: str | os.PathLike[str],
1593 file: IO[bytes] | None = None,
1594 size: int | None = None,
1595 *,
1596 delta_window_size: int | None = None,
1597 window_memory: int | None = None,
1598 delta_cache_size: int | None = None,
1599 depth: int | None = None,
1600 threads: int | None = None,
1601 big_file_threshold: int | None = None,
1602 ) -> None:
1603 """Create a PackData object representing the pack in the given filename.
1605 The file must exist and stay readable until the object is disposed of.
1606 It must also stay the same size. It will be mapped whenever needed.
1608 Currently there is a restriction on the size of the pack as the python
1609 mmap implementation is flawed.
1610 """
1611 self._filename = filename
1612 self._size = size
1613 self._header_size = 12
1614 self.delta_window_size = delta_window_size
1615 self.window_memory = window_memory
1616 self.delta_cache_size = delta_cache_size
1617 self.depth = depth
1618 self.threads = threads
1619 self.big_file_threshold = big_file_threshold
1620 self._file: IO[bytes]
1622 if file is None:
1623 self._file = GitFile(self._filename, "rb")
1624 else:
1625 self._file = file
1626 (_version, self._num_objects) = read_pack_header(self._file.read)
1628 # Use delta_cache_size config if available, otherwise default
1629 cache_size = delta_cache_size or (1024 * 1024 * 20)
1630 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1631 cache_size, compute_size=_compute_object_size
1632 )
1634 @property
1635 def filename(self) -> str:
1636 """Get the filename of the pack file.
1638 Returns:
1639 Base filename without directory path
1640 """
1641 return os.path.basename(self._filename)
1643 @property
1644 def path(self) -> str | os.PathLike[str]:
1645 """Get the full path of the pack file.
1647 Returns:
1648 Full path to the pack file
1649 """
1650 return self._filename
1652 @classmethod
1653 def from_file(cls, file: IO[bytes], size: int | None = None) -> "PackData":
1654 """Create a PackData object from an open file.
1656 Args:
1657 file: Open file object
1658 size: Optional file size
1660 Returns:
1661 PackData instance
1662 """
1663 return cls(str(file), file=file, size=size)
1665 @classmethod
1666 def from_path(cls, path: str | os.PathLike[str]) -> "PackData":
1667 """Create a PackData object from a file path.
1669 Args:
1670 path: Path to the pack file
1672 Returns:
1673 PackData instance
1674 """
1675 return cls(filename=path)
1677 def close(self) -> None:
1678 """Close the underlying pack file."""
1679 self._file.close()
1681 def __enter__(self) -> "PackData":
1682 """Enter context manager."""
1683 return self
1685 def __exit__(
1686 self,
1687 exc_type: type | None,
1688 exc_val: BaseException | None,
1689 exc_tb: TracebackType | None,
1690 ) -> None:
1691 """Exit context manager."""
1692 self.close()
1694 def __eq__(self, other: object) -> bool:
1695 """Check equality with another object."""
1696 if isinstance(other, PackData):
1697 return self.get_stored_checksum() == other.get_stored_checksum()
1698 return False
1700 def _get_size(self) -> int:
1701 if self._size is not None:
1702 return self._size
1703 self._size = os.path.getsize(self._filename)
1704 if self._size < self._header_size:
1705 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1706 raise AssertionError(errmsg)
1707 return self._size
1709 def __len__(self) -> int:
1710 """Returns the number of objects in this pack."""
1711 return self._num_objects
1713 def calculate_checksum(self) -> bytes:
1714 """Calculate the checksum for this pack.
1716 Returns: 20-byte binary SHA1 digest
1717 """
1718 return compute_file_sha(self._file, end_ofs=-20).digest()
1720 def iter_unpacked(self, *, include_comp: bool = False) -> Iterator[UnpackedObject]:
1721 """Iterate over unpacked objects in the pack."""
1722 self._file.seek(self._header_size)
1724 if self._num_objects is None:
1725 return
1727 for _ in range(self._num_objects):
1728 offset = self._file.tell()
1729 unpacked, unused = unpack_object(
1730 self._file.read, compute_crc32=False, include_comp=include_comp
1731 )
1732 unpacked.offset = offset
1733 yield unpacked
1734 # Back up over unused data.
1735 self._file.seek(-len(unused), SEEK_CUR)
1737 def iterentries(
1738 self,
1739 progress: Callable[[int, int], None] | None = None,
1740 resolve_ext_ref: ResolveExtRefFn | None = None,
1741 ) -> Iterator[PackIndexEntry]:
1742 """Yield entries summarizing the contents of this pack.
1744 Args:
1745 progress: Progress function, called with current and total
1746 object count.
1747 resolve_ext_ref: Optional function to resolve external references
1748 Returns: iterator of tuples with (sha, offset, crc32)
1749 """
1750 num_objects = self._num_objects
1751 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1752 for i, result in enumerate(indexer):
1753 if progress is not None:
1754 progress(i, num_objects)
1755 yield result
1757 def sorted_entries(
1758 self,
1759 progress: ProgressFn | None = None,
1760 resolve_ext_ref: ResolveExtRefFn | None = None,
1761 ) -> list[tuple[RawObjectID, int, int]]:
1762 """Return entries in this pack, sorted by SHA.
1764 Args:
1765 progress: Progress function, called with current and total
1766 object count
1767 resolve_ext_ref: Optional function to resolve external references
1768 Returns: Iterator of tuples with (sha, offset, crc32)
1769 """
1770 return sorted(
1771 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref) # type: ignore
1772 )
1774 def create_index_v1(
1775 self,
1776 filename: str,
1777 progress: Callable[..., None] | None = None,
1778 resolve_ext_ref: ResolveExtRefFn | None = None,
1779 ) -> bytes:
1780 """Create a version 1 file for this data file.
1782 Args:
1783 filename: Index filename.
1784 progress: Progress report function
1785 resolve_ext_ref: Optional function to resolve external references
1786 Returns: Checksum of index file
1787 """
1788 entries = self.sorted_entries(
1789 progress=progress, resolve_ext_ref=resolve_ext_ref
1790 )
1791 checksum = self.calculate_checksum()
1792 with GitFile(filename, "wb") as f:
1793 write_pack_index_v1(
1794 f,
1795 entries,
1796 checksum,
1797 )
1798 return checksum
1800 def create_index_v2(
1801 self,
1802 filename: str,
1803 progress: Callable[..., None] | None = None,
1804 resolve_ext_ref: ResolveExtRefFn | None = None,
1805 ) -> bytes:
1806 """Create a version 2 index file for this data file.
1808 Args:
1809 filename: Index filename.
1810 progress: Progress report function
1811 resolve_ext_ref: Optional function to resolve external references
1812 Returns: Checksum of index file
1813 """
1814 entries = self.sorted_entries(
1815 progress=progress, resolve_ext_ref=resolve_ext_ref
1816 )
1817 with GitFile(filename, "wb") as f:
1818 return write_pack_index_v2(f, entries, self.calculate_checksum())
1820 def create_index_v3(
1821 self,
1822 filename: str,
1823 progress: Callable[..., None] | None = None,
1824 resolve_ext_ref: ResolveExtRefFn | None = None,
1825 hash_algorithm: int = 1,
1826 ) -> bytes:
1827 """Create a version 3 index file for this data file.
1829 Args:
1830 filename: Index filename.
1831 progress: Progress report function
1832 resolve_ext_ref: Function to resolve external references
1833 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1834 Returns: Checksum of index file
1835 """
1836 entries = self.sorted_entries(
1837 progress=progress, resolve_ext_ref=resolve_ext_ref
1838 )
1839 with GitFile(filename, "wb") as f:
1840 return write_pack_index_v3(
1841 f, entries, self.calculate_checksum(), hash_algorithm
1842 )
1844 def create_index(
1845 self,
1846 filename: str,
1847 progress: Callable[..., None] | None = None,
1848 version: int = 2,
1849 resolve_ext_ref: ResolveExtRefFn | None = None,
1850 hash_algorithm: int = 1,
1851 ) -> bytes:
1852 """Create an index file for this data file.
1854 Args:
1855 filename: Index filename.
1856 progress: Progress report function
1857 version: Index version (1, 2, or 3)
1858 resolve_ext_ref: Function to resolve external references
1859 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1860 Returns: Checksum of index file
1861 """
1862 if version == 1:
1863 return self.create_index_v1(
1864 filename, progress, resolve_ext_ref=resolve_ext_ref
1865 )
1866 elif version == 2:
1867 return self.create_index_v2(
1868 filename, progress, resolve_ext_ref=resolve_ext_ref
1869 )
1870 elif version == 3:
1871 return self.create_index_v3(
1872 filename,
1873 progress,
1874 resolve_ext_ref=resolve_ext_ref,
1875 hash_algorithm=hash_algorithm,
1876 )
1877 else:
1878 raise ValueError(f"unknown index format {version}")
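# Usage sketch (hypothetical pack filename; not part of the original source):
#     data = PackData("pack-1234.pack")
#     checksum = data.create_index("pack-1234.idx", version=2)
#     # ``checksum`` is the trailing SHA-1 of the newly written index file.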
1880 def get_stored_checksum(self) -> bytes:
1881 """Return the expected checksum stored in this pack."""
1882 self._file.seek(-20, SEEK_END)
1883 return self._file.read(20)
1885 def check(self) -> None:
1886 """Check the consistency of this pack."""
1887 actual = self.calculate_checksum()
1888 stored = self.get_stored_checksum()
1889 if actual != stored:
1890 raise ChecksumMismatch(stored, actual)
1892 def get_unpacked_object_at(
1893 self, offset: int, *, include_comp: bool = False
1894 ) -> UnpackedObject:
1895 """Given offset in the packfile return a UnpackedObject."""
1896 assert offset >= self._header_size
1897 self._file.seek(offset)
1898 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1899 unpacked.offset = offset
1900 return unpacked
1902 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1903 """Given an offset in to the packfile return the object that is there.
1905 Using the associated index the location of an object can be looked up,
1906 and then the packfile can be asked directly for that object using this
1907 function.
1908 """
1909 try:
1910 return self._offset_cache[offset]
1911 except KeyError:
1912 pass
1913 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1914 return (unpacked.pack_type_num, unpacked._obj())
1917T = TypeVar("T")
1920class DeltaChainIterator(Generic[T]):
1921 """Abstract iterator over pack data based on delta chains.
1923 Each object in the pack is guaranteed to be inflated exactly once,
1924 regardless of how many objects reference it as a delta base. As a result,
1925 memory usage is proportional to the length of the longest delta chain.
1927 Subclasses can override _result to define the result type of the iterator.
1928 By default, results are UnpackedObjects with the following members set:
1930 * offset
1931 * obj_type_num
1932 * obj_chunks
1933 * pack_type_num
1934 * delta_base (for delta types)
1935 * comp_chunks (if _include_comp is True)
1936 * decomp_chunks
1937 * decomp_len
1938 * crc32 (if _compute_crc32 is True)
1939 """
1941 _compute_crc32 = False
1942 _include_comp = False
1944 def __init__(
1945 self,
1946 file_obj: IO[bytes] | None,
1947 *,
1948 resolve_ext_ref: ResolveExtRefFn | None = None,
1949 ) -> None:
1950 """Initialize DeltaChainIterator.
1952 Args:
1953 file_obj: File object to read pack data from
1954 resolve_ext_ref: Optional function to resolve external references
1955 """
1956 self._file = file_obj
1957 self._resolve_ext_ref = resolve_ext_ref
1958 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1959 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1960 self._full_ofs: list[tuple[int, int]] = []
1961 self._ext_refs: list[RawObjectID] = []
1963 @classmethod
1964 def for_pack_data(
1965 cls, pack_data: PackData, resolve_ext_ref: ResolveExtRefFn | None = None
1966 ) -> "DeltaChainIterator[T]":
1967 """Create a DeltaChainIterator from pack data.
1969 Args:
1970 pack_data: PackData object to iterate
1971 resolve_ext_ref: Optional function to resolve external refs
1973 Returns:
1974 DeltaChainIterator instance
1975 """
1976 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1977 walker.set_pack_data(pack_data)
1978 for unpacked in pack_data.iter_unpacked(include_comp=False):
1979 walker.record(unpacked)
1980 return walker
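# Minimal sketch of the classmethod above (UnpackedObjectIterator is the
# concrete subclass defined later in this module; the pack path is hypothetical):
#     data = PackData("pack-1234.pack")
#     for unpacked in UnpackedObjectIterator.for_pack_data(data):
#         ...  # each object is inflated exactly once; deltas are resolved against their bases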
1982 @classmethod
1983 def for_pack_subset(
1984 cls,
1985 pack: "Pack",
1986 shas: Iterable[ObjectID | RawObjectID],
1987 *,
1988 allow_missing: bool = False,
1989 resolve_ext_ref: ResolveExtRefFn | None = None,
1990 ) -> "DeltaChainIterator[T]":
1991 """Create a DeltaChainIterator for a subset of objects.
1993 Args:
1994 pack: Pack object containing the data
1995 shas: Iterable of object SHAs to include
1996 allow_missing: If True, skip missing objects
1997 resolve_ext_ref: Optional function to resolve external refs
1999 Returns:
2000 DeltaChainIterator instance
2001 """
2002 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
2003 walker.set_pack_data(pack.data)
2004 todo = set()
2005 for sha in shas:
2006 try:
2007 off = pack.index.object_offset(sha)
2008 except KeyError:
2009 if not allow_missing:
2010 raise
2011 else:
2012 todo.add(off)
2013 done = set()
2014 while todo:
2015 off = todo.pop()
2016 unpacked = pack.data.get_unpacked_object_at(off)
2017 walker.record(unpacked)
2018 done.add(off)
2019 base_ofs = None
2020 if unpacked.pack_type_num == OFS_DELTA:
2021 assert unpacked.offset is not None
2022 assert unpacked.delta_base is not None
2023 assert isinstance(unpacked.delta_base, int)
2024 base_ofs = unpacked.offset - unpacked.delta_base
2025 elif unpacked.pack_type_num == REF_DELTA:
2026 with suppress(KeyError):
2027 assert isinstance(unpacked.delta_base, bytes)
2028 base_ofs = pack.index.object_index(RawObjectID(unpacked.delta_base))
2029 if base_ofs is not None and base_ofs not in done:
2030 todo.add(base_ofs)
2031 return walker
2033 def record(self, unpacked: UnpackedObject) -> None:
2034 """Record an unpacked object for later processing.
2036 Args:
2037 unpacked: UnpackedObject to record
2038 """
2039 type_num = unpacked.pack_type_num
2040 offset = unpacked.offset
2041 assert offset is not None
2042 if type_num == OFS_DELTA:
2043 assert unpacked.delta_base is not None
2044 assert isinstance(unpacked.delta_base, int)
2045 base_offset = offset - unpacked.delta_base
2046 self._pending_ofs[base_offset].append(offset)
2047 elif type_num == REF_DELTA:
2048 assert isinstance(unpacked.delta_base, bytes)
2049 self._pending_ref[unpacked.delta_base].append(offset)
2050 else:
2051 self._full_ofs.append((offset, type_num))
2053 def set_pack_data(self, pack_data: PackData) -> None:
2054 """Set the pack data for iteration.
2056 Args:
2057 pack_data: PackData object to use
2058 """
2059 self._file = pack_data._file
2061 def _walk_all_chains(self) -> Iterator[T]:
2062 for offset, type_num in self._full_ofs:
2063 yield from self._follow_chain(offset, type_num, None)
2064 yield from self._walk_ref_chains()
2065 assert not self._pending_ofs, repr(self._pending_ofs)
2067 def _ensure_no_pending(self) -> None:
2068 if self._pending_ref:
2069 raise UnresolvedDeltas(
2070 [sha_to_hex(RawObjectID(s)) for s in self._pending_ref]
2071 )
2073 def _walk_ref_chains(self) -> Iterator[T]:
2074 if not self._resolve_ext_ref:
2075 self._ensure_no_pending()
2076 return
2078 for base_sha, pending in sorted(self._pending_ref.items()):
2079 if base_sha not in self._pending_ref:
2080 continue
2081 try:
2082 type_num, chunks = self._resolve_ext_ref(base_sha)
2083 except KeyError:
2084 # Not an external ref, but may depend on one. Either it will
2085 # get popped via a _follow_chain call, or we will raise an
2086 # error below.
2087 continue
2088 self._ext_refs.append(RawObjectID(base_sha))
2089 self._pending_ref.pop(base_sha)
2090 for new_offset in pending:
2091 yield from self._follow_chain(new_offset, type_num, chunks) # type: ignore[arg-type]
2093 self._ensure_no_pending()
2095 def _result(self, unpacked: UnpackedObject) -> T:
2096 raise NotImplementedError
2098 def _resolve_object(
2099 self, offset: int, obj_type_num: int, base_chunks: list[bytes] | None
2100 ) -> UnpackedObject:
2101 assert self._file is not None
2102 self._file.seek(offset)
2103 unpacked, _ = unpack_object(
2104 self._file.read,
2105 include_comp=self._include_comp,
2106 compute_crc32=self._compute_crc32,
2107 )
2108 unpacked.offset = offset
2109 if base_chunks is None:
2110 assert unpacked.pack_type_num == obj_type_num
2111 else:
2112 assert unpacked.pack_type_num in DELTA_TYPES
2113 unpacked.obj_type_num = obj_type_num
2114 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
2115 return unpacked
2117 def _follow_chain(
2118 self, offset: int, obj_type_num: int, base_chunks: list[bytes] | None
2119 ) -> Iterator[T]:
2120 # Unlike PackData.get_object_at, there is no need to cache offsets as
2121 # this approach by design inflates each object exactly once.
2122 todo = [(offset, obj_type_num, base_chunks)]
2123 while todo:
2124 (offset, obj_type_num, base_chunks) = todo.pop()
2125 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
2126 yield self._result(unpacked)
2128 assert unpacked.offset is not None
2129 unblocked = chain(
2130 self._pending_ofs.pop(unpacked.offset, []),
2131 self._pending_ref.pop(unpacked.sha(), []),
2132 )
2133 todo.extend(
2134 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
2135 for new_offset in unblocked
2136 )
2138 def __iter__(self) -> Iterator[T]:
2139 """Iterate over objects in the pack."""
2140 return self._walk_all_chains()
2142 def ext_refs(self) -> list[RawObjectID]:
2143 """Return external references."""
2144 return self._ext_refs
2147class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
2148 """Delta chain iterator that yield unpacked objects."""
2150 def _result(self, unpacked: UnpackedObject) -> UnpackedObject:
2151 """Return the unpacked object.
2153 Args:
2154 unpacked: The unpacked object
2156 Returns:
2157 The unpacked object unchanged
2158 """
2159 return unpacked
2162class PackIndexer(DeltaChainIterator[PackIndexEntry]):
2163 """Delta chain iterator that yields index entries."""
2165 _compute_crc32 = True
2167 def _result(self, unpacked: UnpackedObject) -> PackIndexEntry:
2168 """Convert unpacked object to pack index entry.
2170 Args:
2171 unpacked: The unpacked object
2173 Returns:
2174 Tuple of (sha, offset, crc32) for index entry
2175 """
2176 assert unpacked.offset is not None
2177 return unpacked.sha(), unpacked.offset, unpacked.crc32
2180class PackInflater(DeltaChainIterator[ShaFile]):
2181 """Delta chain iterator that yields ShaFile objects."""
2183 def _result(self, unpacked: UnpackedObject) -> ShaFile:
2184 """Convert unpacked object to ShaFile.
2186 Args:
2187 unpacked: The unpacked object
2189 Returns:
2190 ShaFile object from the unpacked data
2191 """
2192 return unpacked.sha_file()
2195class SHA1Reader(BinaryIO):
2196 """Wrapper for file-like object that remembers the SHA1 of its data."""
2198 def __init__(self, f: IO[bytes]) -> None:
2199 """Initialize SHA1Reader.
2201 Args:
2202 f: File-like object to wrap
2203 """
2204 self.f = f
2205 self.sha1 = sha1(b"")
2207 def read(self, size: int = -1) -> bytes:
2208 """Read bytes and update SHA1.
2210 Args:
2211 size: Number of bytes to read, -1 for all
2213 Returns:
2214 Bytes read from file
2215 """
2216 data = self.f.read(size)
2217 self.sha1.update(data)
2218 return data
2220 def check_sha(self, allow_empty: bool = False) -> None:
2221 """Check if the SHA1 matches the expected value.
2223 Args:
2224 allow_empty: Allow empty SHA1 hash
2226 Raises:
2227 ChecksumMismatch: If SHA1 doesn't match
2228 """
2229 stored = self.f.read(20)
2230 # If the git option index.skipHash is set, the stored checksum is all zeros
2231 if stored != self.sha1.digest() and (
2232 not allow_empty
2233 or sha_to_hex(RawObjectID(stored))
2234 != b"0000000000000000000000000000000000000000"
2235 ):
2236 raise ChecksumMismatch(
2237 self.sha1.hexdigest(), sha_to_hex(RawObjectID(stored))
2238 )
2240 def close(self) -> None:
2241 """Close the underlying file."""
2242 return self.f.close()
2244 def tell(self) -> int:
2245 """Return current file position."""
2246 return self.f.tell()
2248 # BinaryIO abstract methods
2249 def readable(self) -> bool:
2250 """Check if file is readable."""
2251 return True
2253 def writable(self) -> bool:
2254 """Check if file is writable."""
2255 return False
2257 def seekable(self) -> bool:
2258 """Check if file is seekable."""
2259 return getattr(self.f, "seekable", lambda: False)()
2261 def seek(self, offset: int, whence: int = 0) -> int:
2262 """Seek to position in file.
2264 Args:
2265 offset: Position offset
2266 whence: Reference point (0=start, 1=current, 2=end)
2268 Returns:
2269 New file position
2270 """
2271 return self.f.seek(offset, whence)
2273 def flush(self) -> None:
2274 """Flush the file buffer."""
2275 if hasattr(self.f, "flush"):
2276 self.f.flush()
2278 def readline(self, size: int = -1) -> bytes:
2279 """Read a line from the file.
2281 Args:
2282 size: Maximum bytes to read
2284 Returns:
2285 Line read from file
2286 """
2287 return self.f.readline(size)
2289 def readlines(self, hint: int = -1) -> list[bytes]:
2290 """Read all lines from the file.
2292 Args:
2293 hint: Approximate number of bytes to read
2295 Returns:
2296 List of lines
2297 """
2298 return self.f.readlines(hint)
2300 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override]
2301 """Write multiple lines to the file (not supported)."""
2302 raise UnsupportedOperation("writelines")
2304 def write(self, data: bytes, /) -> int: # type: ignore[override]
2305 """Write data to the file (not supported)."""
2306 raise UnsupportedOperation("write")
2308 def __enter__(self) -> "SHA1Reader":
2309 """Enter context manager."""
2310 return self
2312 def __exit__(
2313 self,
2314 type: type | None,
2315 value: BaseException | None,
2316 traceback: TracebackType | None,
2317 ) -> None:
2318 """Exit context manager and close file."""
2319 self.close()
2321 def __iter__(self) -> "SHA1Reader":
2322 """Return iterator for reading file lines."""
2323 return self
2325 def __next__(self) -> bytes:
2326 """Get next line from file.
2328 Returns:
2329 Next line
2331 Raises:
2332 StopIteration: When no more lines
2333 """
2334 line = self.readline()
2335 if not line:
2336 raise StopIteration
2337 return line
2339 def fileno(self) -> int:
2340 """Return file descriptor number."""
2341 return self.f.fileno()
2343 def isatty(self) -> bool:
2344 """Check if file is a terminal."""
2345 return getattr(self.f, "isatty", lambda: False)()
2347 def truncate(self, size: int | None = None) -> int:
2348 """Not supported for read-only file.
2350 Raises:
2351 UnsupportedOperation: Always raised
2352 """
2353 raise UnsupportedOperation("truncate")
2356class SHA1Writer(BinaryIO):
2357 """Wrapper for file-like object that remembers the SHA1 of its data."""
2359 def __init__(self, f: BinaryIO | IO[bytes]) -> None:
2360 """Initialize SHA1Writer.
2362 Args:
2363 f: File-like object to wrap
2364 """
2365 self.f = f
2366 self.length = 0
2367 self.sha1 = sha1(b"")
2368 self.digest: bytes | None = None
2370 def write(self, data: bytes | bytearray | memoryview, /) -> int: # type: ignore[override]
2371 """Write data and update SHA1.
2373 Args:
2374 data: Data to write
2376 Returns:
2377 Number of bytes written
2378 """
2379 self.sha1.update(data)
2380 written = self.f.write(data)
2381 self.length += written
2382 return written
2384 def write_sha(self) -> bytes:
2385 """Write the SHA1 digest to the file.
2387 Returns:
2388 The SHA1 digest bytes
2389 """
2390 sha = self.sha1.digest()
2391 assert len(sha) == 20
2392 self.f.write(sha)
2393 self.length += len(sha)
2394 return sha
2396 def close(self) -> None:
2397 """Close the pack file and finalize the SHA."""
2398 self.digest = self.write_sha()
2399 self.f.close()
2401 def offset(self) -> int:
2402 """Get the total number of bytes written.
2404 Returns:
2405 Total bytes written
2406 """
2407 return self.length
2409 def tell(self) -> int:
2410 """Return current file position."""
2411 return self.f.tell()
2413 # BinaryIO abstract methods
2414 def readable(self) -> bool:
2415 """Check if file is readable."""
2416 return False
2418 def writable(self) -> bool:
2419 """Check if file is writable."""
2420 return True
2422 def seekable(self) -> bool:
2423 """Check if file is seekable."""
2424 return getattr(self.f, "seekable", lambda: False)()
2426 def seek(self, offset: int, whence: int = 0) -> int:
2427 """Seek to position in file.
2429 Args:
2430 offset: Position offset
2431 whence: Reference point (0=start, 1=current, 2=end)
2433 Returns:
2434 New file position
2435 """
2436 return self.f.seek(offset, whence)
2438 def flush(self) -> None:
2439 """Flush the file buffer."""
2440 if hasattr(self.f, "flush"):
2441 self.f.flush()
2443 def readline(self, size: int = -1) -> bytes:
2444 """Not supported for write-only file.
2446 Raises:
2447 UnsupportedOperation: Always raised
2448 """
2449 raise UnsupportedOperation("readline")
2451 def readlines(self, hint: int = -1) -> list[bytes]:
2452 """Not supported for write-only file.
2454 Raises:
2455 UnsupportedOperation: Always raised
2456 """
2457 raise UnsupportedOperation("readlines")
2459 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override]
2460 """Write multiple lines to the file.
2462 Args:
2463 lines: Iterable of lines to write
2464 """
2465 for line in lines:
2466 self.write(line)
2468 def read(self, size: int = -1) -> bytes:
2469 """Not supported for write-only file.
2471 Raises:
2472 UnsupportedOperation: Always raised
2473 """
2474 raise UnsupportedOperation("read")
2476 def __enter__(self) -> "SHA1Writer":
2477 """Enter context manager."""
2478 return self
2480 def __exit__(
2481 self,
2482 type: type | None,
2483 value: BaseException | None,
2484 traceback: TracebackType | None,
2485 ) -> None:
2486 """Exit context manager and close file."""
2487 self.close()
2489 def __iter__(self) -> "SHA1Writer":
2490 """Return iterator."""
2491 return self
2493 def __next__(self) -> bytes:
2494 """Not supported for write-only file.
2496 Raises:
2497 UnsupportedOperation: Always raised
2498 """
2499 raise UnsupportedOperation("__next__")
2501 def fileno(self) -> int:
2502 """Return file descriptor number."""
2503 return self.f.fileno()
2505 def isatty(self) -> bool:
2506 """Check if file is a terminal."""
2507 return getattr(self.f, "isatty", lambda: False)()
2509 def truncate(self, size: int | None = None) -> int:
2510 """Not supported for write-only file.
2512 Raises:
2513 UnsupportedOperation: Always raised
2514 """
2515 raise UnsupportedOperation("truncate")
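# Illustrative sketch of the SHA1Writer wrapper (in-memory buffer, not from the
# original source): everything written through the wrapper feeds the running
# SHA-1, and write_sha() appends the 20-byte digest as a trailer.
#     buf = BytesIO()
#     w = SHA1Writer(buf)
#     w.write(b"PACK")
#     trailer = w.write_sha()  # 20 bytes; buf now ends with the digest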
2518def pack_object_header(
2519 type_num: int, delta_base: bytes | int | None, size: int
2520) -> bytearray:
2521 """Create a pack object header for the given object info.
2523 Args:
2524 type_num: Numeric type of the object.
2525 delta_base: Delta base offset or ref, or None for whole objects.
2526 size: Uncompressed object size.
2527 Returns: A header for a packed object.
2528 """
2529 header = []
2530 c = (type_num << 4) | (size & 15)
2531 size >>= 4
2532 while size:
2533 header.append(c | 0x80)
2534 c = size & 0x7F
2535 size >>= 7
2536 header.append(c)
2537 if type_num == OFS_DELTA:
2538 assert isinstance(delta_base, int)
2539 ret = [delta_base & 0x7F]
2540 delta_base >>= 7
2541 while delta_base:
2542 delta_base -= 1
2543 ret.insert(0, 0x80 | (delta_base & 0x7F))
2544 delta_base >>= 7
2545 header.extend(ret)
2546 elif type_num == REF_DELTA:
2547 assert isinstance(delta_base, bytes)
2548 assert len(delta_base) == 20
2549 header += delta_base
2550 return bytearray(header)
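# Worked example (illustrative): a non-delta object of type 3 (blob) and size
# 300 encodes to two bytes -- the low nibble 0xc plus (18 << 4) reconstructs
# 300, and the MSB of the first byte marks the continuation:
#     pack_object_header(3, None, 300) == bytearray(b"\xbc\x12")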
2553def pack_object_chunks(
2554 type: int,
2555 object: list[bytes] | tuple[bytes | int, list[bytes]],
2556 compression_level: int = -1,
2557) -> Iterator[bytes]:
2558 """Generate chunks for a pack object.
2560 Args:
2561 type: Numeric type of the object
2562 object: Object chunks to write, or a (delta_base, chunks) tuple for delta types
2563 compression_level: the zlib compression level
2564 Returns: Chunks
2565 """
2566 if type in DELTA_TYPES:
2567 if isinstance(object, tuple):
2568 delta_base, object = object
2569 else:
2570 raise TypeError("Delta types require a tuple of (delta_base, object)")
2571 else:
2572 delta_base = None
2574 # Convert object to list of bytes chunks
2575 if isinstance(object, bytes):
2576 chunks = [object]
2577 elif isinstance(object, list):
2578 chunks = object
2579 elif isinstance(object, ShaFile):
2580 chunks = object.as_raw_chunks()
2581 else:
2582 # Shouldn't reach here with proper typing
2583 raise TypeError(f"Unexpected object type: {object.__class__.__name__}")
2585 yield bytes(pack_object_header(type, delta_base, sum(map(len, chunks))))
2586 compressor = zlib.compressobj(level=compression_level)
2587 for data in chunks:
2588 yield compressor.compress(data)
2589 yield compressor.flush()
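# Usage sketch (illustrative values): serialize a small full-text blob; the
# result is the varint header followed by the zlib-compressed payload.
#     blob_chunks = [b"hello world\n"]
#     raw = b"".join(pack_object_chunks(3, blob_chunks))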
2592def write_pack_object(
2593 write: Callable[[bytes], int],
2594 type: int,
2595 object: list[bytes] | tuple[bytes | int, list[bytes]],
2596 sha: "HashObject | None" = None,
2597 compression_level: int = -1,
2598) -> int:
2599 """Write pack object to a file.
2601 Args:
2602 write: Write function to use
2603 type: Numeric type of the object
2604 object: Object chunks to write, or a (delta_base, chunks) tuple for delta types
2605 sha: Optional SHA-1 hasher to update
2606 compression_level: the zlib compression level
2607 Returns: CRC32 checksum of the written object
2608 """
2609 crc32 = 0
2610 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
2611 write(chunk)
2612 if sha is not None:
2613 sha.update(chunk)
2614 crc32 = binascii.crc32(chunk, crc32)
2615 return crc32 & 0xFFFFFFFF
2618def write_pack(
2619 filename: str,
2620 objects: Sequence[ShaFile] | Sequence[tuple[ShaFile, bytes | None]],
2621 *,
2622 deltify: bool | None = None,
2623 delta_window_size: int | None = None,
2624 compression_level: int = -1,
2625) -> tuple[bytes, bytes]:
2626 """Write a new pack data file.
2628 Args:
2629 filename: Path to the new pack file (without .pack extension)
2630 objects: Objects to write to the pack
2631 delta_window_size: Delta window size
2632 deltify: Whether to deltify pack objects
2633 compression_level: the zlib compression level
2634 Returns: Tuple with checksum of pack file and index file
2635 """
2636 with GitFile(filename + ".pack", "wb") as f:
2637 entries, data_sum = write_pack_objects(
2638 f,
2639 objects,
2640 delta_window_size=delta_window_size,
2641 deltify=deltify,
2642 compression_level=compression_level,
2643 )
2644 entries_list = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
2645 with GitFile(filename + ".idx", "wb") as f:
2646 idx_sha = write_pack_index(f, entries_list, data_sum)
2647 return data_sum, idx_sha
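# Usage sketch (hypothetical path; Blob comes from dulwich.objects):
#     from dulwich.objects import Blob
#     blob = Blob.from_string(b"hello\n")
#     data_sum, idx_sum = write_pack("/tmp/pack-new", [blob])
#     # writes /tmp/pack-new.pack and /tmp/pack-new.idx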
2650def pack_header_chunks(num_objects: int) -> Iterator[bytes]:
2651 """Yield chunks for a pack header."""
2652 yield b"PACK" # Pack header
2653 yield struct.pack(b">L", 2) # Pack version
2654 yield struct.pack(b">L", num_objects) # Number of objects in pack
2657def write_pack_header(
2658 write: Callable[[bytes], int] | IO[bytes], num_objects: int
2659) -> None:
2660 """Write a pack header for the given number of objects."""
2661 write_fn: Callable[[bytes], int]
2662 if hasattr(write, "write"):
2663 write_fn = write.write
2664 warnings.warn(
2665 "write_pack_header() now takes a write rather than file argument",
2666 DeprecationWarning,
2667 stacklevel=2,
2668 )
2669 else:
2670 write_fn = write
2671 for chunk in pack_header_chunks(num_objects):
2672 write_fn(chunk)
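# Worked example: the header for a two-object pack is the magic, the version
# and the object count, each as a big-endian 32-bit word:
#     b"".join(pack_header_chunks(2)) == b"PACK\x00\x00\x00\x02\x00\x00\x00\x02"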
2675def find_reusable_deltas(
2676 container: PackedObjectContainer,
2677 object_ids: Set[ObjectID],
2678 *,
2679 other_haves: Set[ObjectID] | None = None,
2680 progress: Callable[..., None] | None = None,
2681) -> Iterator[UnpackedObject]:
2682 """Find deltas in a pack that can be reused.
2684 Args:
2685 container: Pack container to search for deltas
2686 object_ids: Set of object IDs to find deltas for
2687 other_haves: Set of other object IDs we have
2688 progress: Optional progress reporting callback
2690 Returns:
2691 Iterator of UnpackedObject entries that can be reused
2692 """
2693 if other_haves is None:
2694 other_haves = set()
2695 reused = 0
2696 for i, unpacked in enumerate(
2697 container.iter_unpacked_subset(
2698 object_ids, allow_missing=True, convert_ofs_delta=True
2699 )
2700 ):
2701 if progress is not None and i % 1000 == 0:
2702 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2703 if unpacked.pack_type_num == REF_DELTA:
2704 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2705 if hexsha in object_ids or hexsha in other_haves:
2706 yield unpacked
2707 reused += 1
2708 if progress is not None:
2709 progress((f"found {reused} deltas to reuse\n").encode())
2712def deltify_pack_objects(
2713 objects: Iterator[ShaFile] | Iterator[tuple[ShaFile, bytes | None]],
2714 *,
2715 window_size: int | None = None,
2716 progress: Callable[..., None] | None = None,
2717) -> Iterator[UnpackedObject]:
2718 """Generate deltas for pack objects.
2720 Args:
2721 objects: An iterable of ShaFile objects or (object, path) tuples to deltify.
2722 window_size: Window size; None for default
2723 progress: Optional progress reporting callback
2724 Returns: Iterator of UnpackedObject entries;
2725 delta_base is None for full-text entries
2726 """
2728 def objects_with_hints() -> Iterator[tuple[ShaFile, tuple[int, bytes | None]]]:
2729 for e in objects:
2730 if isinstance(e, ShaFile):
2731 yield (e, (e.type_num, None))
2732 else:
2733 yield (e[0], (e[0].type_num, e[1]))
2735 sorted_objs = sort_objects_for_delta(objects_with_hints())
2736 yield from deltas_from_sorted_objects(
2737 sorted_objs,
2738 window_size=window_size,
2739 progress=progress,
2740 )
2743def sort_objects_for_delta(
2744 objects: Iterator[ShaFile] | Iterator[tuple[ShaFile, PackHint | None]],
2745) -> Iterator[tuple[ShaFile, bytes | None]]:
2746 """Sort objects for optimal delta compression.
2748 Args:
2749 objects: Iterator of objects or (object, hint) tuples
2751 Returns:
2752 Iterator of sorted (ShaFile, path) tuples
2753 """
2754 magic = []
2755 for entry in objects:
2756 if isinstance(entry, tuple):
2757 obj, hint = entry
2758 if hint is None:
2759 type_num = None
2760 path = None
2761 else:
2762 (type_num, path) = hint
2763 else:
2764 obj = entry
2765 type_num = None
2766 path = None
2767 magic.append((type_num, path, -obj.raw_length(), obj))
2768 # Build a list of objects ordered by the magic Linus heuristic
2769 # This helps us find good objects to diff against us
2770 magic.sort()
2771 return ((x[3], x[1]) for x in magic)
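# Ordering sketch (hypothetical blobs): two objects hinted with the same type
# and path are grouped together, with the larger one first, so it becomes a
# candidate delta base for the smaller one in deltas_from_sorted_objects() below.
#     sort_objects_for_delta(iter([(small_blob, (3, b"a.txt")),
#                                  (big_blob, (3, b"a.txt"))]))
#     # yields (big_blob, b"a.txt") before (small_blob, b"a.txt")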
2774def deltas_from_sorted_objects(
2775 objects: Iterator[tuple[ShaFile, bytes | None]],
2776 window_size: int | None = None,
2777 progress: Callable[..., None] | None = None,
2778) -> Iterator[UnpackedObject]:
2779 """Create deltas from sorted objects.
2781 Args:
2782 objects: Iterator of sorted (ShaFile, path) tuples to deltify
2783 window_size: Delta window size; None for default
2784 progress: Optional progress reporting callback
2786 Returns:
2787 Iterator of UnpackedObject entries
2788 """
2789 # TODO(jelmer): Use threads
2790 if window_size is None:
2791 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2793 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2794 for i, (o, path) in enumerate(objects):
2795 if progress is not None and i % 1000 == 0:
2796 progress((f"generating deltas: {i}\r").encode())
2797 raw = o.as_raw_chunks()
2798 winner = raw
2799 winner_len = sum(map(len, winner))
2800 winner_base = None
2801 for base_id, base_type_num, base in possible_bases:
2802 if base_type_num != o.type_num:
2803 continue
2804 delta_len = 0
2805 delta = []
2806 for chunk in create_delta(b"".join(base), b"".join(raw)):
2807 delta_len += len(chunk)
2808 if delta_len >= winner_len:
2809 break
2810 delta.append(chunk)
2811 else:
2812 winner_base = base_id
2813 winner = delta
2814 winner_len = sum(map(len, winner))
2815 yield UnpackedObject(
2816 o.type_num,
2817 sha=o.sha().digest(),
2818 delta_base=winner_base,
2819 decomp_len=winner_len,
2820 decomp_chunks=winner,
2821 )
2822 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2823 while len(possible_bases) > window_size:
2824 possible_bases.pop()
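# Pipeline sketch (hypothetical ShaFile objects): deltify_pack_objects() sorts
# its input with sort_objects_for_delta() and then runs the window search above.
#     unpacked = list(deltify_pack_objects(iter([blob_a, blob_b])))
#     # each UnpackedObject carries delta_base=None (full text) or the SHA of its base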
2827def pack_objects_to_data(
2828 objects: Sequence[ShaFile]
2829 | Sequence[tuple[ShaFile, bytes | None]]
2830 | Sequence[tuple[ShaFile, PackHint | None]],
2831 *,
2832 deltify: bool | None = None,
2833 delta_window_size: int | None = None,
2834 ofs_delta: bool = True,
2835 progress: Callable[..., None] | None = None,
2836) -> tuple[int, Iterator[UnpackedObject]]:
2837 """Create pack data from objects.
2839 Args:
2840 objects: Pack objects
2841 deltify: Whether to deltify pack objects
2842 delta_window_size: Delta window size
2843 ofs_delta: Whether to use offset deltas
2844 progress: Optional progress reporting callback
2845 Returns: Tuple of (number of objects, iterator of UnpackedObject entries)
2846 """
2847 count = len(objects)
2848 if deltify is None:
2849 # PERFORMANCE/TODO(jelmer): This should be enabled but the python
2850 # implementation is *much* too slow at the moment.
2851 # Maybe consider enabling it just if the rust extension is available?
2852 deltify = False
2853 if deltify:
2854 return (
2855 count,
2856 deltify_pack_objects(
2857 iter(objects), # type: ignore
2858 window_size=delta_window_size,
2859 progress=progress,
2860 ),
2861 )
2862 else:
2864 def iter_without_path() -> Iterator[UnpackedObject]:
2865 for o in objects:
2866 if isinstance(o, tuple):
2867 yield full_unpacked_object(o[0])
2868 else:
2869 yield full_unpacked_object(o)
2871 return (count, iter_without_path())
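# Usage sketch (hypothetical objects): without deltification every object is
# emitted as a full-text UnpackedObject, ready for write_pack_data().
#     count, records = pack_objects_to_data([blob_a, blob_b], deltify=False)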
2874def generate_unpacked_objects(
2875 container: PackedObjectContainer,
2876 object_ids: Sequence[tuple[ObjectID, PackHint | None]],
2877 delta_window_size: int | None = None,
2878 deltify: bool | None = None,
2879 reuse_deltas: bool = True,
2880 ofs_delta: bool = True,
2881 other_haves: set[ObjectID] | None = None,
2882 progress: Callable[..., None] | None = None,
2883) -> Iterator[UnpackedObject]:
2884 """Create pack data from objects.
2886 Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
2887 """
2888 todo = dict(object_ids)
2889 if reuse_deltas:
2890 for unpack in find_reusable_deltas(
2891 container, set(todo), other_haves=other_haves, progress=progress
2892 ):
2893 del todo[sha_to_hex(RawObjectID(unpack.sha()))]
2894 yield unpack
2895 if deltify is None:
2896 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2897 # slow at the moment.
2898 deltify = False
2899 if deltify:
2900 objects_to_delta = container.iterobjects_subset(
2901 todo.keys(), allow_missing=False
2902 )
2903 sorted_objs = sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta)
2904 yield from deltas_from_sorted_objects(
2905 sorted_objs,
2906 window_size=delta_window_size,
2907 progress=progress,
2908 )
2909 else:
2910 for oid in todo:
2911 yield full_unpacked_object(container[oid])
2914def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2915 """Create an UnpackedObject from a ShaFile.
2917 Args:
2918 o: ShaFile object to convert
2920 Returns:
2921 UnpackedObject with full object data
2922 """
2923 return UnpackedObject(
2924 o.type_num,
2925 delta_base=None,
2926 crc32=None,
2927 decomp_chunks=o.as_raw_chunks(),
2928 sha=o.sha().digest(),
2929 )
2932def write_pack_from_container(
2933 write: Callable[[bytes], None]
2934 | Callable[[bytes | bytearray | memoryview], int]
2935 | IO[bytes],
2936 container: PackedObjectContainer,
2937 object_ids: Sequence[tuple[ObjectID, PackHint | None]],
2938 delta_window_size: int | None = None,
2939 deltify: bool | None = None,
2940 reuse_deltas: bool = True,
2941 compression_level: int = -1,
2942 other_haves: set[ObjectID] | None = None,
2943) -> tuple[dict[bytes, tuple[int, int]], bytes]:
2944 """Write a new pack data file.
2946 Args:
2947 write: write function to use
2948 container: PackedObjectContainer
2949 object_ids: Sequence of (object_id, hint) tuples to write
2950 delta_window_size: Sliding window size for searching for deltas;
2951 Set to None for default window size.
2952 deltify: Whether to deltify objects
2953 reuse_deltas: Whether to reuse existing deltas
2954 compression_level: the zlib compression level to use
2955 other_haves: Set of additional object IDs the receiver has
2956 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2957 """
2958 pack_contents_count = len(object_ids)
2959 pack_contents = generate_unpacked_objects(
2960 container,
2961 object_ids,
2962 delta_window_size=delta_window_size,
2963 deltify=deltify,
2964 reuse_deltas=reuse_deltas,
2965 other_haves=other_haves,
2966 )
2968 return write_pack_data(
2969 write,
2970 pack_contents,
2971 num_records=pack_contents_count,
2972 compression_level=compression_level,
2973 )
2976def write_pack_objects(
2977 write: Callable[[bytes], None] | IO[bytes],
2978 objects: Sequence[ShaFile] | Sequence[tuple[ShaFile, bytes | None]],
2979 *,
2980 delta_window_size: int | None = None,
2981 deltify: bool | None = None,
2982 compression_level: int = -1,
2983) -> tuple[dict[bytes, tuple[int, int]], bytes]:
2984 """Write a new pack data file.
2986 Args:
2987 write: write function to use
2988 objects: Sequence of ShaFile objects or (object, path) tuples to write
2989 delta_window_size: Sliding window size for searching for deltas;
2990 Set to None for default window size.
2991 deltify: Whether to deltify objects
2992 compression_level: the zlib compression level to use
2993 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2994 """
2995 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2997 return write_pack_data(
2998 write,
2999 pack_contents,
3000 num_records=pack_contents_count,
3001 compression_level=compression_level,
3002 )
3005class PackChunkGenerator:
3006 """Generator for pack data chunks."""
3008 def __init__(
3009 self,
3010 num_records: int | None = None,
3011 records: Iterator[UnpackedObject] | None = None,
3012 progress: Callable[..., None] | None = None,
3013 compression_level: int = -1,
3014 reuse_compressed: bool = True,
3015 ) -> None:
3016 """Initialize PackChunkGenerator.
3018 Args:
3019 num_records: Expected number of records
3020 records: Iterator of pack records
3021 progress: Optional progress callback
3022 compression_level: Compression level (-1 for default)
3023 reuse_compressed: Whether to reuse compressed chunks
3024 """
3025 self.cs = sha1(b"")
3026 self.entries: dict[bytes, tuple[int, int]] = {}
3027 if records is None:
3028 records = iter([]) # Empty iterator if None
3029 self._it = self._pack_data_chunks(
3030 records=records,
3031 num_records=num_records,
3032 progress=progress,
3033 compression_level=compression_level,
3034 reuse_compressed=reuse_compressed,
3035 )
3037 def sha1digest(self) -> bytes:
3038 """Return the SHA1 digest of the pack data."""
3039 return self.cs.digest()
3041 def __iter__(self) -> Iterator[bytes]:
3042 """Iterate over pack data chunks."""
3043 return self._it
3045 def _pack_data_chunks(
3046 self,
3047 records: Iterator[UnpackedObject],
3048 *,
3049 num_records: int | None = None,
3050 progress: Callable[..., None] | None = None,
3051 compression_level: int = -1,
3052 reuse_compressed: bool = True,
3053 ) -> Iterator[bytes]:
3054 """Iterate pack data file chunks.
3056 Args:
3057 records: Iterator over UnpackedObject
3058 num_records: Number of records (defaults to len(records) if not specified)
3059 progress: Function to report progress to
3060 compression_level: the zlib compression level
3061 reuse_compressed: Whether to reuse compressed chunks
3062 Yields: Pack data chunks, ending with the pack checksum; per-object entries are recorded in self.entries
3063 """
3064 # Write the pack
3065 if num_records is None:
3066 num_records = len(records) # type: ignore
3067 offset = 0
3068 for chunk in pack_header_chunks(num_records):
3069 yield chunk
3070 self.cs.update(chunk)
3071 offset += len(chunk)
3072 actual_num_records = 0
3073 for i, unpacked in enumerate(records):
3074 type_num = unpacked.pack_type_num
3075 if progress is not None and i % 1000 == 0:
3076 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
3077 raw: list[bytes] | tuple[int, list[bytes]] | tuple[bytes, list[bytes]]
3078 if unpacked.delta_base is not None:
3079 assert isinstance(unpacked.delta_base, bytes), (
3080 f"Expected bytes, got {type(unpacked.delta_base)}"
3081 )
3082 try:
3083 base_offset, _base_crc32 = self.entries[unpacked.delta_base]
3084 except KeyError:
3085 type_num = REF_DELTA
3086 assert isinstance(unpacked.delta_base, bytes)
3087 raw = (unpacked.delta_base, unpacked.decomp_chunks)
3088 else:
3089 type_num = OFS_DELTA
3090 raw = (offset - base_offset, unpacked.decomp_chunks)
3091 else:
3092 raw = unpacked.decomp_chunks
3093 chunks: list[bytes] | Iterator[bytes]
3094 if unpacked.comp_chunks is not None and reuse_compressed:
3095 chunks = unpacked.comp_chunks
3096 else:
3097 chunks = pack_object_chunks(
3098 type_num, raw, compression_level=compression_level
3099 )
3100 crc32 = 0
3101 object_size = 0
3102 for chunk in chunks:
3103 yield chunk
3104 crc32 = binascii.crc32(chunk, crc32)
3105 self.cs.update(chunk)
3106 object_size += len(chunk)
3107 actual_num_records += 1
3108 self.entries[unpacked.sha()] = (offset, crc32)
3109 offset += object_size
3110 if actual_num_records != num_records:
3111 raise AssertionError(
3112 f"actual records written differs: {actual_num_records} != {num_records}"
3113 )
3115 yield self.cs.digest()
3118def write_pack_data(
3119 write: Callable[[bytes], None]
3120 | Callable[[bytes | bytearray | memoryview], int]
3121 | IO[bytes],
3122 records: Iterator[UnpackedObject],
3123 *,
3124 num_records: int | None = None,
3125 progress: Callable[..., None] | None = None,
3126 compression_level: int = -1,
3127) -> tuple[dict[bytes, tuple[int, int]], bytes]:
3128 """Write a new pack data file.
3130 Args:
3131 write: Write function to use
3132 num_records: Number of records (defaults to len(records) if None)
3133 records: Iterator of UnpackedObject entries to write
3134 progress: Function to report progress to
3135 compression_level: the zlib compression level
3136 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
3137 """
3138 chunk_generator = PackChunkGenerator(
3139 num_records=num_records,
3140 records=records,
3141 progress=progress,
3142 compression_level=compression_level,
3143 )
3144 for chunk in chunk_generator:
3145 if callable(write):
3146 write(chunk)
3147 else:
3148 write.write(chunk)
3149 return chunk_generator.entries, chunk_generator.sha1digest()
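# Usage sketch (in-memory buffer, hypothetical objects): stream a small pack
# into a BytesIO and collect the per-object index entries and pack checksum.
#     buf = BytesIO()
#     count, records = pack_objects_to_data([blob], deltify=False)
#     entries, pack_sha = write_pack_data(buf.write, records, num_records=count)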
3152def write_pack_index_v1(
3153 f: IO[bytes],
3154 entries: Iterable[tuple[bytes, int, int | None]],
3155 pack_checksum: bytes,
3156) -> bytes:
3157 """Write a new pack index file.
3159 Args:
3160 f: A file-like object to write to
3161 entries: List of tuples with object name (sha), offset_in_pack,
3162 and crc32_checksum.
3163 pack_checksum: Checksum of the pack file.
3164 Returns: The SHA of the written index file
3165 """
3166 f = SHA1Writer(f)
3167 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3168 for name, _offset, _entry_checksum in entries:
3169 fan_out_table[ord(name[:1])] += 1
3170 # Fan-out table
3171 for i in range(0x100):
3172 f.write(struct.pack(">L", fan_out_table[i]))
3173 fan_out_table[i + 1] += fan_out_table[i]
3174 for name, offset, _entry_checksum in entries:
3175 if not (offset <= 0xFFFFFFFF):
3176 raise TypeError("pack format 1 only supports offsets < 2Gb")
3177 f.write(struct.pack(">L20s", offset, name))
3178 assert len(pack_checksum) == 20
3179 f.write(pack_checksum)
3180 return f.write_sha()
3183def _delta_encode_size(size: int) -> bytes:
3184 ret = bytearray()
3185 c = size & 0x7F
3186 size >>= 7
3187 while size:
3188 ret.append(c | 0x80)
3189 c = size & 0x7F
3190 size >>= 7
3191 ret.append(c)
3192 return bytes(ret)
3195# The length of delta compression copy operations in version 2 packs is limited
3196# to 64K. To copy more, we use several copy operations. Version 3 packs allow
3197# 24-bit lengths in copy operations, but we always make version 2 packs.
3198_MAX_COPY_LEN = 0xFFFF
3201def _encode_copy_operation(start: int, length: int) -> bytes:
3202 scratch = bytearray([0x80])
3203 for i in range(4):
3204 if start & 0xFF << i * 8:
3205 scratch.append((start >> i * 8) & 0xFF)
3206 scratch[0] |= 1 << i
3207 for i in range(2):
3208 if length & 0xFF << i * 8:
3209 scratch.append((length >> i * 8) & 0xFF)
3210 scratch[0] |= 1 << (4 + i)
3211 return bytes(scratch)
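# Worked example (illustrative): copying 0x20 bytes starting at offset 0x1000
# needs one offset byte and one length byte; the leading command byte records
# which of them are present (bit 1 for the offset byte, bit 4 for the length):
#     _encode_copy_operation(0x1000, 0x20) == b"\x92\x10\x20"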
3214def _create_delta_py(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
3215 """Use python difflib to work out how to transform base_buf to target_buf.
3217 Args:
3218 base_buf: Base buffer
3219 target_buf: Target buffer
3220 """
3221 if isinstance(base_buf, list):
3222 base_buf = b"".join(base_buf)
3223 if isinstance(target_buf, list):
3224 target_buf = b"".join(target_buf)
3225 assert isinstance(base_buf, bytes)
3226 assert isinstance(target_buf, bytes)
3227 # write delta header
3228 yield _delta_encode_size(len(base_buf))
3229 yield _delta_encode_size(len(target_buf))
3230 # write out delta opcodes
3231 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
3232 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
3233 # Git patch opcodes don't care about deletes!
3234 # if opcode == 'replace' or opcode == 'delete':
3235 # pass
3236 if opcode == "equal":
3237 # If they are equal, unpacker will use data from base_buf
3238 # Write out an opcode that says what range to use
3239 copy_start = i1
3240 copy_len = i2 - i1
3241 while copy_len > 0:
3242 to_copy = min(copy_len, _MAX_COPY_LEN)
3243 yield _encode_copy_operation(copy_start, to_copy)
3244 copy_start += to_copy
3245 copy_len -= to_copy
3246 if opcode == "replace" or opcode == "insert":
3247 # If we are replacing a range or adding one, then we just
3248 # output it to the stream (prefixed by its size)
3249 s = j2 - j1
3250 o = j1
3251 while s > 127:
3252 yield bytes([127])
3253 yield bytes(memoryview(target_buf)[o : o + 127])
3254 s -= 127
3255 o += 127
3256 yield bytes([s])
3257 yield bytes(memoryview(target_buf)[o : o + s])
3260# Default to pure Python implementation
3261create_delta = _create_delta_py
3264def apply_delta(
3265 src_buf: bytes | list[bytes], delta: bytes | list[bytes]
3266) -> list[bytes]:
3267 """Based on the similar function in git's patch-delta.c.
3269 Args:
3270 src_buf: Source buffer
3271 delta: Delta instructions
3272 """
3273 if not isinstance(src_buf, bytes):
3274 src_buf = b"".join(src_buf)
3275 if not isinstance(delta, bytes):
3276 delta = b"".join(delta)
3277 out = []
3278 index = 0
3279 delta_length = len(delta)
3281 def get_delta_header_size(delta: bytes, index: int) -> tuple[int, int]:
3282 size = 0
3283 i = 0
3284 while delta:
3285 cmd = ord(delta[index : index + 1])
3286 index += 1
3287 size |= (cmd & ~0x80) << i
3288 i += 7
3289 if not cmd & 0x80:
3290 break
3291 return size, index
3293 src_size, index = get_delta_header_size(delta, index)
3294 dest_size, index = get_delta_header_size(delta, index)
3295 if src_size != len(src_buf):
3296 raise ApplyDeltaError(
3297 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
3298 )
3299 while index < delta_length:
3300 cmd = ord(delta[index : index + 1])
3301 index += 1
3302 if cmd & 0x80:
3303 cp_off = 0
3304 for i in range(4):
3305 if cmd & (1 << i):
3306 x = ord(delta[index : index + 1])
3307 index += 1
3308 cp_off |= x << (i * 8)
3309 cp_size = 0
3310 # Version 3 packs can contain copy sizes larger than 64K.
3311 for i in range(3):
3312 if cmd & (1 << (4 + i)):
3313 x = ord(delta[index : index + 1])
3314 index += 1
3315 cp_size |= x << (i * 8)
3316 if cp_size == 0:
3317 cp_size = 0x10000
3318 if (
3319 cp_off + cp_size < cp_size
3320 or cp_off + cp_size > src_size
3321 or cp_size > dest_size
3322 ):
3323 break
3324 out.append(src_buf[cp_off : cp_off + cp_size])
3325 elif cmd != 0:
3326 out.append(delta[index : index + cmd])
3327 index += cmd
3328 else:
3329 raise ApplyDeltaError("Invalid opcode 0")
3331 if index != delta_length:
3332 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
3334 if dest_size != chunks_length(out):
3335 raise ApplyDeltaError("dest size incorrect")
3337 return out
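# Round-trip sketch (illustrative buffers): a delta produced by create_delta()
# reconstructs the target when applied to the same base.
#     base, target = b"the quick brown fox", b"the quick red fox"
#     delta = b"".join(create_delta(base, target))
#     assert b"".join(apply_delta(base, delta)) == target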
3340def write_pack_index_v2(
3341 f: IO[bytes],
3342 entries: Iterable[tuple[bytes, int, int | None]],
3343 pack_checksum: bytes,
3344) -> bytes:
3345 """Write a new pack index file.
3347 Args:
3348 f: File-like object to write to
3349 entries: List of tuples with object name (sha), offset_in_pack, and
3350 crc32_checksum.
3351 pack_checksum: Checksum of the pack file.
3352 Returns: The SHA of the index file written
3353 """
3354 f = SHA1Writer(f)
3355 f.write(b"\377tOc") # Magic!
3356 f.write(struct.pack(">L", 2))
3357 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3358 for name, offset, entry_checksum in entries:
3359 fan_out_table[ord(name[:1])] += 1
3360 # Fan-out table
3361 largetable: list[int] = []
3362 for i in range(0x100):
3363 f.write(struct.pack(b">L", fan_out_table[i]))
3364 fan_out_table[i + 1] += fan_out_table[i]
3365 for name, offset, entry_checksum in entries:
3366 f.write(name)
3367 for name, offset, entry_checksum in entries:
3368 f.write(struct.pack(b">L", entry_checksum))
3369 for name, offset, entry_checksum in entries:
3370 if offset < 2**31:
3371 f.write(struct.pack(b">L", offset))
3372 else:
3373 f.write(struct.pack(b">L", 2**31 + len(largetable)))
3374 largetable.append(offset)
3375 for offset in largetable:
3376 f.write(struct.pack(b">Q", offset))
3377 assert len(pack_checksum) == 20
3378 f.write(pack_checksum)
3379 return f.write_sha()
3382def write_pack_index_v3(
3383 f: IO[bytes],
3384 entries: Iterable[tuple[bytes, int, int | None]],
3385 pack_checksum: bytes,
3386 hash_algorithm: int = 1,
3387) -> bytes:
3388 """Write a new pack index file in v3 format.
3390 Args:
3391 f: File-like object to write to
3392 entries: List of tuples with object name (sha), offset_in_pack, and
3393 crc32_checksum.
3394 pack_checksum: Checksum of the pack file.
3395 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
3396 Returns: The SHA of the index file written
3397 """
3398 if hash_algorithm == 1:
3399 hash_size = 20 # SHA-1
3400 writer_cls = SHA1Writer
3401 elif hash_algorithm == 2:
3402 hash_size = 32 # SHA-256
3403 # TODO: Add SHA256Writer when SHA-256 support is implemented
3404 raise NotImplementedError("SHA-256 support not yet implemented")
3405 else:
3406 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
3408 # Convert entries to list to allow multiple iterations
3409 entries_list = list(entries)
3411 # Calculate shortest unambiguous prefix length for object names
3412 # For now, use full hash size (this could be optimized)
3413 shortened_oid_len = hash_size
3415 f = writer_cls(f)
3416 f.write(b"\377tOc") # Magic!
3417 f.write(struct.pack(">L", 3)) # Version 3
3418 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
3419 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
3421 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3422 for name, offset, entry_checksum in entries_list:
3423 if len(name) != hash_size:
3424 raise ValueError(
3425 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
3426 )
3427 fan_out_table[ord(name[:1])] += 1
3429 # Fan-out table
3430 largetable: list[int] = []
3431 for i in range(0x100):
3432 f.write(struct.pack(b">L", fan_out_table[i]))
3433 fan_out_table[i + 1] += fan_out_table[i]
3435 # Object names table
3436 for name, offset, entry_checksum in entries_list:
3437 f.write(name)
3439 # CRC32 checksums table
3440 for name, offset, entry_checksum in entries_list:
3441 f.write(struct.pack(b">L", entry_checksum))
3443 # Offset table
3444 for name, offset, entry_checksum in entries_list:
3445 if offset < 2**31:
3446 f.write(struct.pack(b">L", offset))
3447 else:
3448 f.write(struct.pack(b">L", 2**31 + len(largetable)))
3449 largetable.append(offset)
3451 # Large offset table
3452 for offset in largetable:
3453 f.write(struct.pack(b">Q", offset))
3455 assert len(pack_checksum) == hash_size, (
3456 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
3457 )
3458 f.write(pack_checksum)
3459 return f.write_sha()
3462def write_pack_index(
3463 f: IO[bytes],
3464 entries: Iterable[tuple[bytes, int, int | None]],
3465 pack_checksum: bytes,
3466 progress: Callable[..., None] | None = None,
3467 version: int | None = None,
3468) -> bytes:
3469 """Write a pack index file.
3471 Args:
3472 f: File-like object to write to.
3473 entries: List of (checksum, offset, crc32) tuples
3474 pack_checksum: Checksum of the pack file.
3475 progress: Progress function (not currently used)
3476 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
3478 Returns:
3479 SHA of the written index file
3480 """
3481 if version is None:
3482 version = DEFAULT_PACK_INDEX_VERSION
3484 if version == 1:
3485 return write_pack_index_v1(f, entries, pack_checksum)
3486 elif version == 2:
3487 return write_pack_index_v2(f, entries, pack_checksum)
3488 elif version == 3:
3489 return write_pack_index_v3(f, entries, pack_checksum)
3490 else:
3491 raise ValueError(f"Unsupported pack index version: {version}")
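# Usage sketch (hypothetical filename): index an existing data file with the
# default index version, using the entries and checksum computed from the pack.
#     data = PackData("pack-1234.pack")
#     with open("pack-1234.idx", "wb") as f:
#         write_pack_index(f, data.sorted_entries(), data.calculate_checksum())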
3494class Pack:
3495 """A Git pack object."""
3497 _data_load: Callable[[], PackData] | None
3498 _idx_load: Callable[[], PackIndex] | None
3500 _data: PackData | None
3501 _idx: PackIndex | None
3502 _bitmap: "PackBitmap | None"
3504 def __init__(
3505 self,
3506 basename: str,
3507 resolve_ext_ref: ResolveExtRefFn | None = None,
3508 *,
3509 delta_window_size: int | None = None,
3510 window_memory: int | None = None,
3511 delta_cache_size: int | None = None,
3512 depth: int | None = None,
3513 threads: int | None = None,
3514 big_file_threshold: int | None = None,
3515 ) -> None:
3516 """Initialize a Pack object.
3518 Args:
3519 basename: Base path for pack files (without .pack/.idx extension)
3520 resolve_ext_ref: Optional function to resolve external references
3521 delta_window_size: Size of the delta compression window
3522 window_memory: Memory limit for delta compression window
3523 delta_cache_size: Size of the delta cache
3524 depth: Maximum depth for delta chains
3525 threads: Number of threads to use for operations
3526 big_file_threshold: Size threshold for big file handling
3527 """
3528 self._basename = basename
3529 self._data = None
3530 self._idx = None
3531 self._bitmap = None
3532 self._idx_path = self._basename + ".idx"
3533 self._data_path = self._basename + ".pack"
3534 self._bitmap_path = self._basename + ".bitmap"
3535 self.delta_window_size = delta_window_size
3536 self.window_memory = window_memory
3537 self.delta_cache_size = delta_cache_size
3538 self.depth = depth
3539 self.threads = threads
3540 self.big_file_threshold = big_file_threshold
3541 self._data_load = lambda: PackData(
3542 self._data_path,
3543 delta_window_size=delta_window_size,
3544 window_memory=window_memory,
3545 delta_cache_size=delta_cache_size,
3546 depth=depth,
3547 threads=threads,
3548 big_file_threshold=big_file_threshold,
3549 )
3550 self._idx_load = lambda: load_pack_index(self._idx_path)
3551 self.resolve_ext_ref = resolve_ext_ref
3553 @classmethod
3554 def from_lazy_objects(
3555 cls, data_fn: Callable[[], PackData], idx_fn: Callable[[], PackIndex]
3556 ) -> "Pack":
3557 """Create a new pack object from callables to load pack data and index objects."""
3558 ret = cls("")
3559 ret._data_load = data_fn
3560 ret._idx_load = idx_fn
3561 return ret
3563 @classmethod
3564 def from_objects(cls, data: PackData, idx: PackIndex) -> "Pack":
3565 """Create a new pack object from pack data and index objects."""
3566 ret = cls("")
3567 ret._data = data
3568 ret._data_load = None
3569 ret._idx = idx
3570 ret._idx_load = None
3571 ret.check_length_and_checksum()
3572 return ret
3574 def name(self) -> bytes:
3575 """The SHA over the SHAs of the objects in this pack."""
3576 return self.index.objects_sha1()
3578 @property
3579 def data(self) -> PackData:
3580 """The pack data object being used."""
3581 if self._data is None:
3582 assert self._data_load
3583 self._data = self._data_load()
3584 self.check_length_and_checksum()
3585 return self._data
3587 @property
3588 def index(self) -> PackIndex:
3589 """The index being used.
3591 Note: This may be an in-memory index
3592 """
3593 if self._idx is None:
3594 assert self._idx_load
3595 self._idx = self._idx_load()
3596 return self._idx
3598 @property
3599 def bitmap(self) -> "PackBitmap | None":
3600 """The bitmap being used, if available.
3602 Returns:
3603 PackBitmap instance or None if no bitmap exists
3605 Raises:
3606 ValueError: If bitmap file is invalid or corrupt
3607 """
3608 if self._bitmap is None:
3609 from .bitmap import read_bitmap
3611 self._bitmap = read_bitmap(self._bitmap_path, pack_index=self.index)
3612 return self._bitmap
3614 def ensure_bitmap(
3615 self,
3616 object_store: "BaseObjectStore",
3617 refs: dict["Ref", "ObjectID"],
3618 commit_interval: int | None = None,
3619 progress: Callable[[str], None] | None = None,
3620 ) -> "PackBitmap":
3621 """Ensure a bitmap exists for this pack, generating one if needed.
3623 Args:
3624 object_store: Object store to read objects from
3625 refs: Dictionary of ref names to commit SHAs
3626 commit_interval: Include every Nth commit in bitmap index
3627 progress: Optional progress reporting callback
3629 Returns:
3630 PackBitmap instance (either existing or newly generated)
3631 """
3632 from .bitmap import generate_bitmap, write_bitmap
3634 # Check if bitmap already exists
3635 try:
3636 existing = self.bitmap
3637 if existing is not None:
3638 return existing
3639 except FileNotFoundError:
3640 pass # No bitmap, we'll generate one
3642 # Generate new bitmap
3643 if progress:
3644 progress(f"Generating bitmap for {self.name().decode('utf-8')}...\n")
3646 pack_bitmap = generate_bitmap(
3647 self.index,
3648 object_store,
3649 refs,
3650 self.get_stored_checksum(),
3651 commit_interval=commit_interval,
3652 progress=progress,
3653 )
3655 # Write bitmap file
3656 write_bitmap(self._bitmap_path, pack_bitmap)
3658 if progress:
3659 progress(f"Wrote {self._bitmap_path}\n")
3661 # Update cached bitmap
3662 self._bitmap = pack_bitmap
3664 return pack_bitmap
3666 def close(self) -> None:
3667 """Close the pack file and index."""
3668 if self._data is not None:
3669 self._data.close()
3670 if self._idx is not None:
3671 self._idx.close()
3673 def __enter__(self) -> "Pack":
3674 """Enter context manager."""
3675 return self
3677 def __exit__(
3678 self,
3679 exc_type: type | None,
3680 exc_val: BaseException | None,
3681 exc_tb: TracebackType | None,
3682 ) -> None:
3683 """Exit context manager."""
3684 self.close()
3686 def __eq__(self, other: object) -> bool:
3687 """Check equality with another pack."""
3688 if not isinstance(other, Pack):
3689 return False
3690 return self.index == other.index
3692 def __len__(self) -> int:
3693 """Number of entries in this pack."""
3694 return len(self.index)
3696 def __repr__(self) -> str:
3697 """Return string representation of this pack."""
3698 return f"{self.__class__.__name__}({self._basename!r})"
3700 def __iter__(self) -> Iterator[ObjectID]:
3701 """Iterate over all the sha1s of the objects in this pack."""
3702 return iter(self.index)
3704 def check_length_and_checksum(self) -> None:
3705 """Sanity check the length and checksum of the pack index and data."""
3706 assert len(self.index) == len(self.data), (
3707 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
3708 )
3709 idx_stored_checksum = self.index.get_pack_checksum()
3710 data_stored_checksum = self.data.get_stored_checksum()
3711 if (
3712 idx_stored_checksum is not None
3713 and idx_stored_checksum != data_stored_checksum
3714 ):
3715 raise ChecksumMismatch(
3716 sha_to_hex(RawObjectID(idx_stored_checksum)),
3717 sha_to_hex(RawObjectID(data_stored_checksum)),
3718 )
3720 def check(self) -> None:
3721 """Check the integrity of this pack.
3723 Raises:
3724 ChecksumMismatch: if a checksum for the index or data is wrong
3725 """
3726 self.index.check()
3727 self.data.check()
3728 for obj in self.iterobjects():
3729 obj.check()
3730 # TODO: object connectivity checks
3732 def get_stored_checksum(self) -> bytes:
3733 """Return the stored checksum of the pack data."""
3734 return self.data.get_stored_checksum()
3736 def pack_tuples(self) -> list[tuple[ShaFile, None]]:
3737 """Return pack tuples for all objects in pack."""
3738 return [(o, None) for o in self.iterobjects()]
3740 def __contains__(self, sha1: ObjectID | RawObjectID) -> bool:
3741 """Check whether this pack contains a particular SHA1."""
3742 try:
3743 self.index.object_offset(sha1)
3744 return True
3745 except KeyError:
3746 return False
3748 def get_raw(self, sha1: RawObjectID | ObjectID) -> tuple[int, bytes]:
3749 """Get raw object data by SHA1."""
3750 offset = self.index.object_offset(sha1)
3751 obj_type, obj = self.data.get_object_at(offset)
3752 type_num, chunks = self.resolve_object(offset, obj_type, obj)
3753 return type_num, b"".join(chunks) # type: ignore[arg-type]
3755 def __getitem__(self, sha1: "ObjectID | RawObjectID") -> ShaFile:
3756 """Retrieve the specified SHA1."""
3757 type, uncomp = self.get_raw(sha1)
3758 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
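# Illustrative lookup sketch (the SHA value is a placeholder):
#
#     sha = b"0123456789" * 4                  # 40-byte hex ObjectID
#     if sha in pack:                          # consults the index only
#         type_num, raw = pack.get_raw(sha)    # resolved, undeltified bytes
#         obj = pack[sha]                      # the same data as a ShaFile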
3760 def iterobjects(self) -> Iterator[ShaFile]:
3761 """Iterate over the objects in this pack."""
3762 return iter(
3763 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
3764 )
3766 def iterobjects_subset(
3767 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
3768 ) -> Iterator[ShaFile]:
3769 """Iterate over a subset of objects in this pack."""
3770 return (
3771 uo
3772 for uo in PackInflater.for_pack_subset(
3773 self,
3774 shas,
3775 allow_missing=allow_missing,
3776 resolve_ext_ref=self.resolve_ext_ref,
3777 )
3778 if uo.id in shas
3779 )
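# Illustrative iteration sketch (wanted_shas is a placeholder set of hex
# ObjectIDs):
#
#     for obj in pack.iterobjects():           # every object, deltas resolved
#         ...
#     for obj in pack.iterobjects_subset(wanted_shas, allow_missing=True):
#         ...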
3781 def iter_unpacked_subset(
3782 self,
3783 shas: Iterable[ObjectID | RawObjectID],
3784 *,
3785 include_comp: bool = False,
3786 allow_missing: bool = False,
3787 convert_ofs_delta: bool = False,
3788 ) -> Iterator[UnpackedObject]:
3789 """Iterate over unpacked objects in subset."""
3790 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
3791 ofs: dict[int, bytes] = {}
3792 todo: set[ObjectID | RawObjectID] = set(shas)
3793 for unpacked in self.iter_unpacked(include_comp=include_comp):
3794 sha = unpacked.sha()
3795 if unpacked.offset is not None:
3796 ofs[unpacked.offset] = sha
3797 hexsha = sha_to_hex(RawObjectID(sha))
3798 if hexsha in todo:
3799 if unpacked.pack_type_num == OFS_DELTA:
3800 assert isinstance(unpacked.delta_base, int)
3801 assert unpacked.offset is not None
3802 base_offset = unpacked.offset - unpacked.delta_base
3803 try:
3804 unpacked.delta_base = ofs[base_offset]
3805 except KeyError:
3806 ofs_pending[base_offset].append(unpacked)
3807 continue
3808 else:
3809 unpacked.pack_type_num = REF_DELTA
3810 yield unpacked
3811 todo.remove(hexsha)
3812 if unpacked.offset is not None:
3813 for child in ofs_pending.pop(unpacked.offset, []):
3814 child.pack_type_num = REF_DELTA
3815 child.delta_base = sha
3816 yield child
3817 assert not ofs_pending
3818 if not allow_missing and todo:
3819 raise UnresolvedDeltas(list(todo))
3821 def iter_unpacked(self, include_comp: bool = False) -> Iterator[UnpackedObject]:
3822 """Iterate over all unpacked objects in this pack."""
3823 ofs_to_entries = {
3824 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
3825 }
3826 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
3827 assert unpacked.offset is not None
3828 (sha, crc32) = ofs_to_entries[unpacked.offset]
3829 unpacked._sha = sha
3830 unpacked.crc32 = crc32
3831 yield unpacked
3833 def keep(self, msg: bytes | None = None) -> str:
3834 """Add a .keep file for the pack, preventing git from garbage collecting it.
3836 Args:
3837 msg: A message written inside the .keep file; can be used later
3838 to determine whether or not a .keep file is obsolete.
3839 Returns: The path of the .keep file, as a string.
3840 """
3841 keepfile_name = f"{self._basename}.keep"
3842 with GitFile(keepfile_name, "wb") as keepfile:
3843 if msg:
3844 keepfile.write(msg)
3845 keepfile.write(b"\n")
3846 return keepfile_name
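# Illustrative sketch (the message content is a placeholder):
#
#     keep_path = pack.keep(b"kept during fetch")
#     # removing the file later (os.remove(keep_path)) makes the pack
#     # eligible for garbage collection again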
3848 def get_ref(
3849 self, sha: RawObjectID | ObjectID
3850 ) -> tuple[int | None, int, OldUnpackedObject]:
3851 """Get the object for a ref SHA, only looking in this pack."""
3852 # TODO: cache these results
3853 try:
3854 offset = self.index.object_offset(sha)
3855 except KeyError:
3856 offset = None
3857 if offset:
3858 type, obj = self.data.get_object_at(offset)
3859 elif self.resolve_ext_ref:
3860 type, obj = self.resolve_ext_ref(sha)
3861 else:
3862 raise KeyError(sha)
3863 return offset, type, obj
3865 def resolve_object(
3866 self,
3867 offset: int,
3868 type: int,
3869 obj: OldUnpackedObject,
3870 get_ref: Callable[
3871 [RawObjectID | ObjectID], tuple[int | None, int, OldUnpackedObject]
3872 ]
3873 | None = None,
3874 ) -> tuple[int, OldUnpackedObject]:
3875 """Resolve an object, possibly resolving deltas when necessary.
3877 Returns: Tuple with object type and contents.
3878 """
3879 # Walk down the delta chain, building a stack of deltas to reach
3880 # the requested object.
3881 base_offset: int | None = offset
3882 base_type = type
3883 base_obj = obj
3884 delta_stack = []
3885 while base_type in DELTA_TYPES:
3886 prev_offset = base_offset
3887 if get_ref is None:
3888 get_ref = self.get_ref
3889 if base_type == OFS_DELTA:
3890 (delta_offset, delta) = base_obj
3891 # TODO: clean up asserts and replace with nicer error messages
3892 assert isinstance(delta_offset, int), (
3893 f"Expected int, got {delta_offset.__class__}"
3894 )
3895 assert base_offset is not None
3896 base_offset = base_offset - delta_offset
3897 base_type, base_obj = self.data.get_object_at(base_offset)
3898 assert isinstance(base_type, int)
3899 elif base_type == REF_DELTA:
3900 (basename, delta) = base_obj
3901 assert isinstance(basename, bytes) and len(basename) == 20
3902 base_offset, base_type, base_obj = get_ref(cast(RawObjectID, basename))
3903 assert isinstance(base_type, int)
3904 if base_offset == prev_offset: # object is based on itself
3905 raise UnresolvedDeltas([basename])
3906 delta_stack.append((prev_offset, base_type, delta))
3908 # Now grab the base object (mustn't be a delta) and apply the
3909 # deltas all the way up the stack.
3910 chunks = base_obj
3911 for prev_offset, _delta_type, delta in reversed(delta_stack):
3912 # Convert chunks to bytes for apply_delta if needed
3913 if isinstance(chunks, list):
3914 chunks_bytes = b"".join(chunks)
3915 elif isinstance(chunks, tuple):
3916 # For tuple type, second element is the actual data
3917 _, chunk_data = chunks
3918 if isinstance(chunk_data, list):
3919 chunks_bytes = b"".join(chunk_data)
3920 else:
3921 chunks_bytes = chunk_data
3922 else:
3923 chunks_bytes = chunks
3925 # Apply delta and get result as list
3926 chunks = apply_delta(chunks_bytes, delta)
3928 if prev_offset is not None:
3929 self.data._offset_cache[prev_offset] = base_type, chunks
3930 return base_type, chunks
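# Illustrative sketch of a single delta application using the module-level
# apply_delta helper used above (base_bytes/delta_bytes are placeholders):
#
#     chunks = apply_delta(base_bytes, delta_bytes)   # list of bytes chunks
#     restored = b"".join(chunks)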
3932 def entries(
3933 self, progress: Callable[[int, int], None] | None = None
3934 ) -> Iterator[PackIndexEntry]:
3935 """Yield entries summarizing the contents of this pack.
3937 Args:
3938 progress: Progress function, called with current and total
3939 object count.
3940 Returns: Iterator of tuples with (sha, offset, crc32)
3941 """
3942 return self.data.iterentries(
3943 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3944 )
3946 def sorted_entries(
3947 self, progress: ProgressFn | None = None
3948 ) -> Iterator[PackIndexEntry]:
3949 """Return entries in this pack, sorted by SHA.
3951 Args:
3952 progress: Progress function, called with current and total
3953 object count
3954 Returns: Iterator of tuples with (sha, offset, crc32)
3955 """
3956 return iter(
3957 self.data.sorted_entries(
3958 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3959 )
3960 )
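# Illustrative sketch (entry SHAs are raw 20-byte digests, as elsewhere in
# this module):
#
#     for sha, offset, crc32 in pack.sorted_entries():
#         print(sha_to_hex(RawObjectID(sha)), offset)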
3962 def get_unpacked_object(
3963 self,
3964 sha: ObjectID | RawObjectID,
3965 *,
3966 include_comp: bool = False,
3967 convert_ofs_delta: bool = True,
3968 ) -> UnpackedObject:
3969 """Get the unpacked object for a sha.
3971 Args:
3972 sha: SHA of object to fetch
3973 include_comp: Whether to include compression data in UnpackedObject
3974 convert_ofs_delta: Whether to convert offset deltas to ref deltas
3975 """
3976 offset = self.index.object_offset(sha)
3977 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3978 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3979 assert isinstance(unpacked.delta_base, int)
3980 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3981 unpacked.pack_type_num = REF_DELTA
3982 return unpacked
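# Illustrative sketch (sha is a placeholder ObjectID):
#
#     uo = pack.get_unpacked_object(sha)       # convert_ofs_delta defaults to True
#     if uo.pack_type_num == REF_DELTA:
#         base = uo.delta_base                 # SHA of the delta base object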
3985def extend_pack(
3986 f: BinaryIO,
3987 object_ids: Set["RawObjectID"],
3988 get_raw: Callable[["RawObjectID | ObjectID"], tuple[int, bytes]],
3989 *,
3990 compression_level: int = -1,
3991 progress: Callable[[bytes], None] | None = None,
3992) -> tuple[bytes, list[tuple["RawObjectID", int, int]]]:
3993 """Extend a pack file with more objects.
3995 The caller should make sure that object_ids does not contain any objects
3996 that are already in the pack.
Returns: Tuple of the new pack checksum and a list of (sha, offset, crc32)
entries for the appended objects.
3997 """
3998 # Update the header with the new number of objects.
3999 f.seek(0)
4000 _version, num_objects = read_pack_header(f.read)
4002 if object_ids:
4003 f.seek(0)
4004 write_pack_header(f.write, num_objects + len(object_ids))
4006 # Must flush before reading (http://bugs.python.org/issue3207)
4007 f.flush()
4009 # Rescan the rest of the pack, computing the SHA with the new header.
4010 new_sha = compute_file_sha(f, end_ofs=-20)
4012 # Must reposition before writing (http://bugs.python.org/issue3207)
4013 f.seek(0, os.SEEK_CUR)
4015 extra_entries = []
4017 # Complete the pack.
4018 for i, object_id in enumerate(object_ids):
4019 if progress is not None:
4020 progress(
4021 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
4022 )
4023 assert len(object_id) == 20
4024 type_num, data = get_raw(object_id)
4025 offset = f.tell()
4026 crc32 = write_pack_object(
4027 f.write,
4028 type_num,
4029 [data], # write_pack_object expects a list of chunks
4030 sha=new_sha,
4031 compression_level=compression_level,
4032 )
4033 extra_entries.append((object_id, offset, crc32))
4034 pack_sha = new_sha.digest()
4035 f.write(pack_sha)
4036 return pack_sha, extra_entries
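# Illustrative sketch of extend_pack (pack_path, missing_ids and object_store
# are assumptions; get_raw must return (type_num, bytes) as in the signature
# above, and progress receives bytes):
#
#     with open(pack_path, "r+b") as f:
#         pack_sha, new_entries = extend_pack(
#             f,
#             missing_ids,                     # raw 20-byte SHAs not yet in the pack
#             object_store.get_raw,
#             progress=lambda msg: sys.stderr.buffer.write(msg),
#         )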
4039try:
4040 from dulwich._pack import ( # type: ignore
4041 apply_delta,
4042 bisect_find_sha,
4043 )
4044except ImportError:
4045 pass
4047# Try to import the Rust version of create_delta
4048try:
4049 from dulwich._pack import create_delta as _create_delta_rs
4050except ImportError:
4051 pass
4052else:
4053 # The Rust create_delta returns a single bytes object; wrap it so callers
4053 # get an Iterator[bytes] of chunks, matching the pure-Python API.
4054 def _create_delta_rs_wrapper(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
4055 """Wrapper for Rust create_delta to match Python API."""
4056 yield _create_delta_rs(base_buf, target_buf)
4058 create_delta = _create_delta_rs_wrapper
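# Illustrative sketch: with either backend, create_delta yields delta chunks,
# so callers can simply join the result (base_buf/target_buf are placeholders):
#
#     delta = b"".join(create_delta(base_buf, target_buf))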