Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 23%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28They have two parts, the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in all of the index files 'til you find a
32match for the object name. You then use the pointer got from this as
33a pointer in to the corresponding packfile.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence, Set
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from types import TracebackType
57from typing import (
58 IO,
59 TYPE_CHECKING,
60 Any,
61 BinaryIO,
62 Callable,
63 Generic,
64 Optional,
65 Protocol,
66 TypeVar,
67 Union,
68)
70try:
71 import mmap
72except ImportError:
73 has_mmap = False
74else:
75 has_mmap = True
77if sys.version_info >= (3, 12):
78 from collections.abc import Buffer
79else:
80 Buffer = Union[bytes, bytearray, memoryview]
82if TYPE_CHECKING:
83 from _hashlib import HASH as HashObject
85 from .bitmap import PackBitmap
86 from .commit_graph import CommitGraph
88# For some reason the above try, except fails to set has_mmap = False for plan9
89if sys.platform == "Plan9":
90 has_mmap = False
92from . import replace_me
93from .errors import ApplyDeltaError, ChecksumMismatch
94from .file import GitFile, _GitFile
95from .lru_cache import LRUSizeCache
96from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
98OFS_DELTA = 6
99REF_DELTA = 7
101DELTA_TYPES = (OFS_DELTA, REF_DELTA)
104DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
106# Keep pack files under 16Mb in memory, otherwise write them out to disk
107PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
109# Default pack index version to use when none is specified
110DEFAULT_PACK_INDEX_VERSION = 2
113OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
114ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
115ProgressFn = Callable[[int, str], None]
116PackHint = tuple[int, Optional[bytes]]
119class UnresolvedDeltas(Exception):
120 """Delta objects could not be resolved."""
122 def __init__(self, shas: list[bytes]) -> None:
123 """Initialize UnresolvedDeltas exception.
125 Args:
126 shas: List of SHA hashes for unresolved delta objects
127 """
128 self.shas = shas
131class ObjectContainer(Protocol):
132 """Protocol for objects that can contain git objects."""
134 def add_object(self, obj: ShaFile) -> None:
135 """Add a single object to this object store."""
137 def add_objects(
138 self,
139 objects: Sequence[tuple[ShaFile, Optional[str]]],
140 progress: Optional[Callable[..., None]] = None,
141 ) -> Optional["Pack"]:
142 """Add a set of objects to this object store.
144 Args:
145 objects: Iterable over a list of (object, path) tuples
146 progress: Progress callback for object insertion
147 Returns: Optional Pack object of the objects written.
148 """
150 def __contains__(self, sha1: bytes) -> bool:
151 """Check if a hex sha is present."""
153 def __getitem__(self, sha1: bytes) -> ShaFile:
154 """Retrieve an object."""
156 def get_commit_graph(self) -> Optional["CommitGraph"]:
157 """Get the commit graph for this object store.
159 Returns:
160 CommitGraph object if available, None otherwise
161 """
162 return None
165class PackedObjectContainer(ObjectContainer):
166 """Container for objects packed in a pack file."""
168 def get_unpacked_object(
169 self, sha1: bytes, *, include_comp: bool = False
170 ) -> "UnpackedObject":
171 """Get a raw unresolved object.
173 Args:
174 sha1: SHA-1 hash of the object
175 include_comp: Whether to include compressed data
177 Returns:
178 UnpackedObject instance
179 """
180 raise NotImplementedError(self.get_unpacked_object)
182 def iterobjects_subset(
183 self, shas: Iterable[bytes], *, allow_missing: bool = False
184 ) -> Iterator[ShaFile]:
185 """Iterate over a subset of objects.
187 Args:
188 shas: Iterable of object SHAs to retrieve
189 allow_missing: If True, skip missing objects
191 Returns:
192 Iterator of ShaFile objects
193 """
194 raise NotImplementedError(self.iterobjects_subset)
196 def iter_unpacked_subset(
197 self,
198 shas: Iterable[bytes],
199 *,
200 include_comp: bool = False,
201 allow_missing: bool = False,
202 convert_ofs_delta: bool = True,
203 ) -> Iterator["UnpackedObject"]:
204 """Iterate over unpacked objects from a subset of SHAs.
206 Args:
207 shas: Set of object SHAs to retrieve
208 include_comp: Include compressed data if True
209 allow_missing: If True, skip missing objects
210 convert_ofs_delta: If True, convert offset deltas to ref deltas
212 Returns:
213 Iterator of UnpackedObject instances
214 """
215 raise NotImplementedError(self.iter_unpacked_subset)
218class UnpackedObjectStream:
219 """Abstract base class for a stream of unpacked objects."""
221 def __iter__(self) -> Iterator["UnpackedObject"]:
222 """Iterate over unpacked objects."""
223 raise NotImplementedError(self.__iter__)
225 def __len__(self) -> int:
226 """Return the number of objects in the stream."""
227 raise NotImplementedError(self.__len__)
230def take_msb_bytes(
231 read: Callable[[int], bytes], crc32: Optional[int] = None
232) -> tuple[list[int], Optional[int]]:
233 """Read bytes marked with most significant bit.
235 Args:
236 read: Read function
237 crc32: Optional CRC32 checksum to update
239 Returns:
240 Tuple of (list of bytes read, updated CRC32 or None)
241 """
242 ret: list[int] = []
243 while len(ret) == 0 or ret[-1] & 0x80:
244 b = read(1)
245 if crc32 is not None:
246 crc32 = binascii.crc32(b, crc32)
247 ret.append(ord(b[:1]))
248 return ret, crc32
251class PackFileDisappeared(Exception):
252 """Raised when a pack file unexpectedly disappears."""
254 def __init__(self, obj: object) -> None:
255 """Initialize PackFileDisappeared exception.
257 Args:
258 obj: The object that triggered the exception
259 """
260 self.obj = obj
263class UnpackedObject:
264 """Class encapsulating an object unpacked from a pack file.
266 These objects should only be created from within unpack_object. Most
267 members start out as empty and are filled in at various points by
268 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
270 End users of this object should take care that the function they're getting
271 this object from is guaranteed to set the members they need.
272 """
274 __slots__ = [
275 "_sha", # Cached binary SHA.
276 "comp_chunks", # Compressed object chunks.
277 "crc32", # CRC32.
278 "decomp_chunks", # Decompressed object chunks.
279 "decomp_len", # Decompressed length of this object.
280 "delta_base", # Delta base offset or SHA.
281 "obj_chunks", # Decompressed and delta-resolved chunks.
282 "obj_type_num", # Type of this object.
283 "offset", # Offset in its pack.
284 "pack_type_num", # Type of this object in the pack (may be a delta).
285 ]
287 obj_type_num: Optional[int]
288 obj_chunks: Optional[list[bytes]]
289 delta_base: Union[None, bytes, int]
290 decomp_chunks: list[bytes]
291 comp_chunks: Optional[list[bytes]]
292 decomp_len: Optional[int]
293 crc32: Optional[int]
294 offset: Optional[int]
295 pack_type_num: int
296 _sha: Optional[bytes]
298 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
299 # methods of this object.
300 def __init__(
301 self,
302 pack_type_num: int,
303 *,
304 delta_base: Union[None, bytes, int] = None,
305 decomp_len: Optional[int] = None,
306 crc32: Optional[int] = None,
307 sha: Optional[bytes] = None,
308 decomp_chunks: Optional[list[bytes]] = None,
309 offset: Optional[int] = None,
310 ) -> None:
311 """Initialize an UnpackedObject.
313 Args:
314 pack_type_num: Type number of this object in the pack
315 delta_base: Delta base (offset or SHA) if this is a delta object
316 decomp_len: Decompressed length of this object
317 crc32: CRC32 checksum
318 sha: SHA-1 hash of the object
319 decomp_chunks: Decompressed chunks
320 offset: Offset in the pack file
321 """
322 self.offset = offset
323 self._sha = sha
324 self.pack_type_num = pack_type_num
325 self.delta_base = delta_base
326 self.comp_chunks = None
327 self.decomp_chunks: list[bytes] = decomp_chunks or []
328 if decomp_chunks is not None and decomp_len is None:
329 self.decomp_len = sum(map(len, decomp_chunks))
330 else:
331 self.decomp_len = decomp_len
332 self.crc32 = crc32
334 if pack_type_num in DELTA_TYPES:
335 self.obj_type_num = None
336 self.obj_chunks = None
337 else:
338 self.obj_type_num = pack_type_num
339 self.obj_chunks = self.decomp_chunks
340 self.delta_base = delta_base
342 def sha(self) -> bytes:
343 """Return the binary SHA of this object."""
344 if self._sha is None:
345 assert self.obj_type_num is not None and self.obj_chunks is not None
346 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
347 return self._sha
349 def sha_file(self) -> ShaFile:
350 """Return a ShaFile from this object."""
351 assert self.obj_type_num is not None and self.obj_chunks is not None
352 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
354 # Only provided for backwards compatibility with code that expects either
355 # chunks or a delta tuple.
356 def _obj(self) -> OldUnpackedObject:
357 """Return the decompressed chunks, or (delta base, delta chunks)."""
358 if self.pack_type_num in DELTA_TYPES:
359 assert isinstance(self.delta_base, (bytes, int))
360 return (self.delta_base, self.decomp_chunks)
361 else:
362 return self.decomp_chunks
364 def __eq__(self, other: object) -> bool:
365 """Check equality with another UnpackedObject."""
366 if not isinstance(other, UnpackedObject):
367 return False
368 for slot in self.__slots__:
369 if getattr(self, slot) != getattr(other, slot):
370 return False
371 return True
373 def __ne__(self, other: object) -> bool:
374 """Check inequality with another UnpackedObject."""
375 return not (self == other)
377 def __repr__(self) -> str:
378 """Return string representation of this UnpackedObject."""
379 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
380 return "{}({})".format(self.__class__.__name__, ", ".join(data))
383_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
386def read_zlib_chunks(
387 read_some: Callable[[int], bytes],
388 unpacked: UnpackedObject,
389 include_comp: bool = False,
390 buffer_size: int = _ZLIB_BUFSIZE,
391) -> bytes:
392 """Read zlib data from a buffer.
394 This function requires that the buffer have additional data following the
395 compressed data, which is guaranteed to be the case for git pack files.
397 Args:
398 read_some: Read function that returns at least one byte, but may
399 return less than the requested size.
400 unpacked: An UnpackedObject to write result data to. If its crc32
401 attr is not None, the CRC32 of the compressed bytes will be computed
402 using this starting CRC32.
403 After this function, will have the following attrs set:
404 * comp_chunks (if include_comp is True)
405 * decomp_chunks
406 * decomp_len
407 * crc32
408 include_comp: If True, include compressed data in the result.
409 buffer_size: Size of the read buffer.
410 Returns: Leftover unused data from the decompression.
412 Raises:
413 zlib.error: if a decompression error occurred.
414 """
415 if unpacked.decomp_len is None or unpacked.decomp_len <= -1:
416 raise ValueError("non-negative zlib data stream size expected")
417 decomp_obj = zlib.decompressobj()
419 comp_chunks = []
420 decomp_chunks = unpacked.decomp_chunks
421 decomp_len = 0
422 crc32 = unpacked.crc32
424 while True:
425 add = read_some(buffer_size)
426 if not add:
427 raise zlib.error("EOF before end of zlib stream")
428 comp_chunks.append(add)
429 decomp = decomp_obj.decompress(add)
430 decomp_len += len(decomp)
431 decomp_chunks.append(decomp)
432 unused = decomp_obj.unused_data
433 if unused:
434 left = len(unused)
435 if crc32 is not None:
436 crc32 = binascii.crc32(add[:-left], crc32)
437 if include_comp:
438 comp_chunks[-1] = add[:-left]
439 break
440 elif crc32 is not None:
441 crc32 = binascii.crc32(add, crc32)
442 if crc32 is not None:
443 crc32 &= 0xFFFFFFFF
445 if decomp_len != unpacked.decomp_len:
446 raise zlib.error("decompressed data does not match expected size")
448 unpacked.crc32 = crc32
449 if include_comp:
450 unpacked.comp_chunks = comp_chunks
451 return unused
454def iter_sha1(iter: Iterable[bytes]) -> bytes:
455 """Return the hexdigest of the SHA1 over a set of names.
457 Args:
458 iter: Iterator over string objects
459 Returns: 40-byte hex sha1 digest
460 """
461 sha = sha1()
462 for name in iter:
463 sha.update(name)
464 return sha.hexdigest().encode("ascii")
467def load_pack_index(path: Union[str, os.PathLike[str]]) -> "PackIndex":
468 """Load an index file by path.
470 Args:
471 path: Path to the index file
472 Returns: A PackIndex loaded from the given path
473 """
474 with GitFile(path, "rb") as f:
475 return load_pack_index_file(path, f)
478def _load_file_contents(
479 f: Union[IO[bytes], _GitFile], size: Optional[int] = None
480) -> tuple[Union[bytes, Any], int]:
481 """Load contents from a file, preferring mmap when possible.
483 Args:
484 f: File-like object to load
485 size: Expected size, or None to determine from file
486 Returns: Tuple of (contents, size)
487 """
488 try:
489 fd = f.fileno()
490 except (UnsupportedOperation, AttributeError):
491 fd = None
492 # Attempt to use mmap if possible
493 if fd is not None:
494 if size is None:
495 size = os.fstat(fd).st_size
496 if has_mmap:
497 try:
498 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
499 except (OSError, ValueError):
500 # Can't mmap - perhaps a socket or invalid file descriptor
501 pass
502 else:
503 return contents, size
504 contents_bytes = f.read()
505 size = len(contents_bytes)
506 return contents_bytes, size
509def load_pack_index_file(
510 path: Union[str, os.PathLike[str]], f: Union[IO[bytes], _GitFile]
511) -> "PackIndex":
512 """Load an index file from a file-like object.
514 Args:
515 path: Path for the index file
516 f: File-like object
517 Returns: A PackIndex loaded from the given file
518 """
519 contents, size = _load_file_contents(f)
520 if contents[:4] == b"\377tOc":
521 version = struct.unpack(b">L", contents[4:8])[0]
522 if version == 2:
523 return PackIndex2(path, file=f, contents=contents, size=size)
524 elif version == 3:
525 return PackIndex3(path, file=f, contents=contents, size=size)
526 else:
527 raise KeyError(f"Unknown pack index format {version}")
528 else:
529 return PackIndex1(path, file=f, contents=contents, size=size)
532def bisect_find_sha(
533 start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes]
534) -> Optional[int]:
535 """Find a SHA in a data blob with sorted SHAs.
537 Args:
538 start: Start index of range to search
539 end: End index of range to search
540 sha: Sha to find
541 unpack_name: Callback to retrieve SHA by index
542 Returns: Index of the SHA, or None if it wasn't found
543 """
544 assert start <= end
545 while start <= end:
546 i = (start + end) // 2
547 file_sha = unpack_name(i)
548 if file_sha < sha:
549 start = i + 1
550 elif file_sha > sha:
551 end = i - 1
552 else:
553 return i
554 return None
557PackIndexEntry = tuple[bytes, int, Optional[int]]
560class PackIndex:
561 """An index in to a packfile.
563 Given a sha id of an object a pack index can tell you the location in the
564 packfile of that object if it has it.
565 """
567 # Default to SHA-1 for backward compatibility
568 hash_algorithm = 1
569 hash_size = 20
571 def __eq__(self, other: object) -> bool:
572 """Check equality with another PackIndex."""
573 if not isinstance(other, PackIndex):
574 return False
576 for (name1, _, _), (name2, _, _) in zip(
577 self.iterentries(), other.iterentries()
578 ):
579 if name1 != name2:
580 return False
581 return True
583 def __ne__(self, other: object) -> bool:
584 """Check if this pack index is not equal to another."""
585 return not self.__eq__(other)
587 def __len__(self) -> int:
588 """Return the number of entries in this pack index."""
589 raise NotImplementedError(self.__len__)
591 def __iter__(self) -> Iterator[bytes]:
592 """Iterate over the SHAs in this pack."""
593 return map(sha_to_hex, self._itersha())
595 def iterentries(self) -> Iterator[PackIndexEntry]:
596 """Iterate over the entries in this pack index.
598 Returns: iterator over tuples with object name, offset in packfile and
599 crc32 checksum.
600 """
601 raise NotImplementedError(self.iterentries)
603 def get_pack_checksum(self) -> Optional[bytes]:
604 """Return the SHA1 checksum stored for the corresponding packfile.
606 Returns: 20-byte binary digest, or None if not available
607 """
608 raise NotImplementedError(self.get_pack_checksum)
610 @replace_me(since="0.21.0", remove_in="0.23.0")
611 def object_index(self, sha: bytes) -> int:
612 """Return the index for the given SHA.
614 Args:
615 sha: SHA-1 hash
617 Returns:
618 Index position
619 """
620 return self.object_offset(sha)
622 def object_offset(self, sha: bytes) -> int:
623 """Return the offset in to the corresponding packfile for the object.
625 Given the name of an object it will return the offset that object
626 lives at within the corresponding pack file. If the pack file doesn't
627 have the object then None will be returned.
628 """
629 raise NotImplementedError(self.object_offset)
631 def object_sha1(self, index: int) -> bytes:
632 """Return the SHA1 corresponding to the index in the pack file."""
633 for name, offset, _crc32 in self.iterentries():
634 if offset == index:
635 return name
636 else:
637 raise KeyError(index)
639 def _object_offset(self, sha: bytes) -> int:
640 """See object_offset.
642 Args:
643 sha: A *binary* SHA string. (20 characters long)_
644 """
645 raise NotImplementedError(self._object_offset)
647 def objects_sha1(self) -> bytes:
648 """Return the hex SHA1 over all the shas of all objects in this pack.
650 Note: This is used for the filename of the pack.
651 """
652 return iter_sha1(self._itersha())
654 def _itersha(self) -> Iterator[bytes]:
655 """Yield all the SHA1's of the objects in the index, sorted."""
656 raise NotImplementedError(self._itersha)
658 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
659 """Iterate over all SHA1s with the given prefix.
661 Args:
662 prefix: Binary prefix to match
663 Returns: Iterator of matching SHA1s
664 """
665 # Default implementation for PackIndex classes that don't override
666 for sha, _, _ in self.iterentries():
667 if sha.startswith(prefix):
668 yield sha
670 def close(self) -> None:
671 """Close any open files."""
673 def check(self) -> None:
674 """Check the consistency of this pack index."""
677class MemoryPackIndex(PackIndex):
678 """Pack index that is stored entirely in memory."""
680 def __init__(
681 self,
682 entries: list[tuple[bytes, int, Optional[int]]],
683 pack_checksum: Optional[bytes] = None,
684 ) -> None:
685 """Create a new MemoryPackIndex.
687 Args:
688 entries: Sequence of name, idx, crc32 (sorted)
689 pack_checksum: Optional pack checksum
690 """
691 self._by_sha = {}
692 self._by_offset = {}
693 for name, offset, _crc32 in entries:
694 self._by_sha[name] = offset
695 self._by_offset[offset] = name
696 self._entries = entries
697 self._pack_checksum = pack_checksum
699 def get_pack_checksum(self) -> Optional[bytes]:
700 """Return the SHA checksum stored for the corresponding packfile."""
701 return self._pack_checksum
703 def __len__(self) -> int:
704 """Return the number of entries in this pack index."""
705 return len(self._entries)
707 def object_offset(self, sha: bytes) -> int:
708 """Return the offset for the given SHA.
710 Args:
711 sha: SHA to look up (binary or hex)
712 Returns: Offset in the pack file
713 """
714 if len(sha) == 40:
715 sha = hex_to_sha(sha)
716 return self._by_sha[sha]
718 def object_sha1(self, offset: int) -> bytes:
719 """Return the SHA1 for the object at the given offset."""
720 return self._by_offset[offset]
722 def _itersha(self) -> Iterator[bytes]:
723 """Iterate over all SHA1s in the index."""
724 return iter(self._by_sha)
726 def iterentries(self) -> Iterator[PackIndexEntry]:
727 """Iterate over all index entries."""
728 return iter(self._entries)
730 @classmethod
731 def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex":
732 """Create a MemoryPackIndex from a PackData object."""
733 return MemoryPackIndex(
734 list(pack_data.sorted_entries()), pack_data.get_stored_checksum()
735 )
737 @classmethod
738 def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex":
739 """Create a copy of another PackIndex in memory."""
740 return cls(list(other_index.iterentries()), other_index.get_pack_checksum())
743class FilePackIndex(PackIndex):
744 """Pack index that is based on a file.
746 To do the loop it opens the file, and indexes first 256 4 byte groups
747 with the first byte of the sha id. The value in the four byte group indexed
748 is the end of the group that shares the same starting byte. Subtract one
749 from the starting byte and index again to find the start of the group.
750 The values are sorted by sha id within the group, so do the math to find
751 the start and end offset and then bisect in to find if the value is
752 present.
753 """
755 _fan_out_table: list[int]
756 _file: Union[IO[bytes], _GitFile]
758 def __init__(
759 self,
760 filename: Union[str, os.PathLike[str]],
761 file: Optional[Union[IO[bytes], _GitFile]] = None,
762 contents: Optional[Union[bytes, "mmap.mmap"]] = None,
763 size: Optional[int] = None,
764 ) -> None:
765 """Create a pack index object.
767 Provide it with the name of the index file to consider, and it will map
768 it whenever required.
769 """
770 self._filename = filename
771 # Take the size now, so it can be checked each time we map the file to
772 # ensure that it hasn't changed.
773 if file is None:
774 self._file = GitFile(filename, "rb")
775 else:
776 self._file = file
777 if contents is None:
778 self._contents, self._size = _load_file_contents(self._file, size)
779 else:
780 self._contents = contents
781 self._size = size if size is not None else len(contents)
783 @property
784 def path(self) -> str:
785 """Return the path to this index file."""
786 return os.fspath(self._filename)
788 def __eq__(self, other: object) -> bool:
789 """Check equality with another FilePackIndex."""
790 # Quick optimization:
791 if (
792 isinstance(other, FilePackIndex)
793 and self._fan_out_table != other._fan_out_table
794 ):
795 return False
797 return super().__eq__(other)
799 def close(self) -> None:
800 """Close the underlying file and any mmap."""
801 self._file.close()
802 close_fn = getattr(self._contents, "close", None)
803 if close_fn is not None:
804 close_fn()
806 def __len__(self) -> int:
807 """Return the number of entries in this pack index."""
808 return self._fan_out_table[-1]
810 def _unpack_entry(self, i: int) -> PackIndexEntry:
811 """Unpack the i-th entry in the index file.
813 Returns: Tuple with object name (SHA), offset in pack file and CRC32
814 checksum (if known).
815 """
816 raise NotImplementedError(self._unpack_entry)
818 def _unpack_name(self, i: int) -> bytes:
819 """Unpack the i-th name from the index file."""
820 raise NotImplementedError(self._unpack_name)
822 def _unpack_offset(self, i: int) -> int:
823 """Unpack the i-th object offset from the index file."""
824 raise NotImplementedError(self._unpack_offset)
826 def _unpack_crc32_checksum(self, i: int) -> Optional[int]:
827 """Unpack the crc32 checksum for the ith object from the index file."""
828 raise NotImplementedError(self._unpack_crc32_checksum)
830 def _itersha(self) -> Iterator[bytes]:
831 """Iterate over all SHA1s in the index."""
832 for i in range(len(self)):
833 yield self._unpack_name(i)
835 def iterentries(self) -> Iterator[PackIndexEntry]:
836 """Iterate over the entries in this pack index.
838 Returns: iterator over tuples with object name, offset in packfile and
839 crc32 checksum.
840 """
841 for i in range(len(self)):
842 yield self._unpack_entry(i)
844 def _read_fan_out_table(self, start_offset: int) -> list[int]:
845 """Read the fan-out table from the index.
847 The fan-out table contains 256 entries mapping first byte values
848 to the number of objects with SHA1s less than or equal to that byte.
850 Args:
851 start_offset: Offset in the file where the fan-out table starts
852 Returns: List of 256 integers
853 """
854 ret = []
855 for i in range(0x100):
856 fanout_entry = self._contents[
857 start_offset + i * 4 : start_offset + (i + 1) * 4
858 ]
859 ret.append(struct.unpack(">L", fanout_entry)[0])
860 return ret
862 def check(self) -> None:
863 """Check that the stored checksum matches the actual checksum."""
864 actual = self.calculate_checksum()
865 stored = self.get_stored_checksum()
866 if actual != stored:
867 raise ChecksumMismatch(stored, actual)
869 def calculate_checksum(self) -> bytes:
870 """Calculate the SHA1 checksum over this pack index.
872 Returns: This is a 20-byte binary digest
873 """
874 return sha1(self._contents[:-20]).digest()
876 def get_pack_checksum(self) -> bytes:
877 """Return the SHA1 checksum stored for the corresponding packfile.
879 Returns: 20-byte binary digest
880 """
881 return bytes(self._contents[-40:-20])
883 def get_stored_checksum(self) -> bytes:
884 """Return the SHA1 checksum stored for this index.
886 Returns: 20-byte binary digest
887 """
888 return bytes(self._contents[-20:])
890 def object_offset(self, sha: bytes) -> int:
891 """Return the offset in to the corresponding packfile for the object.
893 Given the name of an object it will return the offset that object
894 lives at within the corresponding pack file. If the pack file doesn't
895 have the object then None will be returned.
896 """
897 if len(sha) == 40:
898 sha = hex_to_sha(sha)
899 try:
900 return self._object_offset(sha)
901 except ValueError as exc:
902 closed = getattr(self._contents, "closed", None)
903 if closed in (None, True):
904 raise PackFileDisappeared(self) from exc
905 raise
907 def _object_offset(self, sha: bytes) -> int:
908 """See object_offset.
910 Args:
911 sha: A *binary* SHA string. (20 characters long)_
912 """
913 assert len(sha) == 20
914 idx = ord(sha[:1])
915 if idx == 0:
916 start = 0
917 else:
918 start = self._fan_out_table[idx - 1]
919 end = self._fan_out_table[idx]
920 i = bisect_find_sha(start, end, sha, self._unpack_name)
921 if i is None:
922 raise KeyError(sha)
923 return self._unpack_offset(i)
925 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
926 """Iterate over all SHA1s with the given prefix."""
927 start = ord(prefix[:1])
928 if start == 0:
929 start = 0
930 else:
931 start = self._fan_out_table[start - 1]
932 end = ord(prefix[:1]) + 1
933 if end == 0x100:
934 end = len(self)
935 else:
936 end = self._fan_out_table[end]
937 assert start <= end
938 started = False
939 for i in range(start, end):
940 name: bytes = self._unpack_name(i)
941 if name.startswith(prefix):
942 yield name
943 started = True
944 elif started:
945 break
948class PackIndex1(FilePackIndex):
949 """Version 1 Pack Index file."""
951 def __init__(
952 self,
953 filename: Union[str, os.PathLike[str]],
954 file: Optional[Union[IO[bytes], _GitFile]] = None,
955 contents: Optional[bytes] = None,
956 size: Optional[int] = None,
957 ) -> None:
958 """Initialize a version 1 pack index.
960 Args:
961 filename: Path to the index file
962 file: Optional file object
963 contents: Optional mmap'd contents
964 size: Optional size of the index
965 """
966 super().__init__(filename, file, contents, size)
967 self.version = 1
968 self._fan_out_table = self._read_fan_out_table(0)
970 def _unpack_entry(self, i: int) -> tuple[bytes, int, None]:
971 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
972 return (name, offset, None)
974 def _unpack_name(self, i: int) -> bytes:
975 offset = (0x100 * 4) + (i * 24) + 4
976 return self._contents[offset : offset + 20]
978 def _unpack_offset(self, i: int) -> int:
979 offset = (0x100 * 4) + (i * 24)
980 result = unpack_from(">L", self._contents, offset)[0]
981 assert isinstance(result, int)
982 return result
984 def _unpack_crc32_checksum(self, i: int) -> None:
985 # Not stored in v1 index files
986 return None
989class PackIndex2(FilePackIndex):
990 """Version 2 Pack Index file."""
992 def __init__(
993 self,
994 filename: Union[str, os.PathLike[str]],
995 file: Optional[Union[IO[bytes], _GitFile]] = None,
996 contents: Optional[bytes] = None,
997 size: Optional[int] = None,
998 ) -> None:
999 """Initialize a version 2 pack index.
1001 Args:
1002 filename: Path to the index file
1003 file: Optional file object
1004 contents: Optional mmap'd contents
1005 size: Optional size of the index
1006 """
1007 super().__init__(filename, file, contents, size)
1008 if self._contents[:4] != b"\377tOc":
1009 raise AssertionError("Not a v2 pack index file")
1010 (self.version,) = unpack_from(b">L", self._contents, 4)
1011 if self.version != 2:
1012 raise AssertionError(f"Version was {self.version}")
1013 self._fan_out_table = self._read_fan_out_table(8)
1014 self._name_table_offset = 8 + 0x100 * 4
1015 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
1016 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
1017 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
1018 self
1019 )
1021 def _unpack_entry(self, i: int) -> tuple[bytes, int, int]:
1022 return (
1023 self._unpack_name(i),
1024 self._unpack_offset(i),
1025 self._unpack_crc32_checksum(i),
1026 )
1028 def _unpack_name(self, i: int) -> bytes:
1029 offset = self._name_table_offset + i * 20
1030 return self._contents[offset : offset + 20]
1032 def _unpack_offset(self, i: int) -> int:
1033 offset_pos = self._pack_offset_table_offset + i * 4
1034 offset = unpack_from(">L", self._contents, offset_pos)[0]
1035 assert isinstance(offset, int)
1036 if offset & (2**31):
1037 large_offset_pos = (
1038 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
1039 )
1040 offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
1041 assert isinstance(offset, int)
1042 return offset
1044 def _unpack_crc32_checksum(self, i: int) -> int:
1045 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
1046 assert isinstance(result, int)
1047 return result
1050class PackIndex3(FilePackIndex):
1051 """Version 3 Pack Index file.
1053 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
1054 """
1056 def __init__(
1057 self,
1058 filename: Union[str, os.PathLike[str]],
1059 file: Optional[Union[IO[bytes], _GitFile]] = None,
1060 contents: Optional[bytes] = None,
1061 size: Optional[int] = None,
1062 ) -> None:
1063 """Initialize a version 3 pack index.
1065 Args:
1066 filename: Path to the index file
1067 file: Optional file object
1068 contents: Optional mmap'd contents
1069 size: Optional size of the index
1070 """
1071 super().__init__(filename, file, contents, size)
1072 if self._contents[:4] != b"\377tOc":
1073 raise AssertionError("Not a v3 pack index file")
1074 (self.version,) = unpack_from(b">L", self._contents, 4)
1075 if self.version != 3:
1076 raise AssertionError(f"Version was {self.version}")
1078 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1079 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
1080 if self.hash_algorithm == 1:
1081 self.hash_size = 20 # SHA-1
1082 elif self.hash_algorithm == 2:
1083 self.hash_size = 32 # SHA-256
1084 else:
1085 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
1087 # Read length of shortened object names
1088 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
1090 # Calculate offsets based on variable hash size
1091 self._fan_out_table = self._read_fan_out_table(
1092 16
1093 ) # After header (4 + 4 + 4 + 4)
1094 self._name_table_offset = 16 + 0x100 * 4
1095 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
1096 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
1097 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
1098 self
1099 )
1101 def _unpack_entry(self, i: int) -> tuple[bytes, int, int]:
1102 return (
1103 self._unpack_name(i),
1104 self._unpack_offset(i),
1105 self._unpack_crc32_checksum(i),
1106 )
1108 def _unpack_name(self, i: int) -> bytes:
1109 offset = self._name_table_offset + i * self.hash_size
1110 return self._contents[offset : offset + self.hash_size]
1112 def _unpack_offset(self, i: int) -> int:
1113 offset_pos = self._pack_offset_table_offset + i * 4
1114 offset = unpack_from(">L", self._contents, offset_pos)[0]
1115 assert isinstance(offset, int)
1116 if offset & (2**31):
1117 large_offset_pos = (
1118 self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
1119 )
1120 offset = unpack_from(">Q", self._contents, large_offset_pos)[0]
1121 assert isinstance(offset, int)
1122 return offset
1124 def _unpack_crc32_checksum(self, i: int) -> int:
1125 result = unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
1126 assert isinstance(result, int)
1127 return result
1130def read_pack_header(read: Callable[[int], bytes]) -> tuple[int, int]:
1131 """Read the header of a pack file.
1133 Args:
1134 read: Read function
1135 Returns: Tuple of (pack version, number of objects). If no data is
1136 available to read, returns (None, None).
1137 """
1138 header = read(12)
1139 if not header:
1140 raise AssertionError("file too short to contain pack")
1141 if header[:4] != b"PACK":
1142 raise AssertionError(f"Invalid pack header {header!r}")
1143 (version,) = unpack_from(b">L", header, 4)
1144 if version not in (2, 3):
1145 raise AssertionError(f"Version was {version}")
1146 (num_objects,) = unpack_from(b">L", header, 8)
1147 return (version, num_objects)
1150def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
1151 """Get the total length of a sequence of chunks.
1153 Args:
1154 chunks: Either a single bytes object or an iterable of bytes
1155 Returns: Total length in bytes
1156 """
1157 if isinstance(chunks, bytes):
1158 return len(chunks)
1159 else:
1160 return sum(map(len, chunks))
1163def unpack_object(
1164 read_all: Callable[[int], bytes],
1165 read_some: Optional[Callable[[int], bytes]] = None,
1166 compute_crc32: bool = False,
1167 include_comp: bool = False,
1168 zlib_bufsize: int = _ZLIB_BUFSIZE,
1169) -> tuple[UnpackedObject, bytes]:
1170 """Unpack a Git object.
1172 Args:
1173 read_all: Read function that blocks until the number of requested
1174 bytes are read.
1175 read_some: Read function that returns at least one byte, but may not
1176 return the number of bytes requested.
1177 compute_crc32: If True, compute the CRC32 of the compressed data. If
1178 False, the returned CRC32 will be None.
1179 include_comp: If True, include compressed data in the result.
1180 zlib_bufsize: An optional buffer size for zlib operations.
1181 Returns: A tuple of (unpacked, unused), where unused is the unused data
1182 leftover from decompression, and unpacked in an UnpackedObject with
1183 the following attrs set:
1185 * obj_chunks (for non-delta types)
1186 * pack_type_num
1187 * delta_base (for delta types)
1188 * comp_chunks (if include_comp is True)
1189 * decomp_chunks
1190 * decomp_len
1191 * crc32 (if compute_crc32 is True)
1192 """
1193 if read_some is None:
1194 read_some = read_all
1195 if compute_crc32:
1196 crc32 = 0
1197 else:
1198 crc32 = None
1200 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
1201 type_num = (raw[0] >> 4) & 0x07
1202 size = raw[0] & 0x0F
1203 for i, byte in enumerate(raw[1:]):
1204 size += (byte & 0x7F) << ((i * 7) + 4)
1206 delta_base: Union[int, bytes, None]
1207 raw_base = len(raw)
1208 if type_num == OFS_DELTA:
1209 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
1210 raw_base += len(raw)
1211 if raw[-1] & 0x80:
1212 raise AssertionError
1213 delta_base_offset = raw[0] & 0x7F
1214 for byte in raw[1:]:
1215 delta_base_offset += 1
1216 delta_base_offset <<= 7
1217 delta_base_offset += byte & 0x7F
1218 delta_base = delta_base_offset
1219 elif type_num == REF_DELTA:
1220 delta_base_obj = read_all(20)
1221 if crc32 is not None:
1222 crc32 = binascii.crc32(delta_base_obj, crc32)
1223 delta_base = delta_base_obj
1224 raw_base += 20
1225 else:
1226 delta_base = None
1228 unpacked = UnpackedObject(
1229 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1230 )
1231 unused = read_zlib_chunks(
1232 read_some,
1233 unpacked,
1234 buffer_size=zlib_bufsize,
1235 include_comp=include_comp,
1236 )
1237 return unpacked, unused
1240def _compute_object_size(value: tuple[int, Any]) -> int:
1241 """Compute the size of a unresolved object for use with LRUSizeCache."""
1242 (num, obj) = value
1243 if num in DELTA_TYPES:
1244 return chunks_length(obj[1])
1245 return chunks_length(obj)
1248class PackStreamReader:
1249 """Class to read a pack stream.
1251 The pack is read from a ReceivableProtocol using read() or recv() as
1252 appropriate.
1253 """
1255 def __init__(
1256 self,
1257 read_all: Callable[[int], bytes],
1258 read_some: Optional[Callable[[int], bytes]] = None,
1259 zlib_bufsize: int = _ZLIB_BUFSIZE,
1260 ) -> None:
1261 """Initialize pack stream reader.
1263 Args:
1264 read_all: Function to read all requested bytes
1265 read_some: Function to read some bytes (optional)
1266 zlib_bufsize: Buffer size for zlib decompression
1267 """
1268 self.read_all = read_all
1269 if read_some is None:
1270 self.read_some = read_all
1271 else:
1272 self.read_some = read_some
1273 self.sha = sha1()
1274 self._offset = 0
1275 self._rbuf = BytesIO()
1276 # trailer is a deque to avoid memory allocation on small reads
1277 self._trailer: deque[int] = deque()
1278 self._zlib_bufsize = zlib_bufsize
1280 def _read(self, read: Callable[[int], bytes], size: int) -> bytes:
1281 """Read up to size bytes using the given callback.
1283 As a side effect, update the verifier's hash (excluding the last 20
1284 bytes read).
1286 Args:
1287 read: The read callback to read from.
1288 size: The maximum number of bytes to read; the particular
1289 behavior is callback-specific.
1290 Returns: Bytes read
1291 """
1292 data = read(size)
1294 # maintain a trailer of the last 20 bytes we've read
1295 n = len(data)
1296 self._offset += n
1297 tn = len(self._trailer)
1298 if n >= 20:
1299 to_pop = tn
1300 to_add = 20
1301 else:
1302 to_pop = max(n + tn - 20, 0)
1303 to_add = n
1304 self.sha.update(
1305 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1306 )
1307 self._trailer.extend(data[-to_add:])
1309 # hash everything but the trailer
1310 self.sha.update(data[:-to_add])
1311 return data
1313 def _buf_len(self) -> int:
1314 buf = self._rbuf
1315 start = buf.tell()
1316 buf.seek(0, SEEK_END)
1317 end = buf.tell()
1318 buf.seek(start)
1319 return end - start
1321 @property
1322 def offset(self) -> int:
1323 """Return current offset in the stream."""
1324 return self._offset - self._buf_len()
1326 def read(self, size: int) -> bytes:
1327 """Read, blocking until size bytes are read."""
1328 buf_len = self._buf_len()
1329 if buf_len >= size:
1330 return self._rbuf.read(size)
1331 buf_data = self._rbuf.read()
1332 self._rbuf = BytesIO()
1333 return buf_data + self._read(self.read_all, size - buf_len)
1335 def recv(self, size: int) -> bytes:
1336 """Read up to size bytes, blocking until one byte is read."""
1337 buf_len = self._buf_len()
1338 if buf_len:
1339 data = self._rbuf.read(size)
1340 if size >= buf_len:
1341 self._rbuf = BytesIO()
1342 return data
1343 return self._read(self.read_some, size)
1345 def __len__(self) -> int:
1346 """Return the number of objects in this pack."""
1347 return self._num_objects
1349 def read_objects(self, compute_crc32: bool = False) -> Iterator[UnpackedObject]:
1350 """Read the objects in this pack file.
1352 Args:
1353 compute_crc32: If True, compute the CRC32 of the compressed
1354 data. If False, the returned CRC32 will be None.
1355 Returns: Iterator over UnpackedObjects with the following members set:
1356 offset
1357 obj_type_num
1358 obj_chunks (for non-delta types)
1359 delta_base (for delta types)
1360 decomp_chunks
1361 decomp_len
1362 crc32 (if compute_crc32 is True)
1364 Raises:
1365 ChecksumMismatch: if the checksum of the pack contents does not
1366 match the checksum in the pack trailer.
1367 zlib.error: if an error occurred during zlib decompression.
1368 IOError: if an error occurred writing to the output file.
1369 """
1370 _pack_version, self._num_objects = read_pack_header(self.read)
1372 for _ in range(self._num_objects):
1373 offset = self.offset
1374 unpacked, unused = unpack_object(
1375 self.read,
1376 read_some=self.recv,
1377 compute_crc32=compute_crc32,
1378 zlib_bufsize=self._zlib_bufsize,
1379 )
1380 unpacked.offset = offset
1382 # prepend any unused data to current read buffer
1383 buf = BytesIO()
1384 buf.write(unused)
1385 buf.write(self._rbuf.read())
1386 buf.seek(0)
1387 self._rbuf = buf
1389 yield unpacked
1391 if self._buf_len() < 20:
1392 # If the read buffer is full, then the last read() got the whole
1393 # trailer off the wire. If not, it means there is still some of the
1394 # trailer to read. We need to read() all 20 bytes; N come from the
1395 # read buffer and (20 - N) come from the wire.
1396 self.read(20)
1398 pack_sha = bytearray(self._trailer)
1399 if pack_sha != self.sha.digest():
1400 raise ChecksumMismatch(sha_to_hex(bytes(pack_sha)), self.sha.hexdigest())
1403class PackStreamCopier(PackStreamReader):
1404 """Class to verify a pack stream as it is being read.
1406 The pack is read from a ReceivableProtocol using read() or recv() as
1407 appropriate and written out to the given file-like object.
1408 """
1410 def __init__(
1411 self,
1412 read_all: Callable[[int], bytes],
1413 read_some: Optional[Callable[[int], bytes]],
1414 outfile: IO[bytes],
1415 delta_iter: Optional["DeltaChainIterator[UnpackedObject]"] = None,
1416 ) -> None:
1417 """Initialize the copier.
1419 Args:
1420 read_all: Read function that blocks until the number of
1421 requested bytes are read.
1422 read_some: Read function that returns at least one byte, but may
1423 not return the number of bytes requested.
1424 outfile: File-like object to write output through.
1425 delta_iter: Optional DeltaChainIterator to record deltas as we
1426 read them.
1427 """
1428 super().__init__(read_all, read_some=read_some)
1429 self.outfile = outfile
1430 self._delta_iter = delta_iter
1432 def _read(self, read: Callable[[int], bytes], size: int) -> bytes:
1433 """Read data from the read callback and write it to the file."""
1434 data = super()._read(read, size)
1435 self.outfile.write(data)
1436 return data
1438 def verify(self, progress: Optional[Callable[..., None]] = None) -> None:
1439 """Verify a pack stream and write it to the output file.
1441 See PackStreamReader.iterobjects for a list of exceptions this may
1442 throw.
1443 """
1444 i = 0 # default count of entries if read_objects() is empty
1445 for i, unpacked in enumerate(self.read_objects()):
1446 if self._delta_iter:
1447 self._delta_iter.record(unpacked)
1448 if progress is not None:
1449 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1450 if progress is not None:
1451 progress(f"copied {i} pack entries\n".encode("ascii"))
1454def obj_sha(type: int, chunks: Union[bytes, Iterable[bytes]]) -> bytes:
1455 """Compute the SHA for a numeric type and object chunks."""
1456 sha = sha1()
1457 sha.update(object_header(type, chunks_length(chunks)))
1458 if isinstance(chunks, bytes):
1459 sha.update(chunks)
1460 else:
1461 for chunk in chunks:
1462 sha.update(chunk)
1463 return sha.digest()
1466def compute_file_sha(
1467 f: IO[bytes], start_ofs: int = 0, end_ofs: int = 0, buffer_size: int = 1 << 16
1468) -> "HashObject":
1469 """Hash a portion of a file into a new SHA.
1471 Args:
1472 f: A file-like object to read from that supports seek().
1473 start_ofs: The offset in the file to start reading at.
1474 end_ofs: The offset in the file to end reading at, relative to the
1475 end of the file.
1476 buffer_size: A buffer size for reading.
1477 Returns: A new SHA object updated with data read from the file.
1478 """
1479 sha = sha1()
1480 f.seek(0, SEEK_END)
1481 length = f.tell()
1482 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1483 raise AssertionError(
1484 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1485 )
1486 todo = length + end_ofs - start_ofs
1487 f.seek(start_ofs)
1488 while todo:
1489 data = f.read(min(todo, buffer_size))
1490 sha.update(data)
1491 todo -= len(data)
1492 return sha
1495class PackData:
1496 """The data contained in a packfile.
1498 Pack files can be accessed both sequentially for exploding a pack, and
1499 directly with the help of an index to retrieve a specific object.
1501 The objects within are either complete or a delta against another.
1503 The header is variable length. If the MSB of each byte is set then it
1504 indicates that the subsequent byte is still part of the header.
1505 For the first byte the next MS bits are the type, which tells you the type
1506 of object, and whether it is a delta. The LS byte is the lowest bits of the
1507 size. For each subsequent byte the LS 7 bits are the next MS bits of the
1508 size, i.e. the last byte of the header contains the MS bits of the size.
1510 For the complete objects the data is stored as zlib deflated data.
1511 The size in the header is the uncompressed object size, so to uncompress
1512 you need to just keep feeding data to zlib until you get an object back,
1513 or it errors on bad data. This is done here by just giving the complete
1514 buffer from the start of the deflated object on. This is bad, but until I
1515 get mmap sorted out it will have to do.
1517 Currently there are no integrity checks done. Also no attempt is made to
1518 try and detect the delta case, or a request for an object at the wrong
1519 position. It will all just throw a zlib or KeyError.
1520 """
1522 def __init__(
1523 self,
1524 filename: Union[str, os.PathLike[str]],
1525 file: Optional[IO[bytes]] = None,
1526 size: Optional[int] = None,
1527 *,
1528 delta_window_size: Optional[int] = None,
1529 window_memory: Optional[int] = None,
1530 delta_cache_size: Optional[int] = None,
1531 depth: Optional[int] = None,
1532 threads: Optional[int] = None,
1533 big_file_threshold: Optional[int] = None,
1534 ) -> None:
1535 """Create a PackData object representing the pack in the given filename.
1537 The file must exist and stay readable until the object is disposed of.
1538 It must also stay the same size. It will be mapped whenever needed.
1540 Currently there is a restriction on the size of the pack as the python
1541 mmap implementation is flawed.
1542 """
1543 self._filename = filename
1544 self._size = size
1545 self._header_size = 12
1546 self.delta_window_size = delta_window_size
1547 self.window_memory = window_memory
1548 self.delta_cache_size = delta_cache_size
1549 self.depth = depth
1550 self.threads = threads
1551 self.big_file_threshold = big_file_threshold
1552 self._file: IO[bytes]
1554 if file is None:
1555 self._file = GitFile(self._filename, "rb")
1556 else:
1557 self._file = file
1558 (_version, self._num_objects) = read_pack_header(self._file.read)
1560 # Use delta_cache_size config if available, otherwise default
1561 cache_size = delta_cache_size or (1024 * 1024 * 20)
1562 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1563 cache_size, compute_size=_compute_object_size
1564 )
1566 @property
1567 def filename(self) -> str:
1568 """Get the filename of the pack file.
1570 Returns:
1571 Base filename without directory path
1572 """
1573 return os.path.basename(self._filename)
1575 @property
1576 def path(self) -> Union[str, os.PathLike[str]]:
1577 """Get the full path of the pack file.
1579 Returns:
1580 Full path to the pack file
1581 """
1582 return self._filename
1584 @classmethod
1585 def from_file(cls, file: IO[bytes], size: Optional[int] = None) -> "PackData":
1586 """Create a PackData object from an open file.
1588 Args:
1589 file: Open file object
1590 size: Optional file size
1592 Returns:
1593 PackData instance
1594 """
1595 return cls(str(file), file=file, size=size)
1597 @classmethod
1598 def from_path(cls, path: Union[str, os.PathLike[str]]) -> "PackData":
1599 """Create a PackData object from a file path.
1601 Args:
1602 path: Path to the pack file
1604 Returns:
1605 PackData instance
1606 """
1607 return cls(filename=path)
1609 def close(self) -> None:
1610 """Close the underlying pack file."""
1611 self._file.close()
1613 def __enter__(self) -> "PackData":
1614 """Enter context manager."""
1615 return self
1617 def __exit__(
1618 self,
1619 exc_type: Optional[type],
1620 exc_val: Optional[BaseException],
1621 exc_tb: Optional[TracebackType],
1622 ) -> None:
1623 """Exit context manager."""
1624 self.close()
1626 def __eq__(self, other: object) -> bool:
1627 """Check equality with another object."""
1628 if isinstance(other, PackData):
1629 return self.get_stored_checksum() == other.get_stored_checksum()
1630 return False
1632 def _get_size(self) -> int:
1633 if self._size is not None:
1634 return self._size
1635 self._size = os.path.getsize(self._filename)
1636 if self._size < self._header_size:
1637 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1638 raise AssertionError(errmsg)
1639 return self._size
1641 def __len__(self) -> int:
1642 """Returns the number of objects in this pack."""
1643 return self._num_objects
1645 def calculate_checksum(self) -> bytes:
1646 """Calculate the checksum for this pack.
1648 Returns: 20-byte binary SHA1 digest
1649 """
1650 return compute_file_sha(self._file, end_ofs=-20).digest()
1652 def iter_unpacked(self, *, include_comp: bool = False) -> Iterator[UnpackedObject]:
1653 """Iterate over unpacked objects in the pack."""
1654 self._file.seek(self._header_size)
1656 if self._num_objects is None:
1657 return
1659 for _ in range(self._num_objects):
1660 offset = self._file.tell()
1661 unpacked, unused = unpack_object(
1662 self._file.read, compute_crc32=False, include_comp=include_comp
1663 )
1664 unpacked.offset = offset
1665 yield unpacked
1666 # Back up over unused data.
1667 self._file.seek(-len(unused), SEEK_CUR)
1669 def iterentries(
1670 self,
1671 progress: Optional[Callable[[int, int], None]] = None,
1672 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1673 ) -> Iterator[tuple[bytes, int, Optional[int]]]:
1674 """Yield entries summarizing the contents of this pack.
1676 Args:
1677 progress: Progress function, called with current and total
1678 object count.
1679 resolve_ext_ref: Optional function to resolve external references
1680 Returns: iterator of tuples with (sha, offset, crc32)
1681 """
1682 num_objects = self._num_objects
1683 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1684 for i, result in enumerate(indexer):
1685 if progress is not None:
1686 progress(i, num_objects)
1687 yield result
1689 def sorted_entries(
1690 self,
1691 progress: Optional[ProgressFn] = None,
1692 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1693 ) -> list[tuple[bytes, int, int]]:
1694 """Return entries in this pack, sorted by SHA.
1696 Args:
1697 progress: Progress function, called with current and total
1698 object count
1699 resolve_ext_ref: Optional function to resolve external references
1700 Returns: Iterator of tuples with (sha, offset, crc32)
1701 """
1702 return sorted(
1703 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref) # type: ignore
1704 )
1706 def create_index_v1(
1707 self,
1708 filename: str,
1709 progress: Optional[Callable[..., None]] = None,
1710 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1711 ) -> bytes:
1712 """Create a version 1 file for this data file.
1714 Args:
1715 filename: Index filename.
1716 progress: Progress report function
1717 resolve_ext_ref: Optional function to resolve external references
1718 Returns: Checksum of index file
1719 """
1720 entries = self.sorted_entries(
1721 progress=progress, resolve_ext_ref=resolve_ext_ref
1722 )
1723 checksum = self.calculate_checksum()
1724 with GitFile(filename, "wb") as f:
1725 write_pack_index_v1(
1726 f,
1727 entries,
1728 checksum,
1729 )
1730 return checksum
1732 def create_index_v2(
1733 self,
1734 filename: str,
1735 progress: Optional[Callable[..., None]] = None,
1736 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1737 ) -> bytes:
1738 """Create a version 2 index file for this data file.
1740 Args:
1741 filename: Index filename.
1742 progress: Progress report function
1743 resolve_ext_ref: Optional function to resolve external references
1744 Returns: Checksum of index file
1745 """
1746 entries = self.sorted_entries(
1747 progress=progress, resolve_ext_ref=resolve_ext_ref
1748 )
1749 with GitFile(filename, "wb") as f:
1750 return write_pack_index_v2(f, entries, self.calculate_checksum())
1752 def create_index_v3(
1753 self,
1754 filename: str,
1755 progress: Optional[Callable[..., None]] = None,
1756 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1757 hash_algorithm: int = 1,
1758 ) -> bytes:
1759 """Create a version 3 index file for this data file.
1761 Args:
1762 filename: Index filename.
1763 progress: Progress report function
1764 resolve_ext_ref: Function to resolve external references
1765 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1766 Returns: Checksum of index file
1767 """
1768 entries = self.sorted_entries(
1769 progress=progress, resolve_ext_ref=resolve_ext_ref
1770 )
1771 with GitFile(filename, "wb") as f:
1772 return write_pack_index_v3(
1773 f, entries, self.calculate_checksum(), hash_algorithm
1774 )
1776 def create_index(
1777 self,
1778 filename: str,
1779 progress: Optional[Callable[..., None]] = None,
1780 version: int = 2,
1781 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1782 hash_algorithm: int = 1,
1783 ) -> bytes:
1784 """Create an index file for this data file.
1786 Args:
1787 filename: Index filename.
1788 progress: Progress report function
1789 version: Index version (1, 2, or 3)
1790 resolve_ext_ref: Function to resolve external references
1791 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1792 Returns: Checksum of index file
1793 """
1794 if version == 1:
1795 return self.create_index_v1(
1796 filename, progress, resolve_ext_ref=resolve_ext_ref
1797 )
1798 elif version == 2:
1799 return self.create_index_v2(
1800 filename, progress, resolve_ext_ref=resolve_ext_ref
1801 )
1802 elif version == 3:
1803 return self.create_index_v3(
1804 filename,
1805 progress,
1806 resolve_ext_ref=resolve_ext_ref,
1807 hash_algorithm=hash_algorithm,
1808 )
1809 else:
1810 raise ValueError(f"unknown index format {version}")
1812 def get_stored_checksum(self) -> bytes:
1813 """Return the expected checksum stored in this pack."""
1814 self._file.seek(-20, SEEK_END)
1815 return self._file.read(20)
1817 def check(self) -> None:
1818 """Check the consistency of this pack."""
1819 actual = self.calculate_checksum()
1820 stored = self.get_stored_checksum()
1821 if actual != stored:
1822 raise ChecksumMismatch(stored, actual)
1824 def get_unpacked_object_at(
1825 self, offset: int, *, include_comp: bool = False
1826 ) -> UnpackedObject:
1827 """Given offset in the packfile return a UnpackedObject."""
1828 assert offset >= self._header_size
1829 self._file.seek(offset)
1830 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1831 unpacked.offset = offset
1832 return unpacked
1834 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1835 """Given an offset in to the packfile return the object that is there.
1837 Using the associated index the location of an object can be looked up,
1838 and then the packfile can be asked directly for that object using this
1839 function.
1840 """
1841 try:
1842 return self._offset_cache[offset]
1843 except KeyError:
1844 pass
1845 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1846 return (unpacked.pack_type_num, unpacked._obj())
1849T = TypeVar("T")
1852class DeltaChainIterator(Generic[T]):
1853 """Abstract iterator over pack data based on delta chains.
1855 Each object in the pack is guaranteed to be inflated exactly once,
1856 regardless of how many objects reference it as a delta base. As a result,
1857 memory usage is proportional to the length of the longest delta chain.
1859 Subclasses can override _result to define the result type of the iterator.
1860 By default, results are UnpackedObjects with the following members set:
1862 * offset
1863 * obj_type_num
1864 * obj_chunks
1865 * pack_type_num
1866 * delta_base (for delta types)
1867 * comp_chunks (if _include_comp is True)
1868 * decomp_chunks
1869 * decomp_len
1870 * crc32 (if _compute_crc32 is True)
1871 """
1873 _compute_crc32 = False
1874 _include_comp = False
1876 def __init__(
1877 self,
1878 file_obj: Optional[IO[bytes]],
1879 *,
1880 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1881 ) -> None:
1882 """Initialize DeltaChainIterator.
1884 Args:
1885 file_obj: File object to read pack data from
1886 resolve_ext_ref: Optional function to resolve external references
1887 """
1888 self._file = file_obj
1889 self._resolve_ext_ref = resolve_ext_ref
1890 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1891 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1892 self._full_ofs: list[tuple[int, int]] = []
1893 self._ext_refs: list[bytes] = []
1895 @classmethod
1896 def for_pack_data(
1897 cls, pack_data: PackData, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1898 ) -> "DeltaChainIterator[T]":
1899 """Create a DeltaChainIterator from pack data.
1901 Args:
1902 pack_data: PackData object to iterate
1903 resolve_ext_ref: Optional function to resolve external refs
1905 Returns:
1906 DeltaChainIterator instance
1907 """
1908 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1909 walker.set_pack_data(pack_data)
1910 for unpacked in pack_data.iter_unpacked(include_comp=False):
1911 walker.record(unpacked)
1912 return walker
1914 @classmethod
1915 def for_pack_subset(
1916 cls,
1917 pack: "Pack",
1918 shas: Iterable[bytes],
1919 *,
1920 allow_missing: bool = False,
1921 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1922 ) -> "DeltaChainIterator[T]":
1923 """Create a DeltaChainIterator for a subset of objects.
1925 Args:
1926 pack: Pack object containing the data
1927 shas: Iterable of object SHAs to include
1928 allow_missing: If True, skip missing objects
1929 resolve_ext_ref: Optional function to resolve external refs
1931 Returns:
1932 DeltaChainIterator instance
1933 """
1934 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1935 walker.set_pack_data(pack.data)
1936 todo = set()
1937 for sha in shas:
1938 assert isinstance(sha, bytes)
1939 try:
1940 off = pack.index.object_offset(sha)
1941 except KeyError:
1942 if not allow_missing:
1943 raise
1944 else:
1945 todo.add(off)
1946 done = set()
1947 while todo:
1948 off = todo.pop()
1949 unpacked = pack.data.get_unpacked_object_at(off)
1950 walker.record(unpacked)
1951 done.add(off)
1952 base_ofs = None
1953 if unpacked.pack_type_num == OFS_DELTA:
1954 assert unpacked.offset is not None
1955 assert unpacked.delta_base is not None
1956 assert isinstance(unpacked.delta_base, int)
1957 base_ofs = unpacked.offset - unpacked.delta_base
1958 elif unpacked.pack_type_num == REF_DELTA:
1959 with suppress(KeyError):
1960 assert isinstance(unpacked.delta_base, bytes)
1961 base_ofs = pack.index.object_index(unpacked.delta_base)
1962 if base_ofs is not None and base_ofs not in done:
1963 todo.add(base_ofs)
1964 return walker
1966 def record(self, unpacked: UnpackedObject) -> None:
1967 """Record an unpacked object for later processing.
1969 Args:
1970 unpacked: UnpackedObject to record
1971 """
1972 type_num = unpacked.pack_type_num
1973 offset = unpacked.offset
1974 assert offset is not None
1975 if type_num == OFS_DELTA:
1976 assert unpacked.delta_base is not None
1977 assert isinstance(unpacked.delta_base, int)
1978 base_offset = offset - unpacked.delta_base
1979 self._pending_ofs[base_offset].append(offset)
1980 elif type_num == REF_DELTA:
1981 assert isinstance(unpacked.delta_base, bytes)
1982 self._pending_ref[unpacked.delta_base].append(offset)
1983 else:
1984 self._full_ofs.append((offset, type_num))
1986 def set_pack_data(self, pack_data: PackData) -> None:
1987 """Set the pack data for iteration.
1989 Args:
1990 pack_data: PackData object to use
1991 """
1992 self._file = pack_data._file
1994 def _walk_all_chains(self) -> Iterator[T]:
1995 for offset, type_num in self._full_ofs:
1996 yield from self._follow_chain(offset, type_num, None)
1997 yield from self._walk_ref_chains()
1998 assert not self._pending_ofs, repr(self._pending_ofs)
2000 def _ensure_no_pending(self) -> None:
2001 if self._pending_ref:
2002 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
2004 def _walk_ref_chains(self) -> Iterator[T]:
2005 if not self._resolve_ext_ref:
2006 self._ensure_no_pending()
2007 return
2009 for base_sha, pending in sorted(self._pending_ref.items()):
2010 if base_sha not in self._pending_ref:
2011 continue
2012 try:
2013 type_num, chunks = self._resolve_ext_ref(base_sha)
2014 except KeyError:
2015 # Not an external ref, but may depend on one. Either it will
2016 # get popped via a _follow_chain call, or we will raise an
2017 # error below.
2018 continue
2019 self._ext_refs.append(base_sha)
2020 self._pending_ref.pop(base_sha)
2021 for new_offset in pending:
2022 yield from self._follow_chain(new_offset, type_num, chunks) # type: ignore[arg-type]
2024 self._ensure_no_pending()
2026 def _result(self, unpacked: UnpackedObject) -> T:
2027 raise NotImplementedError
2029 def _resolve_object(
2030 self, offset: int, obj_type_num: int, base_chunks: Optional[list[bytes]]
2031 ) -> UnpackedObject:
2032 assert self._file is not None
2033 self._file.seek(offset)
2034 unpacked, _ = unpack_object(
2035 self._file.read,
2036 include_comp=self._include_comp,
2037 compute_crc32=self._compute_crc32,
2038 )
2039 unpacked.offset = offset
2040 if base_chunks is None:
2041 assert unpacked.pack_type_num == obj_type_num
2042 else:
2043 assert unpacked.pack_type_num in DELTA_TYPES
2044 unpacked.obj_type_num = obj_type_num
2045 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
2046 return unpacked
2048 def _follow_chain(
2049 self, offset: int, obj_type_num: int, base_chunks: Optional[list[bytes]]
2050 ) -> Iterator[T]:
2051 # Unlike PackData.get_object_at, there is no need to cache offsets as
2052 # this approach by design inflates each object exactly once.
2053 todo = [(offset, obj_type_num, base_chunks)]
2054 while todo:
2055 (offset, obj_type_num, base_chunks) = todo.pop()
2056 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
2057 yield self._result(unpacked)
2059 assert unpacked.offset is not None
2060 unblocked = chain(
2061 self._pending_ofs.pop(unpacked.offset, []),
2062 self._pending_ref.pop(unpacked.sha(), []),
2063 )
2064 todo.extend(
2065 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
2066 for new_offset in unblocked
2067 )
2069 def __iter__(self) -> Iterator[T]:
2070 """Iterate over objects in the pack."""
2071 return self._walk_all_chains()
2073 def ext_refs(self) -> list[bytes]:
2074 """Return external references."""
2075 return self._ext_refs
2078class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
2079 """Delta chain iterator that yield unpacked objects."""
2081 def _result(self, unpacked: UnpackedObject) -> UnpackedObject:
2082 """Return the unpacked object.
2084 Args:
2085 unpacked: The unpacked object
2087 Returns:
2088 The unpacked object unchanged
2089 """
2090 return unpacked
2093class PackIndexer(DeltaChainIterator[PackIndexEntry]):
2094 """Delta chain iterator that yields index entries."""
2096 _compute_crc32 = True
2098 def _result(self, unpacked: UnpackedObject) -> tuple[bytes, int, Optional[int]]:
2099 """Convert unpacked object to pack index entry.
2101 Args:
2102 unpacked: The unpacked object
2104 Returns:
2105 Tuple of (sha, offset, crc32) for index entry
2106 """
2107 assert unpacked.offset is not None
2108 return unpacked.sha(), unpacked.offset, unpacked.crc32
2111class PackInflater(DeltaChainIterator[ShaFile]):
2112 """Delta chain iterator that yields ShaFile objects."""
2114 def _result(self, unpacked: UnpackedObject) -> ShaFile:
2115 """Convert unpacked object to ShaFile.
2117 Args:
2118 unpacked: The unpacked object
2120 Returns:
2121 ShaFile object from the unpacked data
2122 """
2123 return unpacked.sha_file()
2126class SHA1Reader(BinaryIO):
2127 """Wrapper for file-like object that remembers the SHA1 of its data."""
2129 def __init__(self, f: IO[bytes]) -> None:
2130 """Initialize SHA1Reader.
2132 Args:
2133 f: File-like object to wrap
2134 """
2135 self.f = f
2136 self.sha1 = sha1(b"")
2138 def read(self, size: int = -1) -> bytes:
2139 """Read bytes and update SHA1.
2141 Args:
2142 size: Number of bytes to read, -1 for all
2144 Returns:
2145 Bytes read from file
2146 """
2147 data = self.f.read(size)
2148 self.sha1.update(data)
2149 return data
2151 def check_sha(self, allow_empty: bool = False) -> None:
2152 """Check if the SHA1 matches the expected value.
2154 Args:
2155 allow_empty: Allow empty SHA1 hash
2157 Raises:
2158 ChecksumMismatch: If SHA1 doesn't match
2159 """
2160 stored = self.f.read(20)
2161 # If git option index.skipHash is set the index will be empty
2162 if stored != self.sha1.digest() and (
2163 not allow_empty
2164 or sha_to_hex(stored) != b"0000000000000000000000000000000000000000"
2165 ):
2166 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
2168 def close(self) -> None:
2169 """Close the underlying file."""
2170 return self.f.close()
2172 def tell(self) -> int:
2173 """Return current file position."""
2174 return self.f.tell()
2176 # BinaryIO abstract methods
2177 def readable(self) -> bool:
2178 """Check if file is readable."""
2179 return True
2181 def writable(self) -> bool:
2182 """Check if file is writable."""
2183 return False
2185 def seekable(self) -> bool:
2186 """Check if file is seekable."""
2187 return getattr(self.f, "seekable", lambda: False)()
2189 def seek(self, offset: int, whence: int = 0) -> int:
2190 """Seek to position in file.
2192 Args:
2193 offset: Position offset
2194 whence: Reference point (0=start, 1=current, 2=end)
2196 Returns:
2197 New file position
2198 """
2199 return self.f.seek(offset, whence)
2201 def flush(self) -> None:
2202 """Flush the file buffer."""
2203 if hasattr(self.f, "flush"):
2204 self.f.flush()
2206 def readline(self, size: int = -1) -> bytes:
2207 """Read a line from the file.
2209 Args:
2210 size: Maximum bytes to read
2212 Returns:
2213 Line read from file
2214 """
2215 return self.f.readline(size)
2217 def readlines(self, hint: int = -1) -> list[bytes]:
2218 """Read all lines from the file.
2220 Args:
2221 hint: Approximate number of bytes to read
2223 Returns:
2224 List of lines
2225 """
2226 return self.f.readlines(hint)
2228 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override]
2229 """Write multiple lines to the file (not supported)."""
2230 raise UnsupportedOperation("writelines")
2232 def write(self, data: bytes, /) -> int: # type: ignore[override]
2233 """Write data to the file (not supported)."""
2234 raise UnsupportedOperation("write")
2236 def __enter__(self) -> "SHA1Reader":
2237 """Enter context manager."""
2238 return self
2240 def __exit__(
2241 self,
2242 type: Optional[type],
2243 value: Optional[BaseException],
2244 traceback: Optional[TracebackType],
2245 ) -> None:
2246 """Exit context manager and close file."""
2247 self.close()
2249 def __iter__(self) -> "SHA1Reader":
2250 """Return iterator for reading file lines."""
2251 return self
2253 def __next__(self) -> bytes:
2254 """Get next line from file.
2256 Returns:
2257 Next line
2259 Raises:
2260 StopIteration: When no more lines
2261 """
2262 line = self.readline()
2263 if not line:
2264 raise StopIteration
2265 return line
2267 def fileno(self) -> int:
2268 """Return file descriptor number."""
2269 return self.f.fileno()
2271 def isatty(self) -> bool:
2272 """Check if file is a terminal."""
2273 return getattr(self.f, "isatty", lambda: False)()
2275 def truncate(self, size: Optional[int] = None) -> int:
2276 """Not supported for read-only file.
2278 Raises:
2279 UnsupportedOperation: Always raised
2280 """
2281 raise UnsupportedOperation("truncate")
2284class SHA1Writer(BinaryIO):
2285 """Wrapper for file-like object that remembers the SHA1 of its data."""
2287 def __init__(self, f: Union[BinaryIO, IO[bytes]]) -> None:
2288 """Initialize SHA1Writer.
2290 Args:
2291 f: File-like object to wrap
2292 """
2293 self.f = f
2294 self.length = 0
2295 self.sha1 = sha1(b"")
2296 self.digest: Optional[bytes] = None
2298 def write(self, data: Union[bytes, bytearray, memoryview], /) -> int: # type: ignore[override]
2299 """Write data and update SHA1.
2301 Args:
2302 data: Data to write
2304 Returns:
2305 Number of bytes written
2306 """
2307 self.sha1.update(data)
2308 written = self.f.write(data)
2309 self.length += written
2310 return written
2312 def write_sha(self) -> bytes:
2313 """Write the SHA1 digest to the file.
2315 Returns:
2316 The SHA1 digest bytes
2317 """
2318 sha = self.sha1.digest()
2319 assert len(sha) == 20
2320 self.f.write(sha)
2321 self.length += len(sha)
2322 return sha
2324 def close(self) -> None:
2325 """Close the pack file and finalize the SHA."""
2326 self.digest = self.write_sha()
2327 self.f.close()
2329 def offset(self) -> int:
2330 """Get the total number of bytes written.
2332 Returns:
2333 Total bytes written
2334 """
2335 return self.length
2337 def tell(self) -> int:
2338 """Return current file position."""
2339 return self.f.tell()
2341 # BinaryIO abstract methods
2342 def readable(self) -> bool:
2343 """Check if file is readable."""
2344 return False
2346 def writable(self) -> bool:
2347 """Check if file is writable."""
2348 return True
2350 def seekable(self) -> bool:
2351 """Check if file is seekable."""
2352 return getattr(self.f, "seekable", lambda: False)()
2354 def seek(self, offset: int, whence: int = 0) -> int:
2355 """Seek to position in file.
2357 Args:
2358 offset: Position offset
2359 whence: Reference point (0=start, 1=current, 2=end)
2361 Returns:
2362 New file position
2363 """
2364 return self.f.seek(offset, whence)
2366 def flush(self) -> None:
2367 """Flush the file buffer."""
2368 if hasattr(self.f, "flush"):
2369 self.f.flush()
2371 def readline(self, size: int = -1) -> bytes:
2372 """Not supported for write-only file.
2374 Raises:
2375 UnsupportedOperation: Always raised
2376 """
2377 raise UnsupportedOperation("readline")
2379 def readlines(self, hint: int = -1) -> list[bytes]:
2380 """Not supported for write-only file.
2382 Raises:
2383 UnsupportedOperation: Always raised
2384 """
2385 raise UnsupportedOperation("readlines")
2387 def writelines(self, lines: Iterable[bytes], /) -> None: # type: ignore[override]
2388 """Write multiple lines to the file.
2390 Args:
2391 lines: Iterable of lines to write
2392 """
2393 for line in lines:
2394 self.write(line)
2396 def read(self, size: int = -1) -> bytes:
2397 """Not supported for write-only file.
2399 Raises:
2400 UnsupportedOperation: Always raised
2401 """
2402 raise UnsupportedOperation("read")
2404 def __enter__(self) -> "SHA1Writer":
2405 """Enter context manager."""
2406 return self
2408 def __exit__(
2409 self,
2410 type: Optional[type],
2411 value: Optional[BaseException],
2412 traceback: Optional[TracebackType],
2413 ) -> None:
2414 """Exit context manager and close file."""
2415 self.close()
2417 def __iter__(self) -> "SHA1Writer":
2418 """Return iterator."""
2419 return self
2421 def __next__(self) -> bytes:
2422 """Not supported for write-only file.
2424 Raises:
2425 UnsupportedOperation: Always raised
2426 """
2427 raise UnsupportedOperation("__next__")
2429 def fileno(self) -> int:
2430 """Return file descriptor number."""
2431 return self.f.fileno()
2433 def isatty(self) -> bool:
2434 """Check if file is a terminal."""
2435 return getattr(self.f, "isatty", lambda: False)()
2437 def truncate(self, size: Optional[int] = None) -> int:
2438 """Not supported for write-only file.
2440 Raises:
2441 UnsupportedOperation: Always raised
2442 """
2443 raise UnsupportedOperation("truncate")
2446def pack_object_header(
2447 type_num: int, delta_base: Optional[Union[bytes, int]], size: int
2448) -> bytearray:
2449 """Create a pack object header for the given object info.
2451 Args:
2452 type_num: Numeric type of the object.
2453 delta_base: Delta base offset or ref, or None for whole objects.
2454 size: Uncompressed object size.
2455 Returns: A header for a packed object.
2456 """
2457 header = []
2458 c = (type_num << 4) | (size & 15)
2459 size >>= 4
2460 while size:
2461 header.append(c | 0x80)
2462 c = size & 0x7F
2463 size >>= 7
2464 header.append(c)
2465 if type_num == OFS_DELTA:
2466 assert isinstance(delta_base, int)
2467 ret = [delta_base & 0x7F]
2468 delta_base >>= 7
2469 while delta_base:
2470 delta_base -= 1
2471 ret.insert(0, 0x80 | (delta_base & 0x7F))
2472 delta_base >>= 7
2473 header.extend(ret)
2474 elif type_num == REF_DELTA:
2475 assert isinstance(delta_base, bytes)
2476 assert len(delta_base) == 20
2477 header += delta_base
2478 return bytearray(header)
2481def pack_object_chunks(
2482 type: int,
2483 object: Union[list[bytes], tuple[Union[bytes, int], list[bytes]]],
2484 compression_level: int = -1,
2485) -> Iterator[bytes]:
2486 """Generate chunks for a pack object.
2488 Args:
2489 type: Numeric type of the object
2490 object: Object to write
2491 compression_level: the zlib compression level
2492 Returns: Chunks
2493 """
2494 if type in DELTA_TYPES:
2495 if isinstance(object, tuple):
2496 delta_base, object = object
2497 else:
2498 raise TypeError("Delta types require a tuple of (delta_base, object)")
2499 else:
2500 delta_base = None
2502 # Convert object to list of bytes chunks
2503 if isinstance(object, bytes):
2504 chunks = [object]
2505 elif isinstance(object, list):
2506 chunks = object
2507 elif isinstance(object, ShaFile):
2508 chunks = object.as_raw_chunks()
2509 else:
2510 # Shouldn't reach here with proper typing
2511 raise TypeError(f"Unexpected object type: {object.__class__.__name__}")
2513 yield bytes(pack_object_header(type, delta_base, sum(map(len, chunks))))
2514 compressor = zlib.compressobj(level=compression_level)
2515 for data in chunks:
2516 yield compressor.compress(data)
2517 yield compressor.flush()
2520def write_pack_object(
2521 write: Callable[[bytes], int],
2522 type: int,
2523 object: Union[list[bytes], tuple[Union[bytes, int], list[bytes]]],
2524 sha: Optional["HashObject"] = None,
2525 compression_level: int = -1,
2526) -> int:
2527 """Write pack object to a file.
2529 Args:
2530 write: Write function to use
2531 type: Numeric type of the object
2532 object: Object to write
2533 sha: Optional SHA-1 hasher to update
2534 compression_level: the zlib compression level
2535 Returns: CRC32 checksum of the written object
2536 """
2537 crc32 = 0
2538 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
2539 write(chunk)
2540 if sha is not None:
2541 sha.update(chunk)
2542 crc32 = binascii.crc32(chunk, crc32)
2543 return crc32 & 0xFFFFFFFF
2546def write_pack(
2547 filename: str,
2548 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2549 *,
2550 deltify: Optional[bool] = None,
2551 delta_window_size: Optional[int] = None,
2552 compression_level: int = -1,
2553) -> tuple[bytes, bytes]:
2554 """Write a new pack data file.
2556 Args:
2557 filename: Path to the new pack file (without .pack extension)
2558 objects: Objects to write to the pack
2559 delta_window_size: Delta window size
2560 deltify: Whether to deltify pack objects
2561 compression_level: the zlib compression level
2562 Returns: Tuple with checksum of pack file and index file
2563 """
2564 with GitFile(filename + ".pack", "wb") as f:
2565 entries, data_sum = write_pack_objects(
2566 f,
2567 objects,
2568 delta_window_size=delta_window_size,
2569 deltify=deltify,
2570 compression_level=compression_level,
2571 )
2572 entries_list = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
2573 with GitFile(filename + ".idx", "wb") as f:
2574 idx_sha = write_pack_index(f, entries_list, data_sum)
2575 return data_sum, idx_sha
2578def pack_header_chunks(num_objects: int) -> Iterator[bytes]:
2579 """Yield chunks for a pack header."""
2580 yield b"PACK" # Pack header
2581 yield struct.pack(b">L", 2) # Pack version
2582 yield struct.pack(b">L", num_objects) # Number of objects in pack
2585def write_pack_header(
2586 write: Union[Callable[[bytes], int], IO[bytes]], num_objects: int
2587) -> None:
2588 """Write a pack header for the given number of objects."""
2589 write_fn: Callable[[bytes], int]
2590 if hasattr(write, "write"):
2591 write_fn = write.write
2592 warnings.warn(
2593 "write_pack_header() now takes a write rather than file argument",
2594 DeprecationWarning,
2595 stacklevel=2,
2596 )
2597 else:
2598 write_fn = write
2599 for chunk in pack_header_chunks(num_objects):
2600 write_fn(chunk)
2603def find_reusable_deltas(
2604 container: PackedObjectContainer,
2605 object_ids: Set[bytes],
2606 *,
2607 other_haves: Optional[Set[bytes]] = None,
2608 progress: Optional[Callable[..., None]] = None,
2609) -> Iterator[UnpackedObject]:
2610 """Find deltas in a pack that can be reused.
2612 Args:
2613 container: Pack container to search for deltas
2614 object_ids: Set of object IDs to find deltas for
2615 other_haves: Set of other object IDs we have
2616 progress: Optional progress reporting callback
2618 Returns:
2619 Iterator of UnpackedObject entries that can be reused
2620 """
2621 if other_haves is None:
2622 other_haves = set()
2623 reused = 0
2624 for i, unpacked in enumerate(
2625 container.iter_unpacked_subset(
2626 object_ids, allow_missing=True, convert_ofs_delta=True
2627 )
2628 ):
2629 if progress is not None and i % 1000 == 0:
2630 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2631 if unpacked.pack_type_num == REF_DELTA:
2632 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2633 if hexsha in object_ids or hexsha in other_haves:
2634 yield unpacked
2635 reused += 1
2636 if progress is not None:
2637 progress((f"found {reused} deltas to reuse\n").encode())
2640def deltify_pack_objects(
2641 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[bytes]]]],
2642 *,
2643 window_size: Optional[int] = None,
2644 progress: Optional[Callable[..., None]] = None,
2645) -> Iterator[UnpackedObject]:
2646 """Generate deltas for pack objects.
2648 Args:
2649 objects: An iterable of (object, path) tuples to deltify.
2650 window_size: Window size; None for default
2651 progress: Optional progress reporting callback
2652 Returns: Iterator over type_num, object id, delta_base, content
2653 delta_base is None for full text entries
2654 """
2656 def objects_with_hints() -> Iterator[tuple[ShaFile, tuple[int, Optional[bytes]]]]:
2657 for e in objects:
2658 if isinstance(e, ShaFile):
2659 yield (e, (e.type_num, None))
2660 else:
2661 yield (e[0], (e[0].type_num, e[1]))
2663 sorted_objs = sort_objects_for_delta(objects_with_hints())
2664 yield from deltas_from_sorted_objects(
2665 sorted_objs,
2666 window_size=window_size,
2667 progress=progress,
2668 )
2671def sort_objects_for_delta(
2672 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
2673) -> Iterator[tuple[ShaFile, Optional[bytes]]]:
2674 """Sort objects for optimal delta compression.
2676 Args:
2677 objects: Iterator of objects or (object, hint) tuples
2679 Returns:
2680 Iterator of sorted (ShaFile, path) tuples
2681 """
2682 magic = []
2683 for entry in objects:
2684 if isinstance(entry, tuple):
2685 obj, hint = entry
2686 if hint is None:
2687 type_num = None
2688 path = None
2689 else:
2690 (type_num, path) = hint
2691 else:
2692 obj = entry
2693 type_num = None
2694 path = None
2695 magic.append((type_num, path, -obj.raw_length(), obj))
2696 # Build a list of objects ordered by the magic Linus heuristic
2697 # This helps us find good objects to diff against us
2698 magic.sort()
2699 return ((x[3], x[1]) for x in magic)
2702def deltas_from_sorted_objects(
2703 objects: Iterator[tuple[ShaFile, Optional[bytes]]],
2704 window_size: Optional[int] = None,
2705 progress: Optional[Callable[..., None]] = None,
2706) -> Iterator[UnpackedObject]:
2707 """Create deltas from sorted objects.
2709 Args:
2710 objects: Iterator of sorted objects to deltify
2711 window_size: Delta window size; None for default
2712 progress: Optional progress reporting callback
2714 Returns:
2715 Iterator of UnpackedObject entries
2716 """
2717 # TODO(jelmer): Use threads
2718 if window_size is None:
2719 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2721 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2722 for i, (o, path) in enumerate(objects):
2723 if progress is not None and i % 1000 == 0:
2724 progress((f"generating deltas: {i}\r").encode())
2725 raw = o.as_raw_chunks()
2726 winner = raw
2727 winner_len = sum(map(len, winner))
2728 winner_base = None
2729 for base_id, base_type_num, base in possible_bases:
2730 if base_type_num != o.type_num:
2731 continue
2732 delta_len = 0
2733 delta = []
2734 for chunk in create_delta(b"".join(base), b"".join(raw)):
2735 delta_len += len(chunk)
2736 if delta_len >= winner_len:
2737 break
2738 delta.append(chunk)
2739 else:
2740 winner_base = base_id
2741 winner = delta
2742 winner_len = sum(map(len, winner))
2743 yield UnpackedObject(
2744 o.type_num,
2745 sha=o.sha().digest(),
2746 delta_base=winner_base,
2747 decomp_len=winner_len,
2748 decomp_chunks=winner,
2749 )
2750 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2751 while len(possible_bases) > window_size:
2752 possible_bases.pop()
2755def pack_objects_to_data(
2756 objects: Union[
2757 Sequence[ShaFile],
2758 Sequence[tuple[ShaFile, Optional[bytes]]],
2759 Sequence[tuple[ShaFile, Optional[PackHint]]],
2760 ],
2761 *,
2762 deltify: Optional[bool] = None,
2763 delta_window_size: Optional[int] = None,
2764 ofs_delta: bool = True,
2765 progress: Optional[Callable[..., None]] = None,
2766) -> tuple[int, Iterator[UnpackedObject]]:
2767 """Create pack data from objects.
2769 Args:
2770 objects: Pack objects
2771 deltify: Whether to deltify pack objects
2772 delta_window_size: Delta window size
2773 ofs_delta: Whether to use offset deltas
2774 progress: Optional progress reporting callback
2775 Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
2776 """
2777 count = len(objects)
2778 if deltify is None:
2779 # PERFORMANCE/TODO(jelmer): This should be enabled but the python
2780 # implementation is *much* too slow at the moment.
2781 # Maybe consider enabling it just if the rust extension is available?
2782 deltify = False
2783 if deltify:
2784 return (
2785 count,
2786 deltify_pack_objects(
2787 iter(objects), # type: ignore
2788 window_size=delta_window_size,
2789 progress=progress,
2790 ),
2791 )
2792 else:
2794 def iter_without_path() -> Iterator[UnpackedObject]:
2795 for o in objects:
2796 if isinstance(o, tuple):
2797 yield full_unpacked_object(o[0])
2798 else:
2799 yield full_unpacked_object(o)
2801 return (count, iter_without_path())
2804def generate_unpacked_objects(
2805 container: PackedObjectContainer,
2806 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2807 delta_window_size: Optional[int] = None,
2808 deltify: Optional[bool] = None,
2809 reuse_deltas: bool = True,
2810 ofs_delta: bool = True,
2811 other_haves: Optional[set[bytes]] = None,
2812 progress: Optional[Callable[..., None]] = None,
2813) -> Iterator[UnpackedObject]:
2814 """Create pack data from objects.
2816 Returns: Tuples with (type_num, hexdigest, delta base, object chunks)
2817 """
2818 todo = dict(object_ids)
2819 if reuse_deltas:
2820 for unpack in find_reusable_deltas(
2821 container, set(todo), other_haves=other_haves, progress=progress
2822 ):
2823 del todo[sha_to_hex(unpack.sha())]
2824 yield unpack
2825 if deltify is None:
2826 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2827 # slow at the moment.
2828 deltify = False
2829 if deltify:
2830 objects_to_delta = container.iterobjects_subset(
2831 todo.keys(), allow_missing=False
2832 )
2833 sorted_objs = sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta)
2834 yield from deltas_from_sorted_objects(
2835 sorted_objs,
2836 window_size=delta_window_size,
2837 progress=progress,
2838 )
2839 else:
2840 for oid in todo:
2841 yield full_unpacked_object(container[oid])
2844def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2845 """Create an UnpackedObject from a ShaFile.
2847 Args:
2848 o: ShaFile object to convert
2850 Returns:
2851 UnpackedObject with full object data
2852 """
2853 return UnpackedObject(
2854 o.type_num,
2855 delta_base=None,
2856 crc32=None,
2857 decomp_chunks=o.as_raw_chunks(),
2858 sha=o.sha().digest(),
2859 )
2862def write_pack_from_container(
2863 write: Union[
2864 Callable[[bytes], None],
2865 Callable[[Union[bytes, bytearray, memoryview]], int],
2866 IO[bytes],
2867 ],
2868 container: PackedObjectContainer,
2869 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2870 delta_window_size: Optional[int] = None,
2871 deltify: Optional[bool] = None,
2872 reuse_deltas: bool = True,
2873 compression_level: int = -1,
2874 other_haves: Optional[set[bytes]] = None,
2875) -> tuple[dict[bytes, tuple[int, int]], bytes]:
2876 """Write a new pack data file.
2878 Args:
2879 write: write function to use
2880 container: PackedObjectContainer
2881 object_ids: Sequence of (object_id, hint) tuples to write
2882 delta_window_size: Sliding window size for searching for deltas;
2883 Set to None for default window size.
2884 deltify: Whether to deltify objects
2885 reuse_deltas: Whether to reuse existing deltas
2886 compression_level: the zlib compression level to use
2887 other_haves: Set of additional object IDs the receiver has
2888 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2889 """
2890 pack_contents_count = len(object_ids)
2891 pack_contents = generate_unpacked_objects(
2892 container,
2893 object_ids,
2894 delta_window_size=delta_window_size,
2895 deltify=deltify,
2896 reuse_deltas=reuse_deltas,
2897 other_haves=other_haves,
2898 )
2900 return write_pack_data(
2901 write,
2902 pack_contents,
2903 num_records=pack_contents_count,
2904 compression_level=compression_level,
2905 )
2908def write_pack_objects(
2909 write: Union[Callable[[bytes], None], IO[bytes]],
2910 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2911 *,
2912 delta_window_size: Optional[int] = None,
2913 deltify: Optional[bool] = None,
2914 compression_level: int = -1,
2915) -> tuple[dict[bytes, tuple[int, int]], bytes]:
2916 """Write a new pack data file.
2918 Args:
2919 write: write function to use
2920 objects: Sequence of (object, path) tuples to write
2921 delta_window_size: Sliding window size for searching for deltas;
2922 Set to None for default window size.
2923 deltify: Whether to deltify objects
2924 compression_level: the zlib compression level to use
2925 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2926 """
2927 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2929 return write_pack_data(
2930 write,
2931 pack_contents,
2932 num_records=pack_contents_count,
2933 compression_level=compression_level,
2934 )
2937class PackChunkGenerator:
2938 """Generator for pack data chunks."""
2940 def __init__(
2941 self,
2942 num_records: Optional[int] = None,
2943 records: Optional[Iterator[UnpackedObject]] = None,
2944 progress: Optional[Callable[..., None]] = None,
2945 compression_level: int = -1,
2946 reuse_compressed: bool = True,
2947 ) -> None:
2948 """Initialize PackChunkGenerator.
2950 Args:
2951 num_records: Expected number of records
2952 records: Iterator of pack records
2953 progress: Optional progress callback
2954 compression_level: Compression level (-1 for default)
2955 reuse_compressed: Whether to reuse compressed chunks
2956 """
2957 self.cs = sha1(b"")
2958 self.entries: dict[bytes, tuple[int, int]] = {}
2959 if records is None:
2960 records = iter([]) # Empty iterator if None
2961 self._it = self._pack_data_chunks(
2962 records=records,
2963 num_records=num_records,
2964 progress=progress,
2965 compression_level=compression_level,
2966 reuse_compressed=reuse_compressed,
2967 )
2969 def sha1digest(self) -> bytes:
2970 """Return the SHA1 digest of the pack data."""
2971 return self.cs.digest()
2973 def __iter__(self) -> Iterator[bytes]:
2974 """Iterate over pack data chunks."""
2975 return self._it
2977 def _pack_data_chunks(
2978 self,
2979 records: Iterator[UnpackedObject],
2980 *,
2981 num_records: Optional[int] = None,
2982 progress: Optional[Callable[..., None]] = None,
2983 compression_level: int = -1,
2984 reuse_compressed: bool = True,
2985 ) -> Iterator[bytes]:
2986 """Iterate pack data file chunks.
2988 Args:
2989 records: Iterator over UnpackedObject
2990 num_records: Number of records (defaults to len(records) if not specified)
2991 progress: Function to report progress to
2992 compression_level: the zlib compression level
2993 reuse_compressed: Whether to reuse compressed chunks
2994 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2995 """
2996 # Write the pack
2997 if num_records is None:
2998 num_records = len(records) # type: ignore
2999 offset = 0
3000 for chunk in pack_header_chunks(num_records):
3001 yield chunk
3002 self.cs.update(chunk)
3003 offset += len(chunk)
3004 actual_num_records = 0
3005 for i, unpacked in enumerate(records):
3006 type_num = unpacked.pack_type_num
3007 if progress is not None and i % 1000 == 0:
3008 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
3009 raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
3010 if unpacked.delta_base is not None:
3011 assert isinstance(unpacked.delta_base, bytes), (
3012 f"Expected bytes, got {type(unpacked.delta_base)}"
3013 )
3014 try:
3015 base_offset, _base_crc32 = self.entries[unpacked.delta_base]
3016 except KeyError:
3017 type_num = REF_DELTA
3018 assert isinstance(unpacked.delta_base, bytes)
3019 raw = (unpacked.delta_base, unpacked.decomp_chunks)
3020 else:
3021 type_num = OFS_DELTA
3022 raw = (offset - base_offset, unpacked.decomp_chunks)
3023 else:
3024 raw = unpacked.decomp_chunks
3025 chunks: Union[list[bytes], Iterator[bytes]]
3026 if unpacked.comp_chunks is not None and reuse_compressed:
3027 chunks = unpacked.comp_chunks
3028 else:
3029 chunks = pack_object_chunks(
3030 type_num, raw, compression_level=compression_level
3031 )
3032 crc32 = 0
3033 object_size = 0
3034 for chunk in chunks:
3035 yield chunk
3036 crc32 = binascii.crc32(chunk, crc32)
3037 self.cs.update(chunk)
3038 object_size += len(chunk)
3039 actual_num_records += 1
3040 self.entries[unpacked.sha()] = (offset, crc32)
3041 offset += object_size
3042 if actual_num_records != num_records:
3043 raise AssertionError(
3044 f"actual records written differs: {actual_num_records} != {num_records}"
3045 )
3047 yield self.cs.digest()
3050def write_pack_data(
3051 write: Union[
3052 Callable[[bytes], None],
3053 Callable[[Union[bytes, bytearray, memoryview]], int],
3054 IO[bytes],
3055 ],
3056 records: Iterator[UnpackedObject],
3057 *,
3058 num_records: Optional[int] = None,
3059 progress: Optional[Callable[..., None]] = None,
3060 compression_level: int = -1,
3061) -> tuple[dict[bytes, tuple[int, int]], bytes]:
3062 """Write a new pack data file.
3064 Args:
3065 write: Write function to use
3066 num_records: Number of records (defaults to len(records) if None)
3067 records: Iterator over type_num, object_id, delta_base, raw
3068 progress: Function to report progress to
3069 compression_level: the zlib compression level
3070 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
3071 """
3072 chunk_generator = PackChunkGenerator(
3073 num_records=num_records,
3074 records=records,
3075 progress=progress,
3076 compression_level=compression_level,
3077 )
3078 for chunk in chunk_generator:
3079 if callable(write):
3080 write(chunk)
3081 else:
3082 write.write(chunk)
3083 return chunk_generator.entries, chunk_generator.sha1digest()
3086def write_pack_index_v1(
3087 f: IO[bytes],
3088 entries: Iterable[tuple[bytes, int, Union[int, None]]],
3089 pack_checksum: bytes,
3090) -> bytes:
3091 """Write a new pack index file.
3093 Args:
3094 f: A file-like object to write to
3095 entries: List of tuples with object name (sha), offset_in_pack,
3096 and crc32_checksum.
3097 pack_checksum: Checksum of the pack file.
3098 Returns: The SHA of the written index file
3099 """
3100 f = SHA1Writer(f)
3101 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3102 for name, _offset, _entry_checksum in entries:
3103 fan_out_table[ord(name[:1])] += 1
3104 # Fan-out table
3105 for i in range(0x100):
3106 f.write(struct.pack(">L", fan_out_table[i]))
3107 fan_out_table[i + 1] += fan_out_table[i]
3108 for name, offset, _entry_checksum in entries:
3109 if not (offset <= 0xFFFFFFFF):
3110 raise TypeError("pack format 1 only supports offsets < 2Gb")
3111 f.write(struct.pack(">L20s", offset, name))
3112 assert len(pack_checksum) == 20
3113 f.write(pack_checksum)
3114 return f.write_sha()
3117def _delta_encode_size(size: int) -> bytes:
3118 ret = bytearray()
3119 c = size & 0x7F
3120 size >>= 7
3121 while size:
3122 ret.append(c | 0x80)
3123 c = size & 0x7F
3124 size >>= 7
3125 ret.append(c)
3126 return bytes(ret)
3129# The length of delta compression copy operations in version 2 packs is limited
3130# to 64K. To copy more, we use several copy operations. Version 3 packs allow
3131# 24-bit lengths in copy operations, but we always make version 2 packs.
3132_MAX_COPY_LEN = 0xFFFF
3135def _encode_copy_operation(start: int, length: int) -> bytes:
3136 scratch = bytearray([0x80])
3137 for i in range(4):
3138 if start & 0xFF << i * 8:
3139 scratch.append((start >> i * 8) & 0xFF)
3140 scratch[0] |= 1 << i
3141 for i in range(2):
3142 if length & 0xFF << i * 8:
3143 scratch.append((length >> i * 8) & 0xFF)
3144 scratch[0] |= 1 << (4 + i)
3145 return bytes(scratch)
3148def _create_delta_py(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
3149 """Use python difflib to work out how to transform base_buf to target_buf.
3151 Args:
3152 base_buf: Base buffer
3153 target_buf: Target buffer
3154 """
3155 if isinstance(base_buf, list):
3156 base_buf = b"".join(base_buf)
3157 if isinstance(target_buf, list):
3158 target_buf = b"".join(target_buf)
3159 assert isinstance(base_buf, bytes)
3160 assert isinstance(target_buf, bytes)
3161 # write delta header
3162 yield _delta_encode_size(len(base_buf))
3163 yield _delta_encode_size(len(target_buf))
3164 # write out delta opcodes
3165 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
3166 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
3167 # Git patch opcodes don't care about deletes!
3168 # if opcode == 'replace' or opcode == 'delete':
3169 # pass
3170 if opcode == "equal":
3171 # If they are equal, unpacker will use data from base_buf
3172 # Write out an opcode that says what range to use
3173 copy_start = i1
3174 copy_len = i2 - i1
3175 while copy_len > 0:
3176 to_copy = min(copy_len, _MAX_COPY_LEN)
3177 yield _encode_copy_operation(copy_start, to_copy)
3178 copy_start += to_copy
3179 copy_len -= to_copy
3180 if opcode == "replace" or opcode == "insert":
3181 # If we are replacing a range or adding one, then we just
3182 # output it to the stream (prefixed by its size)
3183 s = j2 - j1
3184 o = j1
3185 while s > 127:
3186 yield bytes([127])
3187 yield bytes(memoryview(target_buf)[o : o + 127])
3188 s -= 127
3189 o += 127
3190 yield bytes([s])
3191 yield bytes(memoryview(target_buf)[o : o + s])
3194# Default to pure Python implementation
3195create_delta = _create_delta_py
3198def apply_delta(
3199 src_buf: Union[bytes, list[bytes]], delta: Union[bytes, list[bytes]]
3200) -> list[bytes]:
3201 """Based on the similar function in git's patch-delta.c.
3203 Args:
3204 src_buf: Source buffer
3205 delta: Delta instructions
3206 """
3207 if not isinstance(src_buf, bytes):
3208 src_buf = b"".join(src_buf)
3209 if not isinstance(delta, bytes):
3210 delta = b"".join(delta)
3211 out = []
3212 index = 0
3213 delta_length = len(delta)
3215 def get_delta_header_size(delta: bytes, index: int) -> tuple[int, int]:
3216 size = 0
3217 i = 0
3218 while delta:
3219 cmd = ord(delta[index : index + 1])
3220 index += 1
3221 size |= (cmd & ~0x80) << i
3222 i += 7
3223 if not cmd & 0x80:
3224 break
3225 return size, index
3227 src_size, index = get_delta_header_size(delta, index)
3228 dest_size, index = get_delta_header_size(delta, index)
3229 if src_size != len(src_buf):
3230 raise ApplyDeltaError(
3231 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
3232 )
3233 while index < delta_length:
3234 cmd = ord(delta[index : index + 1])
3235 index += 1
3236 if cmd & 0x80:
3237 cp_off = 0
3238 for i in range(4):
3239 if cmd & (1 << i):
3240 x = ord(delta[index : index + 1])
3241 index += 1
3242 cp_off |= x << (i * 8)
3243 cp_size = 0
3244 # Version 3 packs can contain copy sizes larger than 64K.
3245 for i in range(3):
3246 if cmd & (1 << (4 + i)):
3247 x = ord(delta[index : index + 1])
3248 index += 1
3249 cp_size |= x << (i * 8)
3250 if cp_size == 0:
3251 cp_size = 0x10000
3252 if (
3253 cp_off + cp_size < cp_size
3254 or cp_off + cp_size > src_size
3255 or cp_size > dest_size
3256 ):
3257 break
3258 out.append(src_buf[cp_off : cp_off + cp_size])
3259 elif cmd != 0:
3260 out.append(delta[index : index + cmd])
3261 index += cmd
3262 else:
3263 raise ApplyDeltaError("Invalid opcode 0")
3265 if index != delta_length:
3266 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
3268 if dest_size != chunks_length(out):
3269 raise ApplyDeltaError("dest size incorrect")
3271 return out
3274def write_pack_index_v2(
3275 f: IO[bytes],
3276 entries: Iterable[tuple[bytes, int, Union[int, None]]],
3277 pack_checksum: bytes,
3278) -> bytes:
3279 """Write a new pack index file.
3281 Args:
3282 f: File-like object to write to
3283 entries: List of tuples with object name (sha), offset_in_pack, and
3284 crc32_checksum.
3285 pack_checksum: Checksum of the pack file.
3286 Returns: The SHA of the index file written
3287 """
3288 f = SHA1Writer(f)
3289 f.write(b"\377tOc") # Magic!
3290 f.write(struct.pack(">L", 2))
3291 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3292 for name, offset, entry_checksum in entries:
3293 fan_out_table[ord(name[:1])] += 1
3294 # Fan-out table
3295 largetable: list[int] = []
3296 for i in range(0x100):
3297 f.write(struct.pack(b">L", fan_out_table[i]))
3298 fan_out_table[i + 1] += fan_out_table[i]
3299 for name, offset, entry_checksum in entries:
3300 f.write(name)
3301 for name, offset, entry_checksum in entries:
3302 f.write(struct.pack(b">L", entry_checksum))
3303 for name, offset, entry_checksum in entries:
3304 if offset < 2**31:
3305 f.write(struct.pack(b">L", offset))
3306 else:
3307 f.write(struct.pack(b">L", 2**31 + len(largetable)))
3308 largetable.append(offset)
3309 for offset in largetable:
3310 f.write(struct.pack(b">Q", offset))
3311 assert len(pack_checksum) == 20
3312 f.write(pack_checksum)
3313 return f.write_sha()
3316def write_pack_index_v3(
3317 f: IO[bytes],
3318 entries: Iterable[tuple[bytes, int, Union[int, None]]],
3319 pack_checksum: bytes,
3320 hash_algorithm: int = 1,
3321) -> bytes:
3322 """Write a new pack index file in v3 format.
3324 Args:
3325 f: File-like object to write to
3326 entries: List of tuples with object name (sha), offset_in_pack, and
3327 crc32_checksum.
3328 pack_checksum: Checksum of the pack file.
3329 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
3330 Returns: The SHA of the index file written
3331 """
3332 if hash_algorithm == 1:
3333 hash_size = 20 # SHA-1
3334 writer_cls = SHA1Writer
3335 elif hash_algorithm == 2:
3336 hash_size = 32 # SHA-256
3337 # TODO: Add SHA256Writer when SHA-256 support is implemented
3338 raise NotImplementedError("SHA-256 support not yet implemented")
3339 else:
3340 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
3342 # Convert entries to list to allow multiple iterations
3343 entries_list = list(entries)
3345 # Calculate shortest unambiguous prefix length for object names
3346 # For now, use full hash size (this could be optimized)
3347 shortened_oid_len = hash_size
3349 f = writer_cls(f)
3350 f.write(b"\377tOc") # Magic!
3351 f.write(struct.pack(">L", 3)) # Version 3
3352 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
3353 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
3355 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
3356 for name, offset, entry_checksum in entries_list:
3357 if len(name) != hash_size:
3358 raise ValueError(
3359 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
3360 )
3361 fan_out_table[ord(name[:1])] += 1
3363 # Fan-out table
3364 largetable: list[int] = []
3365 for i in range(0x100):
3366 f.write(struct.pack(b">L", fan_out_table[i]))
3367 fan_out_table[i + 1] += fan_out_table[i]
3369 # Object names table
3370 for name, offset, entry_checksum in entries_list:
3371 f.write(name)
3373 # CRC32 checksums table
3374 for name, offset, entry_checksum in entries_list:
3375 f.write(struct.pack(b">L", entry_checksum))
3377 # Offset table
3378 for name, offset, entry_checksum in entries_list:
3379 if offset < 2**31:
3380 f.write(struct.pack(b">L", offset))
3381 else:
3382 f.write(struct.pack(b">L", 2**31 + len(largetable)))
3383 largetable.append(offset)
3385 # Large offset table
3386 for offset in largetable:
3387 f.write(struct.pack(b">Q", offset))
3389 assert len(pack_checksum) == hash_size, (
3390 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
3391 )
3392 f.write(pack_checksum)
3393 return f.write_sha()
3396def write_pack_index(
3397 f: IO[bytes],
3398 entries: Iterable[tuple[bytes, int, Union[int, None]]],
3399 pack_checksum: bytes,
3400 progress: Optional[Callable[..., None]] = None,
3401 version: Optional[int] = None,
3402) -> bytes:
3403 """Write a pack index file.
3405 Args:
3406 f: File-like object to write to.
3407 entries: List of (checksum, offset, crc32) tuples
3408 pack_checksum: Checksum of the pack file.
3409 progress: Progress function (not currently used)
3410 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
3412 Returns:
3413 SHA of the written index file
3414 """
3415 if version is None:
3416 version = DEFAULT_PACK_INDEX_VERSION
3418 if version == 1:
3419 return write_pack_index_v1(f, entries, pack_checksum)
3420 elif version == 2:
3421 return write_pack_index_v2(f, entries, pack_checksum)
3422 elif version == 3:
3423 return write_pack_index_v3(f, entries, pack_checksum)
3424 else:
3425 raise ValueError(f"Unsupported pack index version: {version}")
3428class Pack:
3429 """A Git pack object."""
3431 _data_load: Optional[Callable[[], PackData]]
3432 _idx_load: Optional[Callable[[], PackIndex]]
3434 _data: Optional[PackData]
3435 _idx: Optional[PackIndex]
3436 _bitmap: Optional["PackBitmap"]
3438 def __init__(
3439 self,
3440 basename: str,
3441 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
3442 *,
3443 delta_window_size: Optional[int] = None,
3444 window_memory: Optional[int] = None,
3445 delta_cache_size: Optional[int] = None,
3446 depth: Optional[int] = None,
3447 threads: Optional[int] = None,
3448 big_file_threshold: Optional[int] = None,
3449 ) -> None:
3450 """Initialize a Pack object.
3452 Args:
3453 basename: Base path for pack files (without .pack/.idx extension)
3454 resolve_ext_ref: Optional function to resolve external references
3455 delta_window_size: Size of the delta compression window
3456 window_memory: Memory limit for delta compression window
3457 delta_cache_size: Size of the delta cache
3458 depth: Maximum depth for delta chains
3459 threads: Number of threads to use for operations
3460 big_file_threshold: Size threshold for big file handling
3461 """
3462 self._basename = basename
3463 self._data = None
3464 self._idx = None
3465 self._bitmap = None
3466 self._idx_path = self._basename + ".idx"
3467 self._data_path = self._basename + ".pack"
3468 self._bitmap_path = self._basename + ".bitmap"
3469 self.delta_window_size = delta_window_size
3470 self.window_memory = window_memory
3471 self.delta_cache_size = delta_cache_size
3472 self.depth = depth
3473 self.threads = threads
3474 self.big_file_threshold = big_file_threshold
3475 self._data_load = lambda: PackData(
3476 self._data_path,
3477 delta_window_size=delta_window_size,
3478 window_memory=window_memory,
3479 delta_cache_size=delta_cache_size,
3480 depth=depth,
3481 threads=threads,
3482 big_file_threshold=big_file_threshold,
3483 )
3484 self._idx_load = lambda: load_pack_index(self._idx_path)
3485 self.resolve_ext_ref = resolve_ext_ref
3487 @classmethod
3488 def from_lazy_objects(
3489 cls, data_fn: Callable[[], PackData], idx_fn: Callable[[], PackIndex]
3490 ) -> "Pack":
3491 """Create a new pack object from callables to load pack data and index objects."""
3492 ret = cls("")
3493 ret._data_load = data_fn
3494 ret._idx_load = idx_fn
3495 return ret
3497 @classmethod
3498 def from_objects(cls, data: PackData, idx: PackIndex) -> "Pack":
3499 """Create a new pack object from pack data and index objects."""
3500 ret = cls("")
3501 ret._data = data
3502 ret._data_load = None
3503 ret._idx = idx
3504 ret._idx_load = None
3505 ret.check_length_and_checksum()
3506 return ret
3508 def name(self) -> bytes:
3509 """The SHA over the SHAs of the objects in this pack."""
3510 return self.index.objects_sha1()
3512 @property
3513 def data(self) -> PackData:
3514 """The pack data object being used."""
3515 if self._data is None:
3516 assert self._data_load
3517 self._data = self._data_load()
3518 self.check_length_and_checksum()
3519 return self._data
3521 @property
3522 def index(self) -> PackIndex:
3523 """The index being used.
3525 Note: This may be an in-memory index
3526 """
3527 if self._idx is None:
3528 assert self._idx_load
3529 self._idx = self._idx_load()
3530 return self._idx
3532 @property
3533 def bitmap(self) -> Optional["PackBitmap"]:
3534 """The bitmap being used, if available.
3536 Returns:
3537 PackBitmap instance or None if no bitmap exists
3539 Raises:
3540 ValueError: If bitmap file is invalid or corrupt
3541 """
3542 if self._bitmap is None:
3543 from .bitmap import read_bitmap
3545 self._bitmap = read_bitmap(self._bitmap_path, pack_index=self.index)
3546 return self._bitmap
3548 def close(self) -> None:
3549 """Close the pack file and index."""
3550 if self._data is not None:
3551 self._data.close()
3552 if self._idx is not None:
3553 self._idx.close()
3555 def __enter__(self) -> "Pack":
3556 """Enter context manager."""
3557 return self
3559 def __exit__(
3560 self,
3561 exc_type: Optional[type],
3562 exc_val: Optional[BaseException],
3563 exc_tb: Optional[TracebackType],
3564 ) -> None:
3565 """Exit context manager."""
3566 self.close()
3568 def __eq__(self, other: object) -> bool:
3569 """Check equality with another pack."""
3570 if not isinstance(other, Pack):
3571 return False
3572 return self.index == other.index
3574 def __len__(self) -> int:
3575 """Number of entries in this pack."""
3576 return len(self.index)
3578 def __repr__(self) -> str:
3579 """Return string representation of this pack."""
3580 return f"{self.__class__.__name__}({self._basename!r})"
3582 def __iter__(self) -> Iterator[bytes]:
3583 """Iterate over all the sha1s of the objects in this pack."""
3584 return iter(self.index)
3586 def check_length_and_checksum(self) -> None:
3587 """Sanity check the length and checksum of the pack index and data."""
3588 assert len(self.index) == len(self.data), (
3589 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
3590 )
3591 idx_stored_checksum = self.index.get_pack_checksum()
3592 data_stored_checksum = self.data.get_stored_checksum()
3593 if (
3594 idx_stored_checksum is not None
3595 and idx_stored_checksum != data_stored_checksum
3596 ):
3597 raise ChecksumMismatch(
3598 sha_to_hex(idx_stored_checksum),
3599 sha_to_hex(data_stored_checksum),
3600 )
3602 def check(self) -> None:
3603 """Check the integrity of this pack.
3605 Raises:
3606 ChecksumMismatch: if a checksum for the index or data is wrong
3607 """
3608 self.index.check()
3609 self.data.check()
3610 for obj in self.iterobjects():
3611 obj.check()
3612 # TODO: object connectivity checks
3614 def get_stored_checksum(self) -> bytes:
3615 """Return the stored checksum of the pack data."""
3616 return self.data.get_stored_checksum()
3618 def pack_tuples(self) -> list[tuple[ShaFile, None]]:
3619 """Return pack tuples for all objects in pack."""
3620 return [(o, None) for o in self.iterobjects()]
3622 def __contains__(self, sha1: bytes) -> bool:
3623 """Check whether this pack contains a particular SHA1."""
3624 try:
3625 self.index.object_offset(sha1)
3626 return True
3627 except KeyError:
3628 return False
3630 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
3631 """Get raw object data by SHA1."""
3632 offset = self.index.object_offset(sha1)
3633 obj_type, obj = self.data.get_object_at(offset)
3634 type_num, chunks = self.resolve_object(offset, obj_type, obj)
3635 return type_num, b"".join(chunks) # type: ignore[arg-type]
3637 def __getitem__(self, sha1: bytes) -> ShaFile:
3638 """Retrieve the specified SHA1."""
3639 type, uncomp = self.get_raw(sha1)
3640 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
3642 def iterobjects(self) -> Iterator[ShaFile]:
3643 """Iterate over the objects in this pack."""
3644 return iter(
3645 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
3646 )
3648 def iterobjects_subset(
3649 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
3650 ) -> Iterator[ShaFile]:
3651 """Iterate over a subset of objects in this pack."""
3652 return (
3653 uo
3654 for uo in PackInflater.for_pack_subset(
3655 self,
3656 shas,
3657 allow_missing=allow_missing,
3658 resolve_ext_ref=self.resolve_ext_ref,
3659 )
3660 if uo.id in shas
3661 )
3663 def iter_unpacked_subset(
3664 self,
3665 shas: Iterable[ObjectID],
3666 *,
3667 include_comp: bool = False,
3668 allow_missing: bool = False,
3669 convert_ofs_delta: bool = False,
3670 ) -> Iterator[UnpackedObject]:
3671 """Iterate over unpacked objects in subset."""
3672 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
3673 ofs: dict[int, bytes] = {}
3674 todo = set(shas)
3675 for unpacked in self.iter_unpacked(include_comp=include_comp):
3676 sha = unpacked.sha()
3677 if unpacked.offset is not None:
3678 ofs[unpacked.offset] = sha
3679 hexsha = sha_to_hex(sha)
3680 if hexsha in todo:
3681 if unpacked.pack_type_num == OFS_DELTA:
3682 assert isinstance(unpacked.delta_base, int)
3683 assert unpacked.offset is not None
3684 base_offset = unpacked.offset - unpacked.delta_base
3685 try:
3686 unpacked.delta_base = ofs[base_offset]
3687 except KeyError:
3688 ofs_pending[base_offset].append(unpacked)
3689 continue
3690 else:
3691 unpacked.pack_type_num = REF_DELTA
3692 yield unpacked
3693 todo.remove(hexsha)
3694 if unpacked.offset is not None:
3695 for child in ofs_pending.pop(unpacked.offset, []):
3696 child.pack_type_num = REF_DELTA
3697 child.delta_base = sha
3698 yield child
3699 assert not ofs_pending
3700 if not allow_missing and todo:
3701 raise UnresolvedDeltas(list(todo))
3703 def iter_unpacked(self, include_comp: bool = False) -> Iterator[UnpackedObject]:
3704 """Iterate over all unpacked objects in this pack."""
3705 ofs_to_entries = {
3706 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
3707 }
3708 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
3709 assert unpacked.offset is not None
3710 (sha, crc32) = ofs_to_entries[unpacked.offset]
3711 unpacked._sha = sha
3712 unpacked.crc32 = crc32
3713 yield unpacked
3715 def keep(self, msg: Optional[bytes] = None) -> str:
3716 """Add a .keep file for the pack, preventing git from garbage collecting it.
3718 Args:
3719 msg: A message written inside the .keep file; can be used later
3720 to determine whether or not a .keep file is obsolete.
3721 Returns: The path of the .keep file, as a string.
3722 """
3723 keepfile_name = f"{self._basename}.keep"
3724 with GitFile(keepfile_name, "wb") as keepfile:
3725 if msg:
3726 keepfile.write(msg)
3727 keepfile.write(b"\n")
3728 return keepfile_name
3730 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
3731 """Get the object for a ref SHA, only looking in this pack."""
3732 # TODO: cache these results
3733 try:
3734 offset = self.index.object_offset(sha)
3735 except KeyError:
3736 offset = None
3737 if offset:
3738 type, obj = self.data.get_object_at(offset)
3739 elif self.resolve_ext_ref:
3740 type, obj = self.resolve_ext_ref(sha)
3741 else:
3742 raise KeyError(sha)
3743 return offset, type, obj
3745 def resolve_object(
3746 self,
3747 offset: int,
3748 type: int,
3749 obj: OldUnpackedObject,
3750 get_ref: Optional[
3751 Callable[[bytes], tuple[Optional[int], int, OldUnpackedObject]]
3752 ] = None,
3753 ) -> tuple[int, OldUnpackedObject]:
3754 """Resolve an object, possibly resolving deltas when necessary.
3756 Returns: Tuple with object type and contents.
3757 """
3758 # Walk down the delta chain, building a stack of deltas to reach
3759 # the requested object.
3760 base_offset = offset
3761 base_type = type
3762 base_obj = obj
3763 delta_stack = []
3764 while base_type in DELTA_TYPES:
3765 prev_offset = base_offset
3766 if get_ref is None:
3767 get_ref = self.get_ref
3768 if base_type == OFS_DELTA:
3769 (delta_offset, delta) = base_obj
3770 # TODO: clean up asserts and replace with nicer error messages
3771 assert isinstance(delta_offset, int), (
3772 f"Expected int, got {delta_offset.__class__}"
3773 )
3774 base_offset = base_offset - delta_offset
3775 base_type, base_obj = self.data.get_object_at(base_offset)
3776 assert isinstance(base_type, int)
3777 elif base_type == REF_DELTA:
3778 (basename, delta) = base_obj
3779 assert isinstance(basename, bytes) and len(basename) == 20
3780 base_offset, base_type, base_obj = get_ref(basename) # type: ignore[assignment]
3781 assert isinstance(base_type, int)
3782 if base_offset == prev_offset: # object is based on itself
3783 raise UnresolvedDeltas([basename])
3784 delta_stack.append((prev_offset, base_type, delta))
3786 # Now grab the base object (mustn't be a delta) and apply the
3787 # deltas all the way up the stack.
3788 chunks = base_obj
3789 for prev_offset, _delta_type, delta in reversed(delta_stack):
3790 # Convert chunks to bytes for apply_delta if needed
3791 if isinstance(chunks, list):
3792 chunks_bytes = b"".join(chunks)
3793 elif isinstance(chunks, tuple):
3794 # For tuple type, second element is the actual data
3795 _, chunk_data = chunks
3796 if isinstance(chunk_data, list):
3797 chunks_bytes = b"".join(chunk_data)
3798 else:
3799 chunks_bytes = chunk_data
3800 else:
3801 chunks_bytes = chunks
3803 # Apply delta and get result as list
3804 chunks = apply_delta(chunks_bytes, delta)
3806 if prev_offset is not None:
3807 self.data._offset_cache[prev_offset] = base_type, chunks
3808 return base_type, chunks
3810 def entries(
3811 self, progress: Optional[Callable[[int, int], None]] = None
3812 ) -> Iterator[PackIndexEntry]:
3813 """Yield entries summarizing the contents of this pack.
3815 Args:
3816 progress: Progress function, called with current and total
3817 object count.
3818 Returns: iterator of tuples with (sha, offset, crc32)
3819 """
3820 return self.data.iterentries(
3821 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3822 )
3824 def sorted_entries(
3825 self, progress: Optional[ProgressFn] = None
3826 ) -> Iterator[PackIndexEntry]:
3827 """Return entries in this pack, sorted by SHA.
3829 Args:
3830 progress: Progress function, called with current and total
3831 object count
3832 Returns: Iterator of tuples with (sha, offset, crc32)
3833 """
3834 return iter(
3835 self.data.sorted_entries(
3836 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3837 )
3838 )
3840 def get_unpacked_object(
3841 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3842 ) -> UnpackedObject:
3843 """Get the unpacked object for a sha.
3845 Args:
3846 sha: SHA of object to fetch
3847 include_comp: Whether to include compression data in UnpackedObject
3848 convert_ofs_delta: Whether to convert offset deltas to ref deltas
3849 """
3850 offset = self.index.object_offset(sha)
3851 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3852 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3853 assert isinstance(unpacked.delta_base, int)
3854 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3855 unpacked.pack_type_num = REF_DELTA
3856 return unpacked
3859def extend_pack(
3860 f: BinaryIO,
3861 object_ids: Set[ObjectID],
3862 get_raw: Callable[[ObjectID], tuple[int, bytes]],
3863 *,
3864 compression_level: int = -1,
3865 progress: Optional[Callable[[bytes], None]] = None,
3866) -> tuple[bytes, list[tuple[bytes, int, int]]]:
3867 """Extend a pack file with more objects.
3869 The caller should make sure that object_ids does not contain any objects
3870 that are already in the pack
3871 """
3872 # Update the header with the new number of objects.
3873 f.seek(0)
3874 _version, num_objects = read_pack_header(f.read)
3876 if object_ids:
3877 f.seek(0)
3878 write_pack_header(f.write, num_objects + len(object_ids))
3880 # Must flush before reading (http://bugs.python.org/issue3207)
3881 f.flush()
3883 # Rescan the rest of the pack, computing the SHA with the new header.
3884 new_sha = compute_file_sha(f, end_ofs=-20)
3886 # Must reposition before writing (http://bugs.python.org/issue3207)
3887 f.seek(0, os.SEEK_CUR)
3889 extra_entries = []
3891 # Complete the pack.
3892 for i, object_id in enumerate(object_ids):
3893 if progress is not None:
3894 progress(
3895 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
3896 )
3897 assert len(object_id) == 20
3898 type_num, data = get_raw(object_id)
3899 offset = f.tell()
3900 crc32 = write_pack_object(
3901 f.write,
3902 type_num,
3903 [data], # Convert bytes to list[bytes]
3904 sha=new_sha,
3905 compression_level=compression_level,
3906 )
3907 extra_entries.append((object_id, offset, crc32))
3908 pack_sha = new_sha.digest()
3909 f.write(pack_sha)
3910 return pack_sha, extra_entries
3913try:
3914 from dulwich._pack import ( # type: ignore
3915 apply_delta,
3916 bisect_find_sha,
3917 )
3918except ImportError:
3919 pass
3921# Try to import the Rust version of create_delta
3922try:
3923 from dulwich._pack import create_delta as _create_delta_rs
3924except ImportError:
3925 pass
3926else:
3927 # Wrap the Rust version to match the Python API (returns bytes instead of Iterator)
3928 def _create_delta_rs_wrapper(base_buf: bytes, target_buf: bytes) -> Iterator[bytes]:
3929 """Wrapper for Rust create_delta to match Python API."""
3930 yield _create_delta_rs(base_buf, target_buf)
3932 create_delta = _create_delta_rs_wrapper