Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 24%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28They have two parts, the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in all of the index files 'til you find a
32match for the object name. You then use the pointer got from this as
33a pointer in to the corresponding packfile.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from typing import (
57 IO,
58 TYPE_CHECKING,
59 Any,
60 BinaryIO,
61 Callable,
62 Generic,
63 Optional,
64 Protocol,
65 TypeVar,
66 Union,
67)
69try:
70 import mmap
71except ImportError:
72 has_mmap = False
73else:
74 has_mmap = True
76if TYPE_CHECKING:
77 from .commit_graph import CommitGraph
79# For some reason the above try, except fails to set has_mmap = False for plan9
80if sys.platform == "Plan9":
81 has_mmap = False
83from . import replace_me
84from .errors import ApplyDeltaError, ChecksumMismatch
85from .file import GitFile, _GitFile
86from .lru_cache import LRUSizeCache
87from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
89OFS_DELTA = 6
90REF_DELTA = 7
92DELTA_TYPES = (OFS_DELTA, REF_DELTA)
95DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
97# Keep pack files under 16Mb in memory, otherwise write them out to disk
98PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
100# Default pack index version to use when none is specified
101DEFAULT_PACK_INDEX_VERSION = 2
104OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
105ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
106ProgressFn = Callable[[int, str], None]
107PackHint = tuple[int, Optional[bytes]]
class UnresolvedDeltas(Exception):
    """Raised when delta objects in a pack could not be resolved.

    Attributes:
        shas: Binary SHAs of the objects whose delta bases were missing.
    """

    def __init__(self, shas: list[bytes]) -> None:
        """Record which object SHAs were left unresolved.

        Args:
            shas: SHAs of the unresolvable delta objects.
        """
        # NOTE: deliberately no super().__init__() call, matching the original;
        # BaseException.__new__ already captures the constructor args.
        self.shas = shas
class ObjectContainer(Protocol):
    """Protocol for a container that stores and retrieves git objects."""

    def add_object(self, obj: ShaFile) -> None:
        """Add a single object to this object store."""

    def add_objects(
        self,
        objects: Sequence[tuple[ShaFile, Optional[str]]],
        progress: Optional[Callable[[str], None]] = None,
    ) -> None:
        """Add a set of objects to this object store.

        Args:
            objects: Iterable over a list of (object, path) tuples
            progress: Optional progress callback taking a status string
        """

    def __contains__(self, sha1: bytes) -> bool:
        """Check if a hex sha is present."""

    def __getitem__(self, sha1: bytes) -> ShaFile:
        """Retrieve an object."""

    def get_commit_graph(self) -> Optional["CommitGraph"]:
        """Get the commit graph for this object store.

        Returns:
            CommitGraph object if available, None otherwise
        """
        # Default: no commit graph support; implementations may override.
        return None
class PackedObjectContainer(ObjectContainer):
    """Object container that can also serve objects in packed (raw) form."""

    def get_unpacked_object(
        self, sha1: bytes, *, include_comp: bool = False
    ) -> "UnpackedObject":
        """Get a raw unresolved object.

        Args:
            sha1: SHA of the object to retrieve
            include_comp: If True, include the compressed chunks as well
        """
        raise NotImplementedError(self.get_unpacked_object)

    def iterobjects_subset(
        self, shas: Iterable[bytes], *, allow_missing: bool = False
    ) -> Iterator[ShaFile]:
        """Iterate over the ShaFile objects for the given SHAs.

        Args:
            shas: SHAs of the objects to return
            allow_missing: If True, silently skip SHAs not in the container
        """
        raise NotImplementedError(self.iterobjects_subset)

    def iter_unpacked_subset(
        self,
        shas: set[bytes],
        include_comp: bool = False,
        allow_missing: bool = False,
        convert_ofs_delta: bool = True,
    ) -> Iterator["UnpackedObject"]:
        """Iterate over UnpackedObject instances for the given SHAs.

        Args:
            shas: SHAs of the objects to return
            include_comp: If True, include compressed chunks
            allow_missing: If True, silently skip SHAs not in the container
            convert_ofs_delta: If True, rewrite offset deltas as ref deltas
        """
        raise NotImplementedError(self.iter_unpacked_subset)
class UnpackedObjectStream:
    """Abstract base class for a stream of unpacked objects."""

    def __iter__(self) -> Iterator["UnpackedObject"]:
        """Iterate over the UnpackedObject instances in this stream."""
        raise NotImplementedError(self.__iter__)

    def __len__(self) -> int:
        """Return the number of objects in this stream."""
        raise NotImplementedError(self.__len__)
def take_msb_bytes(
    read: Callable[[int], bytes], crc32: Optional[int] = None
) -> tuple[list[int], Optional[int]]:
    """Read a run of bytes whose most significant bit flags continuation.

    Bytes are consumed one at a time until a byte with the MSB cleared is
    seen; that final byte is included in the result.

    Args:
        read: Read function returning the requested number of bytes.
        crc32: Running CRC32 to update with the consumed bytes, or None to
            skip checksum computation.
    Returns: Tuple of (list of byte values read, updated CRC32 or None).
    """
    values: list[int] = []
    more = True
    while more:
        byte = read(1)
        if crc32 is not None:
            crc32 = binascii.crc32(byte, crc32)
        value = ord(byte[:1])
        values.append(value)
        more = bool(value & 0x80)
    return values, crc32
class PackFileDisappeared(Exception):
    """Raised when a pack file unexpectedly disappears."""

    def __init__(self, obj: object) -> None:
        """Remember which object noticed the disappearance.

        Args:
            obj: The pack-related object whose backing file vanished.
        """
        self.obj = obj
class UnpackedObject:
    """Class encapsulating an object unpacked from a pack file.

    These objects should only be created from within unpack_object. Most
    members start out as empty and are filled in at various points by
    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.

    End users of this object should take care that the function they're getting
    this object from is guaranteed to set the members they need.
    """

    __slots__ = [
        "_sha",  # Cached binary SHA.
        "comp_chunks",  # Compressed object chunks.
        "crc32",  # CRC32.
        "decomp_chunks",  # Decompressed object chunks.
        "decomp_len",  # Decompressed length of this object.
        "delta_base",  # Delta base offset or SHA.
        "obj_chunks",  # Decompressed and delta-resolved chunks.
        "obj_type_num",  # Type of this object.
        "offset",  # Offset in its pack.
        "pack_type_num",  # Type of this object in the pack (may be a delta).
    ]

    obj_type_num: Optional[int]
    obj_chunks: Optional[list[bytes]]
    delta_base: Union[None, bytes, int]
    decomp_chunks: list[bytes]
    comp_chunks: Optional[list[bytes]]
    decomp_len: Optional[int]
    crc32: Optional[int]
    offset: Optional[int]
    pack_type_num: int
    _sha: Optional[bytes]

    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
    # methods of this object.
    def __init__(
        self,
        pack_type_num: int,
        *,
        delta_base: Union[None, bytes, int] = None,
        decomp_len: Optional[int] = None,
        crc32: Optional[int] = None,
        sha: Optional[bytes] = None,
        decomp_chunks: Optional[list[bytes]] = None,
        offset: Optional[int] = None,
    ) -> None:
        """Initialize the unpacked object.

        Args:
            pack_type_num: Type of this object as stored in the pack
                (may be a delta type).
            delta_base: Delta base offset or SHA for delta objects.
            decomp_len: Decompressed length; derived from decomp_chunks
                when chunks are given but no length is.
            crc32: CRC32 of the compressed data, if known.
            sha: Pre-computed binary SHA, if known.
            decomp_chunks: Decompressed object chunks.
            offset: Offset of this object within its pack.
        """
        self.offset = offset
        self._sha = sha
        self.pack_type_num = pack_type_num
        self.delta_base = delta_base
        self.comp_chunks = None
        self.decomp_chunks: list[bytes] = decomp_chunks or []
        if decomp_chunks is not None and decomp_len is None:
            self.decomp_len = sum(map(len, decomp_chunks))
        else:
            self.decomp_len = decomp_len
        self.crc32 = crc32

        if pack_type_num in DELTA_TYPES:
            # Delta objects don't know their resolved type/content yet.
            self.obj_type_num = None
            self.obj_chunks = None
        else:
            self.obj_type_num = pack_type_num
            self.obj_chunks = self.decomp_chunks
            # NOTE: the original code redundantly re-assigned delta_base
            # here; it is already set unconditionally above.

    def sha(self) -> bytes:
        """Return the binary SHA of this object, computing and caching it."""
        if self._sha is None:
            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
        return self._sha

    def sha_file(self) -> "ShaFile":
        """Return a ShaFile from this object."""
        assert self.obj_type_num is not None and self.obj_chunks is not None
        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)

    # Only provided for backwards compatibility with code that expects either
    # chunks or a delta tuple.
    def _obj(self) -> "OldUnpackedObject":
        """Return the decompressed chunks, or (delta base, delta chunks)."""
        if self.pack_type_num in DELTA_TYPES:
            assert isinstance(self.delta_base, (bytes, int))
            return (self.delta_base, self.decomp_chunks)
        else:
            return self.decomp_chunks

    def __eq__(self, other: object) -> bool:
        """Compare all slots for equality with another UnpackedObject."""
        if not isinstance(other, UnpackedObject):
            return False
        for slot in self.__slots__:
            if getattr(self, slot) != getattr(other, slot):
                return False
        return True

    def __ne__(self, other: object) -> bool:
        """Check inequality with another UnpackedObject."""
        return not (self == other)

    def __repr__(self) -> str:
        """Return string representation of this UnpackedObject."""
        data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
        return "{}({})".format(self.__class__.__name__, ", ".join(data))
310_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
def read_zlib_chunks(
    read_some: Callable[[int], bytes],
    unpacked: UnpackedObject,
    include_comp: bool = False,
    buffer_size: int = _ZLIB_BUFSIZE,
) -> bytes:
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    Args:
        read_some: Read function that returns at least one byte, but may
            return less than the requested size.
        unpacked: An UnpackedObject to write result data to. If its crc32
            attr is not None, the CRC32 of the compressed bytes will be
            computed using this starting CRC32.
            After this function, will have the following attrs set:
            * comp_chunks (if include_comp is True)
            * decomp_chunks
            * decomp_len
            * crc32
        include_comp: If True, include compressed data in the result.
        buffer_size: Size of the read buffer.
    Returns: Leftover unused data from the decompression.

    Raises:
        zlib.error: if a decompression error occurred.
    """
    expected_len = unpacked.decomp_len
    if expected_len is None or expected_len < 0:
        raise ValueError("non-negative zlib data stream size expected")

    decompressor = zlib.decompressobj()
    raw_chunks: list[bytes] = []
    out_chunks = unpacked.decomp_chunks
    total_out = 0
    running_crc = unpacked.crc32

    while True:
        raw = read_some(buffer_size)
        if not raw:
            raise zlib.error("EOF before end of zlib stream")
        raw_chunks.append(raw)
        piece = decompressor.decompress(raw)
        total_out += len(piece)
        out_chunks.append(piece)
        trailing = decompressor.unused_data
        if trailing:
            # The final read overshot the zlib stream: only the consumed
            # prefix of this chunk belongs to the compressed object.
            consumed = raw[: -len(trailing)]
            if running_crc is not None:
                running_crc = binascii.crc32(consumed, running_crc)
            if include_comp:
                raw_chunks[-1] = consumed
            break
        if running_crc is not None:
            running_crc = binascii.crc32(raw, running_crc)

    if running_crc is not None:
        running_crc &= 0xFFFFFFFF
    if total_out != expected_len:
        raise zlib.error("decompressed data does not match expected size")

    unpacked.crc32 = running_crc
    if include_comp:
        unpacked.comp_chunks = raw_chunks
    return trailing
def iter_sha1(iter: Iterable[bytes]) -> bytes:
    """Return the hexdigest of the SHA1 over a set of names.

    Args:
        iter: Iterator over string objects
    Returns: 40-byte hex sha1 digest
    """
    digest = sha1()
    for chunk in iter:
        digest.update(chunk)
    return digest.hexdigest().encode("ascii")
def load_pack_index(path: Union[str, os.PathLike]) -> "PackIndex":
    """Load an index file by path.

    Args:
        path: Path to the index file
    Returns: A PackIndex loaded from the given path
    """
    # GitFile provides safe read semantics; format sniffing and version
    # dispatch happen in load_pack_index_file.
    with GitFile(path, "rb") as f:
        return load_pack_index_file(path, f)
def _load_file_contents(
    f: Union[IO[bytes], _GitFile], size: Optional[int] = None
) -> tuple[Union[bytes, Any], int]:
    """Load contents from a file, preferring mmap when possible.

    Args:
        f: File-like object to load
        size: Expected size, or None to determine from file
    Returns: Tuple of (contents, size)
    """
    try:
        fd = f.fileno()
    except (UnsupportedOperation, AttributeError):
        fd = None
    if fd is not None:
        if size is None:
            size = os.fstat(fd).st_size
        if has_mmap:
            # mmap can fail for sockets or odd descriptors; fall back to a
            # plain read() in that case.
            with suppress(OSError, ValueError):
                return mmap.mmap(fd, size, access=mmap.ACCESS_READ), size
    data = f.read()
    return data, len(data)
def load_pack_index_file(
    path: Union[str, os.PathLike], f: Union[IO[bytes], _GitFile]
) -> "PackIndex":
    """Load an index file from a file-like object.

    Args:
        path: Path for the index file
        f: File-like object
    Returns: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    if contents[:4] != b"\377tOc":
        # No magic prefix: version 1 indexes have no header at all.
        return PackIndex1(path, file=f, contents=contents, size=size)
    (version,) = struct.unpack(b">L", contents[4:8])
    if version == 2:
        return PackIndex2(path, file=f, contents=contents, size=size)
    if version == 3:
        return PackIndex3(path, file=f, contents=contents, size=size)
    raise KeyError(f"Unknown pack index format {version}")
def bisect_find_sha(
    start: int, end: int, sha: bytes, unpack_name: Callable[[int], bytes]
) -> Optional[int]:
    """Find a SHA in a data blob with sorted SHAs.

    Args:
        start: Start index of range to search
        end: End index of range to search (inclusive)
        sha: Sha to find
        unpack_name: Callback to retrieve SHA by index
    Returns: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    lo, hi = start, end
    while lo <= hi:
        mid = (lo + hi) // 2
        candidate = unpack_name(mid)
        if candidate == sha:
            return mid
        if candidate < sha:
            lo = mid + 1
        else:
            hi = mid - 1
    return None
484PackIndexEntry = tuple[bytes, int, Optional[int]]
class PackIndex:
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.
    """

    # Default to SHA-1 for backward compatibility
    hash_algorithm = 1
    hash_size = 20

    def __eq__(self, other: object) -> bool:
        """Compare indexes by the ordered object names they contain.

        Offsets and CRC32 values are intentionally ignored.
        """
        from itertools import zip_longest

        if not isinstance(other, PackIndex):
            return False

        # Use zip_longest rather than zip: zip would stop at the shorter
        # index and incorrectly report two indexes as equal whenever one
        # index's entries are a prefix of the other's.
        sentinel = (None, None, None)
        for (name1, _, _), (name2, _, _) in zip_longest(
            self.iterentries(), other.iterentries(), fillvalue=sentinel
        ):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other: object) -> bool:
        """Check if this pack index is not equal to another."""
        return not self.__eq__(other)

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        raise NotImplementedError(self.__len__)

    def __iter__(self) -> Iterator[bytes]:
        """Iterate over the hex SHAs in this pack."""
        return map(sha_to_hex, self._itersha())

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        raise NotImplementedError(self.iterentries)

    def get_pack_checksum(self) -> Optional[bytes]:
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest, or None if not available
        """
        raise NotImplementedError(self.get_pack_checksum)

    @replace_me(since="0.21.0", remove_in="0.23.0")
    def object_index(self, sha: bytes) -> int:
        """Deprecated alias for object_offset."""
        return self.object_offset(sha)

    def object_offset(self, sha: bytes) -> int:
        """Return the offset in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        raise NotImplementedError(self.object_offset)

    def object_sha1(self, index: int) -> bytes:
        """Return the SHA1 corresponding to the index in the pack file."""
        # Linear scan; subclasses with direct lookup tables should override.
        for name, offset, _crc32 in self.iterentries():
            if offset == index:
                return name
        else:
            raise KeyError(index)

    def _object_offset(self, sha: bytes) -> int:
        """See object_offset.

        Args:
            sha: A *binary* SHA string. (20 characters long)_
        """
        raise NotImplementedError(self._object_offset)

    def objects_sha1(self) -> bytes:
        """Return the hex SHA1 over all the shas of all objects in this pack.

        Note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def _itersha(self) -> Iterator[bytes]:
        """Yield all the SHA1's of the objects in the index, sorted."""
        raise NotImplementedError(self._itersha)

    def close(self) -> None:
        """Close any open files."""

    def check(self) -> None:
        """Check the consistency of this pack index."""
class MemoryPackIndex(PackIndex):
    """Pack index that is stored entirely in memory."""

    def __init__(
        self,
        entries: list[tuple[bytes, int, Optional[int]]],
        pack_checksum: Optional[bytes] = None,
    ) -> None:
        """Create a new MemoryPackIndex.

        Args:
            entries: Sequence of name, idx, crc32 (sorted)
            pack_checksum: Optional pack checksum
        """
        # Two lookup directions: name -> offset and offset -> name.
        self._by_sha = {name: offset for name, offset, _ in entries}
        self._by_offset = {offset: name for name, offset, _ in entries}
        self._entries = entries
        self._pack_checksum = pack_checksum

    def get_pack_checksum(self) -> Optional[bytes]:
        """Return the SHA checksum stored for the corresponding packfile."""
        return self._pack_checksum

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        return len(self._entries)

    def object_offset(self, sha: bytes) -> int:
        """Return the offset for the given SHA.

        Args:
            sha: SHA to look up (binary or hex)
        Returns: Offset in the pack file
        """
        key = hex_to_sha(sha) if len(sha) == 40 else sha
        return self._by_sha[key]

    def object_sha1(self, offset: int) -> bytes:
        """Return the SHA1 for the object at the given offset."""
        return self._by_offset[offset]

    def _itersha(self) -> Iterator[bytes]:
        """Iterate over all SHA1s in the index."""
        return iter(self._by_sha)

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over all index entries."""
        return iter(self._entries)

    @classmethod
    def for_pack(cls, pack_data: "PackData") -> "MemoryPackIndex":
        """Create a MemoryPackIndex from a PackData object."""
        return MemoryPackIndex(
            list(pack_data.sorted_entries()), pack_data.get_stored_checksum()
        )

    @classmethod
    def clone(cls, other_index: "PackIndex") -> "MemoryPackIndex":
        """Create a copy of another PackIndex in memory."""
        return cls(list(other_index.iterentries()), other_index.get_pack_checksum())
class FilePackIndex(PackIndex):
    """Pack index that is based on a file.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is
    present.
    """

    _fan_out_table: list[int]

    def __init__(
        self,
        filename: Union[str, os.PathLike],
        file: Optional[BinaryIO] = None,
        contents: Optional[Union[bytes, "mmap.mmap"]] = None,
        size: Optional[int] = None,
    ) -> None:
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.

        Args:
            filename: Path to the index file
            file: Optional already-open file object
            contents: Optional pre-loaded contents (bytes or mmap)
            size: Optional size of the contents
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, "rb")
        else:
            self._file = file
        if contents is None:
            self._contents, self._size = _load_file_contents(self._file, size)
        else:
            self._contents = contents
            self._size = size if size is not None else len(contents)

    @property
    def path(self) -> str:
        """Return the path to this index file."""
        return os.fspath(self._filename)

    def __eq__(self, other: object) -> bool:
        """Compare with another index, short-circuiting on the fan-out table."""
        # Quick optimization: differing fan-out tables mean differing entries.
        if (
            isinstance(other, FilePackIndex)
            and self._fan_out_table != other._fan_out_table
        ):
            return False

        return super().__eq__(other)

    def close(self) -> None:
        """Close the underlying file and any mmap."""
        self._file.close()
        close_fn = getattr(self._contents, "close", None)
        if close_fn is not None:
            close_fn()

    def __len__(self) -> int:
        """Return the number of entries in this pack index."""
        # The last fan-out slot counts all objects.
        return self._fan_out_table[-1]

    def _unpack_entry(self, i: int) -> PackIndexEntry:
        """Unpack the i-th entry in the index file.

        Returns: Tuple with object name (SHA), offset in pack file and CRC32
            checksum (if known).
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i) -> bytes:
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i) -> int:
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i) -> Optional[int]:
        """Unpack the crc32 checksum for the ith object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def _itersha(self) -> Iterator[bytes]:
        """Iterate over all SHA1s in the index."""
        for i in range(len(self)):
            yield self._unpack_name(i)

    def iterentries(self) -> Iterator[PackIndexEntry]:
        """Iterate over the entries in this pack index.

        Returns: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset: int) -> list[int]:
        """Read the fan-out table from the index.

        The fan-out table contains 256 entries mapping first byte values
        to the number of objects with SHA1s less than or equal to that byte.

        Args:
            start_offset: Offset in the file where the fan-out table starts
        Returns: List of 256 integers
        """
        ret = []
        for i in range(0x100):
            fanout_entry = self._contents[
                start_offset + i * 4 : start_offset + (i + 1) * 4
            ]
            ret.append(struct.unpack(">L", fanout_entry)[0])
        return ret

    def check(self) -> None:
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self) -> bytes:
        """Calculate the SHA1 checksum over this pack index.

        Returns: This is a 20-byte binary digest
        """
        # Everything except the trailing 20-byte checksum is covered.
        return sha1(self._contents[:-20]).digest()

    def get_pack_checksum(self) -> bytes:
        """Return the SHA1 checksum stored for the corresponding packfile.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-40:-20])

    def get_stored_checksum(self) -> bytes:
        """Return the SHA1 checksum stored for this index.

        Returns: 20-byte binary digest
        """
        return bytes(self._contents[-20:])

    def object_offset(self, sha: bytes) -> int:
        """Return the offset in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        try:
            return self._object_offset(sha)
        except ValueError as exc:
            # An mmap lookup on a closed/vanished file raises ValueError;
            # surface that as a pack disappearance so callers can retry.
            closed = getattr(self._contents, "closed", None)
            if closed in (None, True):
                raise PackFileDisappeared(self) from exc
            raise

    def _object_offset(self, sha: bytes) -> int:
        """See object_offset.

        Args:
            sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        idx = ord(sha[:1])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx - 1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)

    def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
        """Iterate over all SHA1s with the given prefix."""
        first_byte = ord(prefix[:1])
        if first_byte == 0:
            start = 0
        else:
            start = self._fan_out_table[first_byte - 1]
        # Entries whose SHA starts with first_byte occupy exactly
        # [fan_out[first_byte - 1], fan_out[first_byte]). The previous code
        # used fan_out[first_byte + 1], needlessly scanning the following
        # first-byte bucket as well (output was unchanged, but wasteful).
        # fan_out[0xFF] equals len(self), so no special case is needed.
        end = self._fan_out_table[first_byte]
        assert start <= end
        started = False
        for i in range(start, end):
            name: bytes = self._unpack_name(i)
            if name.startswith(prefix):
                yield name
                started = True
            elif started:
                # Matches are contiguous in a sorted table; stop early.
                break
class PackIndex1(FilePackIndex):
    """Version 1 Pack Index file."""

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v1 index; the fan-out table sits at the very start."""
        super().__init__(filename, file, contents, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _record_offset(self, i):
        """Byte offset of the i-th 24-byte (offset, sha) record."""
        return (0x100 * 4) + (i * 24)

    def _unpack_entry(self, i):
        """Unpack the i-th entry from the v1 index."""
        (offset, name) = unpack_from(">L20s", self._contents, self._record_offset(i))
        return (name, offset, None)

    def _unpack_name(self, i):
        """Unpack the i-th SHA1 from the v1 index."""
        base = self._record_offset(i) + 4
        return self._contents[base : base + 20]

    def _unpack_offset(self, i):
        """Unpack the i-th offset from the v1 index."""
        return unpack_from(">L", self._contents, self._record_offset(i))[0]

    def _unpack_crc32_checksum(self, i) -> None:
        """Return None as v1 indexes don't store CRC32 checksums."""
        return None
class PackIndex2(FilePackIndex):
    """Version 2 Pack Index file."""

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v2 index, validating the header and precomputing offsets."""
        super().__init__(filename, file, contents, size)
        if self._contents[:4] != b"\377tOc":
            raise AssertionError("Not a v2 pack index file")
        (self.version,) = unpack_from(b">L", self._contents, 4)
        if self.version != 2:
            raise AssertionError(f"Version was {self.version}")
        self._fan_out_table = self._read_fan_out_table(8)
        n = len(self)
        # Layout after the fan-out table: sorted 20-byte names, CRC32s,
        # 4-byte offsets, then 8-byte offsets for entries >= 2 GiB.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * n
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * n
        self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * n

    def _unpack_entry(self, i):
        """Unpack the i-th (name, offset, crc32) entry from the v2 index."""
        return (
            self._unpack_name(i),
            self._unpack_offset(i),
            self._unpack_crc32_checksum(i),
        )

    def _unpack_name(self, i):
        """Unpack the i-th SHA1 from the v2 index."""
        start = self._name_table_offset + i * 20
        return self._contents[start : start + 20]

    def _unpack_offset(self, i):
        """Unpack the i-th offset from the v2 index.

        Handles large offsets (>2GB) by reading from the large offset table.
        """
        (value,) = unpack_from(
            ">L", self._contents, self._pack_offset_table_offset + i * 4
        )
        if not value & (2**31):
            return value
        # High bit set: the low 31 bits index the 8-byte large-offset table.
        large = self._pack_offset_largetable_offset + (value & (2**31 - 1)) * 8
        return unpack_from(">Q", self._contents, large)[0]

    def _unpack_crc32_checksum(self, i):
        """Unpack the i-th CRC32 checksum from the v2 index."""
        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
class PackIndex3(FilePackIndex):
    """Version 3 Pack Index file.

    Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
    """

    def __init__(
        self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
    ) -> None:
        """Open a v3 index, reading its hash parameters and table offsets."""
        super().__init__(filename, file, contents, size)
        if self._contents[:4] != b"\377tOc":
            raise AssertionError("Not a v3 pack index file")
        (self.version,) = unpack_from(b">L", self._contents, 4)
        if self.version != 3:
            raise AssertionError(f"Version was {self.version}")

        # Hash algorithm identifier: 1 = SHA-1, 2 = SHA-256.
        (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
        if self.hash_algorithm == 1:
            self.hash_size = 20  # SHA-1
        elif self.hash_algorithm == 2:
            self.hash_size = 32  # SHA-256
        else:
            raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")

        # Length of shortened object names.
        (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)

        # The fan-out table follows the 16-byte header (magic, version,
        # hash algorithm, shortened-OID length: 4 bytes each).
        self._fan_out_table = self._read_fan_out_table(16)
        n = len(self)
        self._name_table_offset = 16 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + self.hash_size * n
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * n
        self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * n

    def _unpack_entry(self, i):
        """Unpack the i-th (name, offset, crc32) entry from the v3 index."""
        return (
            self._unpack_name(i),
            self._unpack_offset(i),
            self._unpack_crc32_checksum(i),
        )

    def _unpack_name(self, i):
        """Unpack the i-th object name (hash_size bytes) from the v3 index."""
        start = self._name_table_offset + i * self.hash_size
        return self._contents[start : start + self.hash_size]

    def _unpack_offset(self, i):
        """Unpack the i-th offset, following the large-offset table if needed."""
        (value,) = unpack_from(
            ">L", self._contents, self._pack_offset_table_offset + i * 4
        )
        if not value & (2**31):
            return value
        large = self._pack_offset_largetable_offset + (value & (2**31 - 1)) * 8
        return unpack_from(">Q", self._contents, large)[0]

    def _unpack_crc32_checksum(self, i):
        """Unpack the i-th CRC32 checksum from the v3 index."""
        return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
def read_pack_header(read) -> tuple[int, int]:
    """Read the header of a pack file.

    Args:
        read: Read function
    Returns: Tuple of (pack version, number of objects)

    Raises:
        AssertionError: if no data is available, the magic is not b"PACK",
            or the version is not 2 or 3. (The previous docstring claimed
            (None, None) was returned on empty input; the code has always
            raised instead.)
    """
    header = read(12)
    if not header:
        raise AssertionError("file too short to contain pack")
    if header[:4] != b"PACK":
        raise AssertionError(f"Invalid pack header {header!r}")
    (version,) = unpack_from(b">L", header, 4)
    if version not in (2, 3):
        raise AssertionError(f"Version was {version}")
    (num_objects,) = unpack_from(b">L", header, 8)
    return (version, num_objects)
def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
    """Get the total length of a sequence of chunks.

    Args:
        chunks: Either a single bytes object or an iterable of bytes
    Returns: Total length in bytes
    """
    if isinstance(chunks, bytes):
        return len(chunks)
    return sum(len(chunk) for chunk in chunks)
def unpack_object(
    read_all: Callable[[int], bytes],
    read_some: Optional[Callable[[int], bytes]] = None,
    compute_crc32=False,
    include_comp=False,
    zlib_bufsize=_ZLIB_BUFSIZE,
) -> tuple[UnpackedObject, bytes]:
    """Unpack a Git object.

    Args:
      read_all: Read function that blocks until the number of requested
        bytes are read.
      read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
      compute_crc32: If True, compute the CRC32 of the compressed data. If
        False, the returned CRC32 will be None.
      include_comp: If True, include compressed data in the result.
      zlib_bufsize: An optional buffer size for zlib operations.
    Returns: A tuple of (unpacked, unused), where unused is the unused data
      leftover from decompression, and unpacked is an UnpackedObject with
      the following attrs set:

      * obj_chunks (for non-delta types)
      * pack_type_num
      * delta_base (for delta types)
      * comp_chunks (if include_comp is True)
      * decomp_chunks
      * decomp_len
      * crc32 (if compute_crc32 is True)
    """
    if read_some is None:
        read_some = read_all
    if compute_crc32:
        crc32 = 0
    else:
        crc32 = None

    raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
    # First header byte: object type lives in bits 4-6, and bits 0-3 are
    # the lowest 4 bits of the uncompressed size.
    type_num = (raw[0] >> 4) & 0x07
    size = raw[0] & 0x0F
    # Each continuation byte contributes 7 more (higher-order) size bits.
    for i, byte in enumerate(raw[1:]):
        size += (byte & 0x7F) << ((i * 7) + 4)

    delta_base: Union[int, bytes, None]
    # NOTE(review): raw_base accumulates the header length but is never read
    # after this function — appears vestigial; confirm before removing.
    raw_base = len(raw)
    if type_num == OFS_DELTA:
        raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
        raw_base += len(raw)
        # take_msb_bytes stops after the first byte without the MSB set, so
        # the final byte must never have it set.
        if raw[-1] & 0x80:
            raise AssertionError
        # Git's negative-offset encoding: big-endian base-128 with an extra
        # +1 per continuation byte to exclude redundant encodings.
        delta_base_offset = raw[0] & 0x7F
        for byte in raw[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += byte & 0x7F
        delta_base = delta_base_offset
    elif type_num == REF_DELTA:
        # Ref deltas identify their base by its 20-byte binary SHA, which is
        # also folded into the CRC32 of the entry.
        delta_base_obj = read_all(20)
        if crc32 is not None:
            crc32 = binascii.crc32(delta_base_obj, crc32)
        delta_base = delta_base_obj
        raw_base += 20
    else:
        delta_base = None

    unpacked = UnpackedObject(
        type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
    )
    # Decompress the payload; any bytes read past the end of the zlib stream
    # are returned to the caller as `unused`.
    unused = read_zlib_chunks(
        read_some,
        unpacked,
        buffer_size=zlib_bufsize,
        include_comp=include_comp,
    )
    return unpacked, unused
def _compute_object_size(value):
    """Compute the size of an unresolved object for use with LRUSizeCache.

    Args:
      value: Tuple of (type_num, object_chunks)
    Returns: Size in bytes
    """
    type_num, obj = value
    # Delta entries carry (base, delta_chunks); only the delta payload counts.
    if type_num in DELTA_TYPES:
        return chunks_length(obj[1])
    return chunks_length(obj)
class PackStreamReader:
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
        """Initialize the reader.

        Args:
          read_all: Read function that blocks until the requested number of
            bytes are read.
          read_some: Read function that returns at least one byte but may
            return fewer than requested; defaults to read_all.
          zlib_bufsize: Buffer size for zlib decompression.
        """
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        else:
            self.read_some = read_some
        self.sha = sha1()
        self._offset = 0
        self._rbuf = BytesIO()
        # trailer is a deque to avoid memory allocation on small reads
        # (elements are ints, since iterating bytes yields ints)
        self._trailer: deque[int] = deque()
        self._zlib_bufsize = zlib_bufsize

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read).

        Args:
          read: The read callback to read from.
          size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        Returns: Bytes read
        """
        data = read(size)

        # maintain a trailer of the last 20 bytes we've read
        n = len(data)
        self._offset += n
        tn = len(self._trailer)
        if n >= 20:
            # Everything previously buffered is definitely not the trailer.
            to_pop = tn
            to_add = 20
        else:
            # Keep at most 20 buffered bytes; hash whatever falls off.
            to_pop = max(n + tn - 20, 0)
            to_add = n
        self.sha.update(
            bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
        )
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])
        return data

    def _buf_len(self):
        """Get the number of bytes in the read buffer."""
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        end = buf.tell()
        buf.seek(start)
        return end - start

    @property
    def offset(self):
        """Return the current offset in the pack stream."""
        # Bytes sitting in the read buffer have been consumed from the wire
        # but not yet from the stream's perspective.
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        if buf_len >= size:
            return self._rbuf.read(size)
        buf_data = self._rbuf.read()
        self._rbuf = BytesIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        if buf_len:
            # Serve from the buffer first; may return fewer than size bytes.
            data = self._rbuf.read(size)
            if size >= buf_len:
                self._rbuf = BytesIO()
            return data
        return self._read(self.read_some, size)

    def __len__(self) -> int:
        # _num_objects is only set once read_objects() has parsed the header.
        return self._num_objects

    def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
        """Read the objects in this pack file.

        Args:
          compute_crc32: If True, compute the CRC32 of the compressed
            data. If False, the returned CRC32 will be None.
        Returns: Iterator over UnpackedObjects with the following members set:
            offset
            obj_type_num
            obj_chunks (for non-delta types)
            delta_base (for delta types)
            decomp_chunks
            decomp_len
            crc32 (if compute_crc32 is True)

        Raises:
          ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
          zlib.error: if an error occurred during zlib decompression.
          IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)

        for _ in range(self._num_objects):
            offset = self.offset
            unpacked, unused = unpack_object(
                self.read,
                read_some=self.recv,
                compute_crc32=compute_crc32,
                zlib_bufsize=self._zlib_bufsize,
            )
            unpacked.offset = offset

            # prepend any unused data to current read buffer
            buf = BytesIO()
            buf.write(unused)
            buf.write(self._rbuf.read())
            buf.seek(0)
            self._rbuf = buf

            yield unpacked

        if self._buf_len() < 20:
            # If the read buffer is full, then the last read() got the whole
            # trailer off the wire. If not, it means there is still some of the
            # trailer to read. We need to read() all 20 bytes; N come from the
            # read buffer and (20 - N) come from the wire.
            self.read(20)

        pack_sha = bytearray(self._trailer)  # type: ignore
        if pack_sha != self.sha.digest():
            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
class PackStreamCopier(PackStreamReader):
    """Class to verify a pack stream as it is being read.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate and written out to the given file-like object.
    """

    def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
        """Initialize the copier.

        Args:
          read_all: Read function that blocks until the number of
            requested bytes are read.
          read_some: Read function that returns at least one byte, but may
            not return the number of bytes requested.
          outfile: File-like object to write output through.
          delta_iter: Optional DeltaChainIterator to record deltas as we
            read them.
        """
        super().__init__(read_all, read_some=read_some)
        self.outfile = outfile
        self._delta_iter = delta_iter

    def _read(self, read, size):
        """Read data from the read callback and write it to the file.

        Args:
          read: Read callback function
          size: Number of bytes to read
        Returns: Data read
        """
        data = super()._read(read, size)
        self.outfile.write(data)
        return data

    def verify(self, progress=None) -> None:
        """Verify a pack stream and write it to the output file.

        See PackStreamReader.iterobjects for a list of exceptions this may
        throw.

        Args:
          progress: Optional progress callback invoked with bytes messages.
        """
        count = 0  # number of entries copied so far
        for i, unpacked in enumerate(self.read_objects()):
            if self._delta_iter:
                self._delta_iter.record(unpacked)
            if progress is not None:
                progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
            count = i + 1
        if progress is not None:
            # Fix: previously reported the last *index* (count - 1), under-
            # counting by one whenever at least one entry was copied.
            progress(f"copied {count} pack entries\n".encode("ascii"))
def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks.

    Args:
      type: Numeric type of the object
      chunks: Object data as bytes or iterable of bytes
    Returns: SHA-1 digest (20 bytes)
    """
    digest = sha1()
    digest.update(object_header(type, chunks_length(chunks)))
    if isinstance(chunks, bytes):
        digest.update(chunks)
    else:
        for piece in chunks:
            digest.update(piece)
    return digest.digest()
def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
    """Hash a portion of a file into a new SHA.

    Args:
      f: A file-like object to read from that supports seek().
      start_ofs: The offset in the file to start reading at.
      end_ofs: The offset in the file to end reading at, relative to the
        end of the file.
      buffer_size: A buffer size for reading.
    Returns: A new SHA object updated with data read from the file.
    Raises:
      AssertionError: if the requested range lies outside the file, or the
        file yields fewer bytes than expected.
    """
    sha = sha1()
    f.seek(0, SEEK_END)
    length = f.tell()
    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
        raise AssertionError(
            f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
        )
    todo = length + end_ofs - start_ofs
    f.seek(start_ofs)
    # `> 0` (not truthiness) so an inconsistent negative count cannot loop.
    while todo > 0:
        data = f.read(min(todo, buffer_size))
        if not data:
            # Fix: a short read at EOF (e.g. file shrank, or start_ofs past
            # the end with end_ofs >= 0) previously spun forever here.
            raise AssertionError(f"EOF reached with {todo} bytes left to hash")
        sha.update(data)
        todo -= len(data)
    return sha
class PackData:
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(
        self,
        filename: Union[str, os.PathLike],
        file=None,
        size=None,
        *,
        delta_window_size=None,
        window_memory=None,
        delta_cache_size=None,
        depth=None,
        threads=None,
        big_file_threshold=None,
    ) -> None:
        """Create a PackData object representing the pack in the given filename.

        The file must exist and stay readable until the object is disposed of.
        It must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        # Pack-writing tuning knobs; stored but not interpreted here.
        self.delta_window_size = delta_window_size
        self.window_memory = window_memory
        self.delta_cache_size = delta_cache_size
        self.depth = depth
        self.threads = threads
        self.big_file_threshold = big_file_threshold

        if file is None:
            self._file = GitFile(self._filename, "rb")
        else:
            self._file = file
        # Reading the header also validates the magic/version and positions
        # the file just past the 12-byte header.
        (version, self._num_objects) = read_pack_header(self._file.read)

        # Use delta_cache_size config if available, otherwise default
        cache_size = delta_cache_size or (1024 * 1024 * 20)
        self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
            cache_size, compute_size=_compute_object_size
        )

    @property
    def filename(self):
        """Return the base name of the pack file."""
        return os.path.basename(self._filename)

    @property
    def path(self):
        """Return the full path of the pack file."""
        return self._filename

    @classmethod
    def from_file(cls, file, size=None):
        """Create a PackData object from an open file object."""
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path: Union[str, os.PathLike]):
        """Create a PackData object from a path to a pack file."""
        return cls(filename=path)

    def close(self) -> None:
        """Close the underlying pack file."""
        self._file.close()

    def __enter__(self):
        """Enter a context manager; returns self."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager, closing the pack file."""
        self.close()

    def __eq__(self, other):
        """Check equality based on pack checksum."""
        # NOTE(review): __eq__ without __hash__ makes instances unhashable —
        # confirm PackData objects are never used in sets or as dict keys.
        if isinstance(other, PackData):
            return self.get_stored_checksum() == other.get_stored_checksum()
        return False

    def _get_size(self):
        """Get the size of the pack file.

        Returns: Size in bytes
        Raises: AssertionError if file is too small to be a pack
        """
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
            raise AssertionError(errmsg)
        return self._size

    def __len__(self) -> int:
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        Returns: 20-byte binary SHA1 digest
        """
        # Hash everything except the 20-byte trailing checksum itself.
        return compute_file_sha(self._file, end_ofs=-20).digest()

    def iter_unpacked(self, *, include_comp: bool = False):
        """Iterate over unpacked objects in the pack.

        Args:
          include_comp: If True, include compressed object data
        Yields: UnpackedObject instances
        """
        self._file.seek(self._header_size)

        if self._num_objects is None:
            return

        for _ in range(self._num_objects):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=False, include_comp=include_comp
            )
            unpacked.offset = offset
            yield unpacked
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def iterentries(
        self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
    ):
        """Yield entries summarizing the contents of this pack.

        Args:
          progress: Progress function, called with current and total
            object count.
          resolve_ext_ref: Optional function to resolve external references.
        Returns: iterator of tuples with (sha, offset, crc32)
        """
        num_objects = self._num_objects
        indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
        for i, result in enumerate(indexer):
            if progress is not None:
                progress(i, num_objects)
            yield result

    def sorted_entries(
        self,
        progress: Optional[ProgressFn] = None,
        resolve_ext_ref: Optional[ResolveExtRefFn] = None,
    ):
        """Return entries in this pack, sorted by SHA.

        Args:
          progress: Progress function, called with current and total
            object count
        Returns: Iterator of tuples with (sha, offset, crc32)
        """
        return sorted(
            self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
        )

    def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
        """Create a version 1 file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v1(f, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
        """Create a version 2 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v2(f, entries, self.calculate_checksum())

    def create_index_v3(
        self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
    ):
        """Create a version 3 index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          resolve_ext_ref: Function to resolve external references
          hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
        Returns: Checksum of index file
        """
        entries = self.sorted_entries(
            progress=progress, resolve_ext_ref=resolve_ext_ref
        )
        with GitFile(filename, "wb") as f:
            return write_pack_index_v3(
                f, entries, self.calculate_checksum(), hash_algorithm
            )

    def create_index(
        self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
    ):
        """Create an index file for this data file.

        Args:
          filename: Index filename.
          progress: Progress report function
          version: Index version (1, 2, or 3)
          resolve_ext_ref: Function to resolve external references
          hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
        Returns: Checksum of index file
        Raises: ValueError for an unknown index version
        """
        if version == 1:
            return self.create_index_v1(
                filename, progress, resolve_ext_ref=resolve_ext_ref
            )
        elif version == 2:
            return self.create_index_v2(
                filename, progress, resolve_ext_ref=resolve_ext_ref
            )
        elif version == 3:
            return self.create_index_v3(
                filename,
                progress,
                resolve_ext_ref=resolve_ext_ref,
                hash_algorithm=hash_algorithm,
            )
        else:
            raise ValueError(f"unknown index format {version}")

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        # The last 20 bytes of a pack file are its SHA-1 trailer.
        self._file.seek(-20, SEEK_END)
        return self._file.read(20)

    def check(self) -> None:
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def get_unpacked_object_at(
        self, offset: int, *, include_comp: bool = False
    ) -> UnpackedObject:
        """Given offset in the packfile return a UnpackedObject."""
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
        unpacked.offset = offset
        return unpacked

    def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        try:
            return self._offset_cache[offset]
        except KeyError:
            pass
        unpacked = self.get_unpacked_object_at(offset, include_comp=False)
        return (unpacked.pack_type_num, unpacked._obj())
# Result type yielded by DeltaChainIterator subclasses (see _result below).
T = TypeVar("T")
class DeltaChainIterator(Generic[T]):
    """Abstract iterator over pack data based on delta chains.

    Each object in the pack is guaranteed to be inflated exactly once,
    regardless of how many objects reference it as a delta base. As a result,
    memory usage is proportional to the length of the longest delta chain.

    Subclasses can override _result to define the result type of the iterator.
    By default, results are UnpackedObjects with the following members set:

    * offset
    * obj_type_num
    * obj_chunks
    * pack_type_num
    * delta_base (for delta types)
    * comp_chunks (if _include_comp is True)
    * decomp_chunks
    * decomp_len
    * crc32 (if _compute_crc32 is True)
    """

    _compute_crc32 = False
    _include_comp = False

    def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
        """Initialize the iterator.

        Args:
          file_obj: Pack data file object (may be None if set later via
            set_pack_data).
          resolve_ext_ref: Optional callback resolving an external base SHA
            to a (type_num, chunks) pair.
        """
        self._file = file_obj
        self._resolve_ext_ref = resolve_ext_ref
        # base offset -> offsets of ofs-deltas waiting for that base
        self._pending_ofs: dict[int, list[int]] = defaultdict(list)
        # base SHA -> offsets of ref-deltas waiting for that base
        self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
        # (offset, type_num) of non-delta objects: the chain roots
        self._full_ofs: list[tuple[int, int]] = []
        # base SHAs that were resolved outside this pack
        self._ext_refs: list[bytes] = []

    @classmethod
    def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
        """Create an iterator covering every object in pack_data."""
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack_data)
        for unpacked in pack_data.iter_unpacked(include_comp=False):
            walker.record(unpacked)
        return walker

    @classmethod
    def for_pack_subset(
        cls,
        pack: "Pack",
        shas: Iterable[bytes],
        *,
        allow_missing: bool = False,
        resolve_ext_ref=None,
    ):
        """Create an iterator over the given SHAs plus their delta bases.

        Args:
          pack: Pack to read from.
          shas: Iterable of binary object SHAs to include.
          allow_missing: If True, silently skip SHAs absent from the pack.
          resolve_ext_ref: Optional external-base resolver.
        """
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack.data)
        todo = set()
        for sha in shas:
            assert isinstance(sha, bytes)
            try:
                off = pack.index.object_offset(sha)
            except KeyError:
                if not allow_missing:
                    raise
            else:
                todo.add(off)
        # Transitively pull in every delta base reachable from the subset.
        done = set()
        while todo:
            off = todo.pop()
            unpacked = pack.data.get_unpacked_object_at(off)
            walker.record(unpacked)
            done.add(off)
            base_ofs = None
            if unpacked.pack_type_num == OFS_DELTA:
                assert unpacked.offset is not None
                assert unpacked.delta_base is not None
                assert isinstance(unpacked.delta_base, int)
                base_ofs = unpacked.offset - unpacked.delta_base
            elif unpacked.pack_type_num == REF_DELTA:
                # Base may live outside the pack; missing keys are handled
                # later by the ext-ref machinery.
                with suppress(KeyError):
                    assert isinstance(unpacked.delta_base, bytes)
                    # NOTE(review): object_offset is used above but
                    # object_index here — confirm both exist on the index
                    # (object_index may be a legacy alias).
                    base_ofs = pack.index.object_index(unpacked.delta_base)
            if base_ofs is not None and base_ofs not in done:
                todo.add(base_ofs)
        return walker

    def record(self, unpacked: UnpackedObject) -> None:
        """File an object under the chain root or base it depends on."""
        type_num = unpacked.pack_type_num
        offset = unpacked.offset
        assert offset is not None
        if type_num == OFS_DELTA:
            assert unpacked.delta_base is not None
            assert isinstance(unpacked.delta_base, int)
            # delta_base is a backwards distance, not an absolute offset.
            base_offset = offset - unpacked.delta_base
            self._pending_ofs[base_offset].append(offset)
        elif type_num == REF_DELTA:
            assert isinstance(unpacked.delta_base, bytes)
            self._pending_ref[unpacked.delta_base].append(offset)
        else:
            self._full_ofs.append((offset, type_num))

    def set_pack_data(self, pack_data: PackData) -> None:
        """Use pack_data's underlying file for subsequent object reads."""
        self._file = pack_data._file

    def _walk_all_chains(self):
        """Yield results for every recorded chain, roots first."""
        for offset, type_num in self._full_ofs:
            yield from self._follow_chain(offset, type_num, None)
        yield from self._walk_ref_chains()
        assert not self._pending_ofs, repr(self._pending_ofs)

    def _ensure_no_pending(self) -> None:
        """Raise UnresolvedDeltas if any ref-delta base was never found."""
        if self._pending_ref:
            raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])

    def _walk_ref_chains(self):
        """Resolve ref-delta chains via the external-reference callback."""
        if not self._resolve_ext_ref:
            self._ensure_no_pending()
            return

        for base_sha, pending in sorted(self._pending_ref.items()):
            # Entries may have been popped by _follow_chain while we iterate
            # over this sorted snapshot; skip those.
            if base_sha not in self._pending_ref:
                continue
            try:
                type_num, chunks = self._resolve_ext_ref(base_sha)
            except KeyError:
                # Not an external ref, but may depend on one. Either it will
                # get popped via a _follow_chain call, or we will raise an
                # error below.
                continue
            self._ext_refs.append(base_sha)
            self._pending_ref.pop(base_sha)
            for new_offset in pending:
                yield from self._follow_chain(new_offset, type_num, chunks)

        self._ensure_no_pending()

    def _result(self, unpacked: UnpackedObject) -> T:
        """Convert an UnpackedObject to this iterator's result type."""
        raise NotImplementedError

    def _resolve_object(
        self, offset: int, obj_type_num: int, base_chunks: Optional[list[bytes]]
    ) -> UnpackedObject:
        """Inflate the object at offset, applying base_chunks if a delta."""
        self._file.seek(offset)
        unpacked, _ = unpack_object(
            self._file.read,
            include_comp=self._include_comp,
            compute_crc32=self._compute_crc32,
        )
        unpacked.offset = offset
        if base_chunks is None:
            assert unpacked.pack_type_num == obj_type_num
        else:
            assert unpacked.pack_type_num in DELTA_TYPES
            unpacked.obj_type_num = obj_type_num
            unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
        return unpacked

    def _follow_chain(
        self, offset: int, obj_type_num: int, base_chunks: Optional[list[bytes]]
    ):
        """Yield results for the object at offset and everything based on it."""
        # Unlike PackData.get_object_at, there is no need to cache offsets as
        # this approach by design inflates each object exactly once.
        todo = [(offset, obj_type_num, base_chunks)]
        while todo:
            (offset, obj_type_num, base_chunks) = todo.pop()
            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
            yield self._result(unpacked)

            assert unpacked.offset is not None
            # Deltas waiting on this object (by offset or by SHA) can now
            # be resolved against its inflated chunks.
            unblocked = chain(
                self._pending_ofs.pop(unpacked.offset, []),
                self._pending_ref.pop(unpacked.sha(), []),
            )
            todo.extend(
                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)  # type: ignore
                for new_offset in unblocked
            )

    def __iter__(self) -> Iterator[T]:
        """Iterate over all results, inflating each object exactly once."""
        return self._walk_all_chains()

    def ext_refs(self):
        """Return the base SHAs that were resolved outside this pack."""
        return self._ext_refs
class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
    """Delta chain iterator that yields the UnpackedObject instances themselves."""

    def _result(self, unpacked):
        """Pass each resolved object through unchanged."""
        return unpacked
class PackIndexer(DeltaChainIterator[PackIndexEntry]):
    """Delta chain iterator that yields index entries."""

    # Index files store a CRC32 per entry, so force CRC computation on.
    _compute_crc32 = True

    def _result(self, unpacked):
        """Convert a resolved object into a (sha, offset, crc32) entry."""
        return (unpacked.sha(), unpacked.offset, unpacked.crc32)
class PackInflater(DeltaChainIterator[ShaFile]):
    """Delta chain iterator that yields fully inflated ShaFile objects."""

    def _result(self, unpacked):
        """Materialize each resolved object as a ShaFile."""
        return unpacked.sha_file()
class SHA1Reader(BinaryIO):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f) -> None:
        self.f = f
        self.sha1 = sha1(b"")

    def read(self, size: int = -1) -> bytes:
        """Read from the wrapped file, folding the bytes into the digest."""
        chunk = self.f.read(size)
        self.sha1.update(chunk)
        return chunk

    def check_sha(self, allow_empty: bool = False) -> None:
        """Compare the accumulated digest against the trailing stored digest.

        Args:
          allow_empty: Accept an all-zero stored hash (written when the git
            option index.skipHash is set).
        Raises:
          ChecksumMismatch: if the digests disagree.
        """
        stored = self.f.read(20)
        if stored == self.sha1.digest():
            return
        zero_hex = b"0000000000000000000000000000000000000000"
        if allow_empty and sha_to_hex(stored) == zero_hex:
            return
        raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        """Close the wrapped file."""
        return self.f.close()

    def tell(self) -> int:
        """Return the wrapped file's position."""
        return self.f.tell()

    # BinaryIO abstract methods
    def readable(self) -> bool:
        return True

    def writable(self) -> bool:
        return False

    def seekable(self) -> bool:
        probe = getattr(self.f, "seekable", None)
        return False if probe is None else probe()

    def seek(self, offset: int, whence: int = 0) -> int:
        return self.f.seek(offset, whence)

    def flush(self) -> None:
        if hasattr(self.f, "flush"):
            self.f.flush()

    def readline(self, size: int = -1) -> bytes:
        return self.f.readline(size)

    def readlines(self, hint: int = -1) -> list[bytes]:
        return self.f.readlines(hint)

    def writelines(self, lines) -> None:
        raise UnsupportedOperation("writelines")

    def write(self, data) -> int:
        raise UnsupportedOperation("write")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def fileno(self) -> int:
        return self.f.fileno()

    def isatty(self) -> bool:
        probe = getattr(self.f, "isatty", None)
        return False if probe is None else probe()

    def truncate(self, size: Optional[int] = None) -> int:
        raise UnsupportedOperation("truncate")
class SHA1Writer(BinaryIO):
    """Wrapper for file-like object that remembers the SHA1 of its data."""

    def __init__(self, f) -> None:
        self.f = f
        self.length = 0
        self.sha1 = sha1(b"")

    def write(self, data) -> int:
        """Write data through, updating the digest and running byte count."""
        self.sha1.update(data)
        self.f.write(data)
        written = len(data)
        self.length += written
        return written

    def write_sha(self):
        """Append the 20-byte SHA-1 trailer to the file and return it."""
        digest = self.sha1.digest()
        assert len(digest) == 20
        self.f.write(digest)
        self.length += len(digest)
        return digest

    def close(self):
        """Write the SHA-1 trailer, close the file, and return the digest."""
        digest = self.write_sha()
        self.f.close()
        return digest

    def offset(self):
        """Return the number of bytes written, including any trailer."""
        return self.length

    def tell(self) -> int:
        """Return the wrapped file's position."""
        return self.f.tell()

    # BinaryIO abstract methods
    def readable(self) -> bool:
        return False

    def writable(self) -> bool:
        return True

    def seekable(self) -> bool:
        probe = getattr(self.f, "seekable", None)
        return False if probe is None else probe()

    def seek(self, offset: int, whence: int = 0) -> int:
        return self.f.seek(offset, whence)

    def flush(self) -> None:
        if hasattr(self.f, "flush"):
            self.f.flush()

    def readline(self, size: int = -1) -> bytes:
        raise UnsupportedOperation("readline")

    def readlines(self, hint: int = -1) -> list[bytes]:
        raise UnsupportedOperation("readlines")

    def writelines(self, lines) -> None:
        for chunk in lines:
            self.write(chunk)

    def read(self, size: int = -1) -> bytes:
        raise UnsupportedOperation("read")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self) -> bytes:
        raise UnsupportedOperation("__next__")

    def fileno(self) -> int:
        return self.f.fileno()

    def isatty(self) -> bool:
        probe = getattr(self.f, "isatty", None)
        return False if probe is None else probe()

    def truncate(self, size: Optional[int] = None) -> int:
        raise UnsupportedOperation("truncate")
def pack_object_header(type_num, delta_base, size):
    """Create a pack object header for the given object info.

    Args:
      type_num: Numeric type of the object.
      delta_base: Delta base offset or ref, or None for whole objects.
      size: Uncompressed object size.
    Returns: A header for a packed object.
    """
    header = []
    # Size/type varint: first byte holds the type in bits 4-6 and the low 4
    # bits of the size; each continuation byte carries 7 more size bits, with
    # the MSB set while more bytes follow.
    c = (type_num << 4) | (size & 15)
    size >>= 4
    while size:
        header.append(c | 0x80)
        c = size & 0x7F
        size >>= 7
    header.append(c)
    if type_num == OFS_DELTA:
        # Offset-delta base: big-endian base-128, built least-significant
        # group first and prepended, with git's "-1 per continuation byte"
        # twist that rules out redundant encodings.
        ret = [delta_base & 0x7F]
        delta_base >>= 7
        while delta_base:
            delta_base -= 1
            ret.insert(0, 0x80 | (delta_base & 0x7F))
            delta_base >>= 7
        header.extend(ret)
    elif type_num == REF_DELTA:
        # Ref-delta base: the raw 20-byte binary SHA of the base object.
        assert len(delta_base) == 20
        header += delta_base
    return bytearray(header)
def pack_object_chunks(type, object, compression_level=-1):
    """Generate chunks for a pack object.

    Args:
      type: Numeric type of the object
      object: Object to write
      compression_level: the zlib compression level
    Returns: Chunks
    """
    delta_base = None
    if type in DELTA_TYPES:
        delta_base, object = object
    payload = [object] if isinstance(object, bytes) else object
    total_size = sum(len(chunk) for chunk in payload)
    yield bytes(pack_object_header(type, delta_base, total_size))
    compressor = zlib.compressobj(level=compression_level)
    for chunk in payload:
        yield compressor.compress(chunk)
    yield compressor.flush()
def write_pack_object(write, type, object, sha=None, compression_level=-1):
    """Write pack object to a file.

    Args:
      write: Write function to use
      type: Numeric type of the object
      object: Object to write
      sha: Optional hash object to update with every written chunk
      compression_level: the zlib compression level
    Returns: CRC32 checksum of the written object data (masked to 32 bits)
    """
    crc32 = 0
    for chunk in pack_object_chunks(type, object, compression_level=compression_level):
        write(chunk)
        if sha is not None:
            sha.update(chunk)
        crc32 = binascii.crc32(chunk, crc32)
    return crc32 & 0xFFFFFFFF
def write_pack(
    filename,
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    deltify: Optional[bool] = None,
    delta_window_size: Optional[int] = None,
    compression_level: int = -1,
):
    """Write a new pack data file.

    Args:
      filename: Path to the new pack file (without .pack extension)
      objects: Objects (or (object, path) tuples) to write
      delta_window_size: Delta window size
      deltify: Whether to deltify pack objects
      compression_level: the zlib compression level
    Returns: Tuple with checksum of pack file and index file
    """
    with GitFile(filename + ".pack", "wb") as pack_f:
        entries, data_sum = write_pack_objects(
            pack_f.write,
            objects,
            delta_window_size=delta_window_size,
            deltify=deltify,
            compression_level=compression_level,
        )
    # Flatten {sha: (offset, crc32)} into sorted (sha, offset, crc32) rows.
    index_entries = sorted(
        (sha, offset, crc32) for (sha, (offset, crc32)) in entries.items()
    )
    with GitFile(filename + ".idx", "wb") as idx_f:
        return data_sum, write_pack_index(idx_f, index_entries, data_sum)
def pack_header_chunks(num_objects):
    """Yield chunks for a pack header."""
    # Magic signature, then two big-endian 32-bit fields:
    # pack format version (2) and the number of objects.
    for chunk in (b"PACK", struct.pack(b">L", 2), struct.pack(b">L", num_objects)):
        yield chunk
def write_pack_header(write, num_objects) -> None:
    """Write a pack header for the given number of objects.

    Args:
      write: Write callable (a file object is accepted but deprecated).
      num_objects: Number of objects in the pack.
    """
    if hasattr(write, "write"):
        warnings.warn(
            "write_pack_header() now takes a write rather than file argument",
            DeprecationWarning,
            stacklevel=2,
        )
        write = write.write
    for piece in pack_header_chunks(num_objects):
        write(piece)
def find_reusable_deltas(
    container: PackedObjectContainer,
    object_ids: set[bytes],
    *,
    other_haves: Optional[set[bytes]] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Yield existing ref-deltas whose base the receiver will also have.

    Args:
      container: Container to scan for already-deltified objects.
      object_ids: Hex SHAs of the objects being packed.
      other_haves: Additional hex SHAs the receiver already has.
      progress: Optional progress callback taking a bytes message.
    Returns: Iterator over reusable UnpackedObject deltas.
    """
    if other_haves is None:
        other_haves = set()
    reused = 0
    for i, unpacked in enumerate(
        container.iter_unpacked_subset(
            object_ids, allow_missing=True, convert_ofs_delta=True
        )
    ):
        if progress is not None and i % 1000 == 0:
            progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
        if unpacked.pack_type_num == REF_DELTA:
            # A delta is only reusable if its base will be available on the
            # receiving side: either part of this pack or already present.
            hexsha = sha_to_hex(unpacked.delta_base)  # type: ignore
            if hexsha in object_ids or hexsha in other_haves:
                yield unpacked
                reused += 1
    if progress is not None:
        progress((f"found {reused} deltas to reuse\n").encode())
def deltify_pack_objects(
    objects: Union[Iterator[bytes], Iterator[tuple[ShaFile, Optional[bytes]]]],
    *,
    window_size: Optional[int] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Generate deltas for pack objects.

    Args:
      objects: An iterable of ShaFile objects or (object, path) tuples.
      window_size: Window size; None for default
    Returns: Iterator over UnpackedObject (delta_base is None for full
      text entries)
    """

    def _with_hints():
        # Normalize both accepted shapes to (object, (type_num, path)).
        for entry in objects:
            if isinstance(entry, ShaFile):
                yield (entry, (entry.type_num, None))
            else:
                obj = entry[0]
                yield (obj, (obj.type_num, entry[1]))

    yield from deltas_from_sorted_objects(
        sort_objects_for_delta(_with_hints()),
        window_size=window_size,
        progress=progress,
    )
def sort_objects_for_delta(
    objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
) -> Iterator[ShaFile]:
    """Sort objects to bring likely delta candidates close together.

    Args:
      objects: Iterable of ShaFile objects or (object, hint) tuples, where
        hint is an optional (type_num, path) pair.
    Returns: Iterator over the objects, sorted by (type, path, -size).
    """
    magic = []
    for entry in objects:
        if isinstance(entry, tuple):
            obj, hint = entry
            if hint is None:
                type_num = None
                path = None
            else:
                (type_num, path) = hint
        else:
            obj = entry
            # Bug fix: previously type_num/path were left unassigned here,
            # raising NameError for a leading bare ShaFile or silently
            # reusing the previous entry's hint. Treat it like hint=None.
            type_num = None
            path = None
        magic.append((type_num, path, -obj.raw_length(), obj))
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    magic.sort()
    return (x[3] for x in magic)
def deltas_from_sorted_objects(
    objects, window_size: Optional[int] = None, progress=None
):
    """Deltify a stream of objects that is already delta-sorted.

    Args:
      objects: Iterable of objects, ordered so that good delta bases are
        near each other (see sort_objects_for_delta).
      window_size: How many recent objects to try as delta bases; None for
        the default.
      progress: Optional progress callback taking a bytes message.
    Returns: Iterator over UnpackedObject, deltified where it saves space.
    """
    # TODO(jelmer): Use threads
    if window_size is None:
        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE

    # Sliding window of candidate bases, most recent first:
    # (sha digest, type_num, raw chunks).
    possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
    for i, o in enumerate(objects):
        if progress is not None and i % 1000 == 0:
            progress((f"generating deltas: {i}\r").encode())
        raw = o.as_raw_chunks()
        # The full text is the baseline: a delta is only kept if smaller.
        winner = raw
        winner_len = sum(map(len, winner))
        winner_base = None
        for base_id, base_type_num, base in possible_bases:
            # Deltas are only made between objects of the same type.
            if base_type_num != o.type_num:
                continue
            delta_len = 0
            delta = []
            for chunk in create_delta(base, raw):
                delta_len += len(chunk)
                if delta_len >= winner_len:
                    # Abort: already no smaller than the current winner.
                    break
                delta.append(chunk)
            else:
                # for/else: only reached when the delta was fully built,
                # i.e. it is strictly smaller than the previous winner.
                winner_base = base_id
                winner = delta
                winner_len = sum(map(len, winner))
        yield UnpackedObject(
            o.type_num,
            sha=o.sha().digest(),
            delta_base=winner_base,
            decomp_len=winner_len,
            decomp_chunks=winner,
        )
        # This object becomes a candidate base; trim the window.
        possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
        while len(possible_bases) > window_size:
            possible_bases.pop()
def pack_objects_to_data(
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    deltify: Optional[bool] = None,
    delta_window_size: Optional[int] = None,
    ofs_delta: bool = True,
    progress=None,
) -> tuple[int, Iterator[UnpackedObject]]:
    """Create pack data from objects.

    Args:
      objects: Pack objects (or (object, path) tuples)
      deltify: Whether to deltify; None selects the default (disabled)
      delta_window_size: Delta window size; None for default
    Returns: Tuple of object count and iterator over UnpackedObject
    """
    count = len(objects)
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if not deltify:

        def _plain_objects():
            for entry in objects:
                obj = entry[0] if isinstance(entry, tuple) else entry
                yield full_unpacked_object(obj)

        return (count, _plain_objects())
    return (
        count,
        deltify_pack_objects(
            iter(objects),  # type: ignore
            window_size=delta_window_size,
            progress=progress,
        ),
    )
def generate_unpacked_objects(
    container: PackedObjectContainer,
    object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    reuse_deltas: bool = True,
    ofs_delta: bool = True,
    other_haves: Optional[set[bytes]] = None,
    progress=None,
) -> Iterator[UnpackedObject]:
    """Create pack data from objects.

    Args:
      container: Container to read objects from.
      object_ids: Sequence of (object id, pack hint) tuples to include.
      delta_window_size: Delta window size; None for the default.
      deltify: Whether to deltify objects not covered by reused deltas;
        None selects the default (currently disabled for speed).
      reuse_deltas: Whether to reuse deltas already present in container.
      other_haves: Extra SHAs the receiver has, usable as delta bases.
      progress: Optional progress callback taking a bytes message.
    Returns: Iterator over UnpackedObject.
    """
    # Remaining ids to emit; entries are removed as reused deltas go out.
    todo = dict(object_ids)
    if reuse_deltas:
        for unpack in find_reusable_deltas(
            container, set(todo), other_haves=other_haves, progress=progress
        ):
            del todo[sha_to_hex(unpack.sha())]
            yield unpack
    if deltify is None:
        # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
        # slow at the moment.
        deltify = False
    if deltify:
        objects_to_delta = container.iterobjects_subset(
            todo.keys(), allow_missing=False
        )
        yield from deltas_from_sorted_objects(
            sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
            window_size=delta_window_size,
            progress=progress,
        )
    else:
        # Emit the remainder as full (non-delta) objects.
        for oid in todo:
            yield full_unpacked_object(container[oid])
def full_unpacked_object(o: ShaFile) -> UnpackedObject:
    """Wrap a ShaFile as a non-delta UnpackedObject."""
    chunks = o.as_raw_chunks()
    return UnpackedObject(
        o.type_num,
        delta_base=None,
        crc32=None,
        decomp_chunks=chunks,
        sha=o.sha().digest(),
    )
def write_pack_from_container(
    write,
    container: PackedObjectContainer,
    object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    reuse_deltas: bool = True,
    compression_level: int = -1,
    other_haves: Optional[set[bytes]] = None,
):
    """Write a new pack data file.

    Args:
      write: write function to use
      container: PackedObjectContainer
      object_ids: Sequence of (object id, pack hint) tuples to include
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      reuse_deltas: Whether to reuse existing deltas
      compression_level: the zlib compression level to use
      other_haves: Extra SHAs the receiver already has
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    record_count = len(object_ids)
    records = generate_unpacked_objects(
        container,
        object_ids,
        delta_window_size=delta_window_size,
        deltify=deltify,
        reuse_deltas=reuse_deltas,
        other_haves=other_haves,
    )
    return write_pack_data(
        write,
        records,
        num_records=record_count,
        compression_level=compression_level,
    )
def write_pack_objects(
    write,
    objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
    *,
    delta_window_size: Optional[int] = None,
    deltify: Optional[bool] = None,
    compression_level: int = -1,
):
    """Write a new pack data file.

    Args:
      write: write function to use
      objects: Sequence of (object, path) tuples to write
      delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
      deltify: Whether to deltify objects
      compression_level: the zlib compression level to use
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Bug fix: delta_window_size was previously accepted but never passed
    # on, so callers (e.g. write_pack) silently got the default window.
    pack_contents_count, pack_contents = pack_objects_to_data(
        objects, deltify=deltify, delta_window_size=delta_window_size
    )

    return write_pack_data(
        write,
        pack_contents,
        num_records=pack_contents_count,
        compression_level=compression_level,
    )
class PackChunkGenerator:
    """Generate the byte chunks of a pack data file.

    Iterating yields the pack header, each object's encoded data and
    finally the running SHA-1 digest as the pack trailer.  As objects are
    emitted, ``entries`` accumulates sha -> (offset, crc32).
    """

    def __init__(
        self,
        num_records=None,
        records=None,
        progress=None,
        compression_level=-1,
        reuse_compressed=True,
    ) -> None:
        # Running SHA-1 over every chunk emitted so far (pack trailer).
        self.cs = sha1(b"")
        # sha -> (offset in pack, crc32 of the object's encoded bytes).
        self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
        self._it = self._pack_data_chunks(
            num_records=num_records,
            records=records,
            progress=progress,
            compression_level=compression_level,
            reuse_compressed=reuse_compressed,
        )

    def sha1digest(self):
        """Return the SHA-1 digest of everything yielded so far."""
        return self.cs.digest()

    def __iter__(self):
        return self._it

    def _pack_data_chunks(
        self,
        records: Iterator[UnpackedObject],
        *,
        num_records=None,
        progress=None,
        compression_level: int = -1,
        reuse_compressed: bool = True,
    ) -> Iterator[bytes]:
        """Iterate pack data file chunks.

        Args:
          records: Iterator over UnpackedObject
          num_records: Number of records (defaults to len(records) if not specified)
          progress: Function to report progress to
          compression_level: the zlib compression level
          reuse_compressed: Whether to emit already-compressed chunks as-is
        Returns: Iterator over bytes chunks; the final chunk is the pack's
          SHA-1 trailer.
        """
        # Write the pack
        if num_records is None:
            num_records = len(records)  # type: ignore
        # offset tracks the number of bytes emitted so far; it becomes each
        # object's position in the pack (needed for OFS_DELTA encoding).
        offset = 0
        for chunk in pack_header_chunks(num_records):
            yield chunk
            self.cs.update(chunk)
            offset += len(chunk)
        actual_num_records = 0
        for i, unpacked in enumerate(records):
            type_num = unpacked.pack_type_num
            if progress is not None and i % 1000 == 0:
                progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
            raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
            if unpacked.delta_base is not None:
                try:
                    base_offset, base_crc32 = self.entries[unpacked.delta_base]
                except KeyError:
                    # Base not written yet (or external): fall back to a
                    # ref delta addressed by SHA.
                    type_num = REF_DELTA
                    assert isinstance(unpacked.delta_base, bytes)
                    raw = (unpacked.delta_base, unpacked.decomp_chunks)
                else:
                    # Base already in this pack: encode as offset delta.
                    type_num = OFS_DELTA
                    raw = (offset - base_offset, unpacked.decomp_chunks)
            else:
                raw = unpacked.decomp_chunks
            if unpacked.comp_chunks is not None and reuse_compressed:
                # Reuse the existing compressed representation verbatim.
                chunks = unpacked.comp_chunks
            else:
                chunks = pack_object_chunks(
                    type_num, raw, compression_level=compression_level
                )
            crc32 = 0
            object_size = 0
            for chunk in chunks:
                yield chunk
                crc32 = binascii.crc32(chunk, crc32)
                self.cs.update(chunk)
                object_size += len(chunk)
            actual_num_records += 1
            self.entries[unpacked.sha()] = (offset, crc32)
            offset += object_size
        if actual_num_records != num_records:
            raise AssertionError(
                f"actual records written differs: {actual_num_records} != {num_records}"
            )

        yield self.cs.digest()
def write_pack_data(
    write,
    records: Iterator[UnpackedObject],
    *,
    num_records=None,
    progress=None,
    compression_level=-1,
):
    """Write a new pack data file.

    Args:
      write: Write function to use
      records: Iterator over UnpackedObject
      num_records: Number of records (defaults to len(records) if None)
      progress: Function to report progress to
      compression_level: the zlib compression level
    Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    generator = PackChunkGenerator(
        num_records=num_records,
        records=records,
        progress=progress,
        compression_level=compression_level,
    )
    for piece in generator:
        write(piece)
    return generator.entries, generator.sha1digest()
def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    Args:
      f: A file-like object to write to
      entries: Iterable of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the written index file
    """
    # Materialize first: the entries are walked twice below, so a generator
    # argument would previously have produced a silently corrupt index
    # (empty second pass). write_pack_index_v3 already guards this way.
    entries = list(entries)
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for name, _offset, _entry_checksum in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]
    for name, offset, _entry_checksum in entries:
        if not (offset <= 0xFFFFFFFF):
            raise TypeError("pack format 1 only supports offsets < 2Gb")
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
2570def _delta_encode_size(size) -> bytes:
2571 ret = bytearray()
2572 c = size & 0x7F
2573 size >>= 7
2574 while size:
2575 ret.append(c | 0x80)
2576 c = size & 0x7F
2577 size >>= 7
2578 ret.append(c)
2579 return bytes(ret)
# The length of delta compression copy operations in version 2 packs is limited
# to 64K. To copy more, we use several copy operations. Version 3 packs allow
# 24-bit lengths in copy operations, but we always make version 2 packs.
_MAX_COPY_LEN = 0xFFFF  # largest length one v2 copy opcode can express
2588def _encode_copy_operation(start, length):
2589 scratch = bytearray([0x80])
2590 for i in range(4):
2591 if start & 0xFF << i * 8:
2592 scratch.append((start >> i * 8) & 0xFF)
2593 scratch[0] |= 1 << i
2594 for i in range(2):
2595 if length & 0xFF << i * 8:
2596 scratch.append((length >> i * 8) & 0xFF)
2597 scratch[0] |= 1 << (4 + i)
2598 return bytes(scratch)
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    Args:
      base_buf: Base buffer
      target_buf: Target buffer
    Returns: Iterator over delta chunks: the two size headers followed by
      copy/insert opcodes.
    """
    if isinstance(base_buf, list):
        base_buf = b"".join(base_buf)
    if isinstance(target_buf, list):
        target_buf = b"".join(target_buf)
    assert isinstance(base_buf, bytes)
    assert isinstance(target_buf, bytes)
    # write delta header
    yield _delta_encode_size(len(base_buf))
    yield _delta_encode_size(len(target_buf))
    # write out delta opcodes
    seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        # if opcode == 'replace' or opcode == 'delete':
        #     pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            copy_start = i1
            copy_len = i2 - i1
            while copy_len > 0:
                # v2 copy opcodes are capped at _MAX_COPY_LEN bytes, so
                # longer ranges are split into several operations.
                to_copy = min(copy_len, _MAX_COPY_LEN)
                yield _encode_copy_operation(copy_start, to_copy)
                copy_start += to_copy
                copy_len -= to_copy
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                # Insert opcodes can carry at most 127 literal bytes each.
                yield bytes([127])
                yield memoryview(target_buf)[o : o + 127]
                s -= 127
                o += 127
            yield bytes([s])
            yield memoryview(target_buf)[o : o + s]
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    Args:
      src_buf: Source buffer
      delta: Delta instructions
    Returns: List of byte chunks making up the target buffer.
    Raises:
      ApplyDeltaError: if the delta is malformed or does not match src_buf.
    """
    if not isinstance(src_buf, bytes):
        src_buf = b"".join(src_buf)
    if not isinstance(delta, bytes):
        delta = b"".join(delta)
    out = []
    index = 0
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        # Decode a little-endian base-128 size: 7 bits per byte, high bit
        # set on every byte except the last.
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index : index + 1])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index

    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    if src_size != len(src_buf):
        raise ApplyDeltaError(
            f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
        )
    while index < delta_length:
        cmd = ord(delta[index : index + 1])
        index += 1
        if cmd & 0x80:
            # Copy opcode: bits 0-3 flag offset bytes, bits 4-6 size bytes.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index : index + 1])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            # Version 3 packs can contain copy sizes larger than 64K.
            for i in range(3):
                if cmd & (1 << (4 + i)):
                    x = ord(delta[index : index + 1])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                # A zero size encodes the default copy length of 64K.
                cp_size = 0x10000
            if (
                cp_off + cp_size < cp_size
                or cp_off + cp_size > src_size
                or cp_size > dest_size
            ):
                # Out-of-range copy: stop processing, mirroring the
                # behaviour of git's patch-delta.c.
                break
            out.append(src_buf[cp_off : cp_off + cp_size])
        elif cmd != 0:
            # Insert opcode: the next `cmd` bytes are literal target data.
            out.append(delta[index : index + cmd])
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")

    if dest_size != chunks_length(out):
        raise ApplyDeltaError("dest size incorrect")

    return out
def write_pack_index_v2(
    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
) -> bytes:
    """Write a new pack index file.

    Args:
      f: File-like object to write to
      entries: Iterable of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
      pack_checksum: Checksum of the pack file.
    Returns: The SHA of the index file written
    """
    # Materialize first: the entries are walked four times below, so a
    # generator argument (permitted by the Iterable annotation) would
    # previously have produced a silently corrupt index. This matches the
    # guard already present in write_pack_index_v3.
    entries = list(entries)
    f = SHA1Writer(f)
    f.write(b"\377tOc")  # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
    for name, offset, entry_checksum in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    largetable: list[int] = []
    for i in range(0x100):
        f.write(struct.pack(b">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]
    for name, offset, entry_checksum in entries:
        f.write(name)
    for name, offset, entry_checksum in entries:
        f.write(struct.pack(b">L", entry_checksum))
    for name, offset, entry_checksum in entries:
        if offset < 2**31:
            f.write(struct.pack(b">L", offset))
        else:
            # Large offsets are spilled into the 64-bit table at the end.
            f.write(struct.pack(b">L", 2**31 + len(largetable)))
            largetable.append(offset)
    for offset in largetable:
        f.write(struct.pack(b">Q", offset))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
def write_pack_index_v3(
    f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
) -> bytes:
    """Write a new pack index file in v3 format.

    Args:
      f: File-like object to write to
      entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
      pack_checksum: Checksum of the pack file.
      hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
    Returns: The SHA of the index file written
    Raises:
      ValueError: on an unknown hash algorithm or wrong-length object name.
      NotImplementedError: for SHA-256, which is not supported yet.
    """
    if hash_algorithm == 1:
        hash_size = 20  # SHA-1
        writer_cls = SHA1Writer
    elif hash_algorithm == 2:
        hash_size = 32  # SHA-256
        # TODO: Add SHA256Writer when SHA-256 support is implemented
        raise NotImplementedError("SHA-256 support not yet implemented")
    else:
        raise ValueError(f"Unknown hash algorithm {hash_algorithm}")

    # Convert entries to list to allow multiple iterations
    entries_list = list(entries)

    # Calculate shortest unambiguous prefix length for object names
    # For now, use full hash size (this could be optimized)
    shortened_oid_len = hash_size

    f = writer_cls(f)
    f.write(b"\377tOc")  # Magic!
    f.write(struct.pack(">L", 3))  # Version 3
    f.write(struct.pack(">L", hash_algorithm))  # Hash algorithm
    f.write(struct.pack(">L", shortened_oid_len))  # Shortened OID length

    fan_out_table: dict[int, int] = defaultdict(lambda: 0)
    for name, offset, entry_checksum in entries_list:
        if len(name) != hash_size:
            raise ValueError(
                f"Object name has wrong length: expected {hash_size}, got {len(name)}"
            )
        fan_out_table[ord(name[:1])] += 1

    # Fan-out table
    largetable: list[int] = []
    for i in range(0x100):
        f.write(struct.pack(b">L", fan_out_table[i]))
        fan_out_table[i + 1] += fan_out_table[i]

    # Object names table
    for name, offset, entry_checksum in entries_list:
        f.write(name)

    # CRC32 checksums table
    for name, offset, entry_checksum in entries_list:
        f.write(struct.pack(b">L", entry_checksum))

    # Offset table
    for name, offset, entry_checksum in entries_list:
        if offset < 2**31:
            f.write(struct.pack(b">L", offset))
        else:
            # Offsets >= 2^31 are redirected into the 64-bit table below.
            f.write(struct.pack(b">L", 2**31 + len(largetable)))
            largetable.append(offset)

    # Large offset table
    for offset in largetable:
        f.write(struct.pack(b">Q", offset))

    assert len(pack_checksum) == hash_size, (
        f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
    )
    f.write(pack_checksum)
    return f.write_sha()
def write_pack_index(
    index_filename, entries, pack_checksum, progress=None, version=None
):
    """Write a pack index file.

    Args:
      index_filename: Index filename.
      entries: List of (checksum, offset, crc32) tuples
      pack_checksum: Checksum of the pack file.
      progress: Progress function (not currently used)
      version: Pack index version to use (1, 2, or 3). If None, defaults to
        DEFAULT_PACK_INDEX_VERSION.

    Returns:
      SHA of the written index file
    """
    if version is None:
        version = DEFAULT_PACK_INDEX_VERSION
    writers = {
        1: write_pack_index_v1,
        2: write_pack_index_v2,
        3: write_pack_index_v3,
    }
    writer = writers.get(version)
    if writer is None:
        raise ValueError(f"Unsupported pack index version: {version}")
    return writer(index_filename, entries, pack_checksum)
class Pack:
    """A Git pack object."""

    # Callables that lazily load the data/index halves; None when the
    # corresponding object was supplied eagerly (see from_objects).
    _data_load: Optional[Callable[[], PackData]]
    _idx_load: Optional[Callable[[], PackIndex]]

    # Cached PackData / PackIndex instances, populated on first access.
    _data: Optional[PackData]
    _idx: Optional[PackIndex]
    def __init__(
        self,
        basename,
        resolve_ext_ref: Optional[ResolveExtRefFn] = None,
        *,
        delta_window_size=None,
        window_memory=None,
        delta_cache_size=None,
        depth=None,
        threads=None,
        big_file_threshold=None,
    ) -> None:
        """Initialize a Pack; data and index files are loaded lazily.

        Args:
          basename: Path prefix; data lives at basename + ".pack" and the
            index at basename + ".idx".
          resolve_ext_ref: Optional callable for resolving delta bases not
            present in this pack.
          delta_window_size: Passed through to PackData.
          window_memory: Passed through to PackData.
          delta_cache_size: Passed through to PackData.
          depth: Passed through to PackData.
          threads: Passed through to PackData.
          big_file_threshold: Passed through to PackData.
        """
        self._basename = basename
        self._data = None
        self._idx = None
        self._idx_path = self._basename + ".idx"
        self._data_path = self._basename + ".pack"
        self.delta_window_size = delta_window_size
        self.window_memory = window_memory
        self.delta_cache_size = delta_cache_size
        self.depth = depth
        self.threads = threads
        self.big_file_threshold = big_file_threshold
        # Invoked on first access of .data / .index respectively.
        self._data_load = lambda: PackData(
            self._data_path,
            delta_window_size=delta_window_size,
            window_memory=window_memory,
            delta_cache_size=delta_cache_size,
            depth=depth,
            threads=threads,
            big_file_threshold=big_file_threshold,
        )
        self._idx_load = lambda: load_pack_index(self._idx_path)
        self.resolve_ext_ref = resolve_ext_ref
2910 @classmethod
2911 def from_lazy_objects(cls, data_fn, idx_fn):
2912 """Create a new pack object from callables to load pack data and
2913 index objects.
2914 """
2915 ret = cls("")
2916 ret._data_load = data_fn
2917 ret._idx_load = idx_fn
2918 return ret
2920 @classmethod
2921 def from_objects(cls, data, idx):
2922 """Create a new pack object from pack data and index objects."""
2923 ret = cls("")
2924 ret._data = data
2925 ret._data_load = None
2926 ret._idx = idx
2927 ret._idx_load = None
2928 ret.check_length_and_checksum()
2929 return ret
    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()
    @property
    def data(self) -> PackData:
        """The pack data object being used."""
        if self._data is None:
            assert self._data_load
            self._data = self._data_load()
            # With both halves available, cross-check them immediately.
            self.check_length_and_checksum()
        return self._data
    @property
    def index(self) -> PackIndex:
        """The index being used.

        Note: This may be an in-memory index
        """
        if self._idx is None:
            assert self._idx_load
            self._idx = self._idx_load()
        return self._idx
2955 def close(self) -> None:
2956 if self._data is not None:
2957 self._data.close()
2958 if self._idx is not None:
2959 self._idx.close()
    def __enter__(self):
        """Enter the context manager; returns self."""
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the pack on context exit; exceptions propagate."""
        self.close()
    def __eq__(self, other):
        """Packs are equal when their indexes are equal.

        NOTE(review): compares ``isinstance(self, type(other))``, so a
        subclass instance can equal a base-class instance but not vice
        versa — presumably intentional; confirm before relying on symmetry.
        """
        return isinstance(self, type(other)) and self.index == other.index
    def __len__(self) -> int:
        """Number of entries in this pack."""
        return len(self.index)
    def __repr__(self) -> str:
        """Debug representation showing the pack's basename."""
        return f"{self.__class__.__name__}({self._basename!r})"
    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)
    def check_length_and_checksum(self) -> None:
        """Sanity check the length and checksum of the pack index and data.

        Raises:
          AssertionError: if index and data disagree on the object count.
          ChecksumMismatch: if the pack checksum stored in the index does
            not match the checksum stored in the data file.
        """
        assert len(self.index) == len(self.data), (
            f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
        )
        idx_stored_checksum = self.index.get_pack_checksum()
        data_stored_checksum = self.data.get_stored_checksum()
        # Some index formats may not carry a pack checksum; only compare
        # when one is present.
        if (
            idx_stored_checksum is not None
            and idx_stored_checksum != data_stored_checksum
        ):
            raise ChecksumMismatch(
                sha_to_hex(idx_stored_checksum),
                sha_to_hex(data_stored_checksum),
            )
    def check(self) -> None:
        """Check the integrity of this pack.

        Checks the index, the data, and then every object in the pack.

        Raises:
          ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks
    def get_stored_checksum(self) -> bytes:
        """Return the checksum stored in the pack data file."""
        return self.data.get_stored_checksum()
    def pack_tuples(self):
        """Return a list of (object, None) tuples for all objects in the pack."""
        return [(o, None) for o in self.iterobjects()]
3015 def __contains__(self, sha1: bytes) -> bool:
3016 """Check whether this pack contains a particular SHA1."""
3017 try:
3018 self.index.object_offset(sha1)
3019 return True
3020 except KeyError:
3021 return False
3023 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
3024 offset = self.index.object_offset(sha1)
3025 obj_type, obj = self.data.get_object_at(offset)
3026 type_num, chunks = self.resolve_object(offset, obj_type, obj)
3027 return type_num, b"".join(chunks)
3029 def __getitem__(self, sha1: bytes) -> ShaFile:
3030 """Retrieve the specified SHA1."""
3031 type, uncomp = self.get_raw(sha1)
3032 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
    def iterobjects(self) -> Iterator[ShaFile]:
        """Iterate over the objects in this pack."""
        return iter(
            PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
        )
3040 def iterobjects_subset(
3041 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
3042 ) -> Iterator[ShaFile]:
3043 return (
3044 uo
3045 for uo in PackInflater.for_pack_subset(
3046 self,
3047 shas,
3048 allow_missing=allow_missing,
3049 resolve_ext_ref=self.resolve_ext_ref,
3050 )
3051 if uo.id in shas
3052 )
    def iter_unpacked_subset(
        self,
        shas: Iterable[ObjectID],
        *,
        include_comp: bool = False,
        allow_missing: bool = False,
        convert_ofs_delta: bool = False,
    ) -> Iterator[UnpackedObject]:
        """Yield UnpackedObjects for the requested SHAs.

        Offset deltas are rewritten as ref deltas; an OFS_DELTA whose base
        has not been scanned yet is parked until its base's SHA is known.

        Args:
          shas: Hex SHAs to yield objects for.
          include_comp: Whether to include compressed data.
          allow_missing: If False, raise UnresolvedDeltas for SHAs that
            were not found in the pack.
          convert_ofs_delta: Accepted for interface compatibility; offset
            deltas are converted to ref deltas here regardless.
        """
        # base offset -> delta children waiting for that base's SHA.
        ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
        # pack offset -> binary SHA of the object at that offset.
        ofs: dict[bytes, int] = {}
        todo = set(shas)
        for unpacked in self.iter_unpacked(include_comp=include_comp):
            sha = unpacked.sha()
            ofs[unpacked.offset] = sha
            hexsha = sha_to_hex(sha)
            if hexsha in todo:
                if unpacked.pack_type_num == OFS_DELTA:
                    assert isinstance(unpacked.delta_base, int)
                    base_offset = unpacked.offset - unpacked.delta_base
                    try:
                        unpacked.delta_base = ofs[base_offset]
                    except KeyError:
                        # Base not seen yet; defer until it comes up.
                        ofs_pending[base_offset].append(unpacked)
                        continue
                    else:
                        unpacked.pack_type_num = REF_DELTA
                yield unpacked
                todo.remove(hexsha)
            # Flush any children that were waiting on this object's SHA.
            for child in ofs_pending.pop(unpacked.offset, []):
                child.pack_type_num = REF_DELTA
                child.delta_base = sha
                yield child
        assert not ofs_pending
        if not allow_missing and todo:
            raise UnresolvedDeltas(list(todo))
3090 def iter_unpacked(self, include_comp=False):
3091 ofs_to_entries = {
3092 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
3093 }
3094 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
3095 (sha, crc32) = ofs_to_entries[unpacked.offset]
3096 unpacked._sha = sha
3097 unpacked.crc32 = crc32
3098 yield unpacked
3100 def keep(self, msg: Optional[bytes] = None) -> str:
3101 """Add a .keep file for the pack, preventing git from garbage collecting it.
3103 Args:
3104 msg: A message written inside the .keep file; can be used later
3105 to determine whether or not a .keep file is obsolete.
3106 Returns: The path of the .keep file, as a string.
3107 """
3108 keepfile_name = f"{self._basename}.keep"
3109 with GitFile(keepfile_name, "wb") as keepfile:
3110 if msg:
3111 keepfile.write(msg)
3112 keepfile.write(b"\n")
3113 return keepfile_name
3115 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
3116 """Get the object for a ref SHA, only looking in this pack."""
3117 # TODO: cache these results
3118 try:
3119 offset = self.index.object_offset(sha)
3120 except KeyError:
3121 offset = None
3122 if offset:
3123 type, obj = self.data.get_object_at(offset)
3124 elif self.resolve_ext_ref:
3125 type, obj = self.resolve_ext_ref(sha)
3126 else:
3127 raise KeyError(sha)
3128 return offset, type, obj
    def resolve_object(
        self, offset: int, type: int, obj, get_ref=None
    ) -> tuple[int, Iterable[bytes]]:
        """Resolve an object, possibly resolving deltas when necessary.

        Args:
          offset: Offset of the object in this pack.
          type: Pack type number of the object at ``offset``.
          obj: Raw object data: for deltas, a (base, delta) pair; otherwise
            the object's chunks.
          get_ref: Optional callable to resolve REF_DELTA bases; defaults to
            ``self.get_ref``.
        Returns: Tuple with object type and contents.
        Raises:
          UnresolvedDeltas: If a REF_DELTA base resolves back to the same
            offset (the object would be based on itself).
        """
        # Walk down the delta chain, building a stack of deltas to reach
        # the requested object.
        base_offset = offset
        base_type = type
        base_obj = obj
        delta_stack = []
        while base_type in DELTA_TYPES:
            prev_offset = base_offset
            if get_ref is None:
                get_ref = self.get_ref
            if base_type == OFS_DELTA:
                # OFS_DELTA stores a backward distance to its base; the base
                # is looked up directly in this pack's data.
                (delta_offset, delta) = base_obj
                # TODO: clean up asserts and replace with nicer error messages
                base_offset = base_offset - delta_offset
                base_type, base_obj = self.data.get_object_at(base_offset)
                assert isinstance(base_type, int)
            elif base_type == REF_DELTA:
                # REF_DELTA stores the 20-byte SHA of its base, which may
                # live outside this pack (resolved via get_ref).
                (basename, delta) = base_obj
                assert isinstance(basename, bytes) and len(basename) == 20
                base_offset, base_type, base_obj = get_ref(basename)
                assert isinstance(base_type, int)
                if base_offset == prev_offset:  # object is based on itself
                    raise UnresolvedDeltas([basename])
            delta_stack.append((prev_offset, base_type, delta))

        # Now grab the base object (mustn't be a delta) and apply the
        # deltas all the way up the stack.
        chunks = base_obj
        for prev_offset, _delta_type, delta in reversed(delta_stack):
            chunks = apply_delta(chunks, delta)
            # Cache each intermediate result by its pack offset so repeated
            # resolutions of chain members are cheap.
            if prev_offset is not None:
                self.data._offset_cache[prev_offset] = base_type, chunks
        return base_type, chunks
3171 def entries(
3172 self, progress: Optional[ProgressFn] = None
3173 ) -> Iterator[PackIndexEntry]:
3174 """Yield entries summarizing the contents of this pack.
3176 Args:
3177 progress: Progress function, called with current and total
3178 object count.
3179 Returns: iterator of tuples with (sha, offset, crc32)
3180 """
3181 return self.data.iterentries(
3182 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3183 )
3185 def sorted_entries(
3186 self, progress: Optional[ProgressFn] = None
3187 ) -> Iterator[PackIndexEntry]:
3188 """Return entries in this pack, sorted by SHA.
3190 Args:
3191 progress: Progress function, called with current and total
3192 object count
3193 Returns: Iterator of tuples with (sha, offset, crc32)
3194 """
3195 return self.data.sorted_entries(
3196 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3197 )
3199 def get_unpacked_object(
3200 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3201 ) -> UnpackedObject:
3202 """Get the unpacked object for a sha.
3204 Args:
3205 sha: SHA of object to fetch
3206 include_comp: Whether to include compression data in UnpackedObject
3207 """
3208 offset = self.index.object_offset(sha)
3209 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3210 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3211 assert isinstance(unpacked.delta_base, int)
3212 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3213 unpacked.pack_type_num = REF_DELTA
3214 return unpacked
def extend_pack(
    f: BinaryIO,
    object_ids: set[ObjectID],
    get_raw,
    *,
    compression_level=-1,
    progress=None,
) -> tuple[bytes, list]:
    """Extend a pack file with more objects.

    The caller should make sure that object_ids does not contain any objects
    that are already in the pack

    Args:
      f: Open, seekable binary file object for the pack (positioned
        anywhere; this function seeks as needed).
      get_raw: Callable mapping a 20-byte binary SHA to a
        (type_num, data) tuple.
      compression_level: zlib compression level for the appended objects
        (-1 for the zlib default).
      progress: Optional callable receiving progress messages as bytes.
    Returns: Tuple of (new pack checksum, list of (object_id, offset, crc32)
      entries for the appended objects).
    """
    # Update the header with the new number of objects.
    f.seek(0)
    _version, num_objects = read_pack_header(f.read)

    if object_ids:
        f.seek(0)
        write_pack_header(f.write, num_objects + len(object_ids))

        # Must flush before reading (http://bugs.python.org/issue3207)
        f.flush()

    # Rescan the rest of the pack, computing the SHA with the new header.
    # end_ofs=-20 excludes the old trailing pack checksum, which will be
    # replaced below.
    new_sha = compute_file_sha(f, end_ofs=-20)

    # Must reposition before writing (http://bugs.python.org/issue3207)
    f.seek(0, os.SEEK_CUR)

    extra_entries = []

    # Complete the pack: append each requested object, feeding its bytes
    # into the running checksum and recording its offset and CRC32 for the
    # caller to index.
    for i, object_id in enumerate(object_ids):
        if progress is not None:
            progress(
                (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
            )
        assert len(object_id) == 20
        type_num, data = get_raw(object_id)
        offset = f.tell()
        crc32 = write_pack_object(
            f.write,
            type_num,
            data,
            sha=new_sha,
            compression_level=compression_level,
        )
        extra_entries.append((object_id, offset, crc32))
    # Write the recomputed checksum as the new pack trailer.
    pack_sha = new_sha.digest()
    f.write(pack_sha)
    return pack_sha, extra_entries
# Prefer the optional compiled extension module's implementations of these
# hot functions, replacing the pure-Python versions defined earlier in this
# file; silently fall back when the extension is not built/installed.
try:
    from dulwich._pack import (  # type: ignore
        apply_delta,  # type: ignore
        bisect_find_sha,  # type: ignore
    )
except ImportError:
    pass