Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/pack.py: 27%
1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
23"""Classes for dealing with packed git objects.
25A pack is a compact representation of a bunch of objects, stored
26using deltas where possible.
28A pack has two parts: the pack file, which stores the data, and an index
29that tells you where the data is.
31To find an object you look in each of the index files until you find a
32match for the object name. You then use the offset obtained from the index
33as a pointer into the corresponding pack file.
34"""
36import binascii
37from collections import defaultdict, deque
38from contextlib import suppress
39from io import BytesIO, UnsupportedOperation
41try:
42 from cdifflib import CSequenceMatcher as SequenceMatcher
43except ModuleNotFoundError:
44 from difflib import SequenceMatcher
46import os
47import struct
48import sys
49import warnings
50import zlib
51from collections.abc import Iterable, Iterator, Sequence
52from hashlib import sha1
53from itertools import chain
54from os import SEEK_CUR, SEEK_END
55from struct import unpack_from
56from typing import (
57 BinaryIO,
58 Callable,
59 Generic,
60 Optional,
61 Protocol,
62 TypeVar,
63 Union,
64)
66try:
67 import mmap
68except ImportError:
69 has_mmap = False
70else:
71 has_mmap = True
73# For some reason the above try/except fails to set has_mmap = False on Plan 9
74if sys.platform == "Plan9":
75 has_mmap = False
77from . import replace_me
78from .errors import ApplyDeltaError, ChecksumMismatch
79from .file import GitFile
80from .lru_cache import LRUSizeCache
81from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
83OFS_DELTA = 6
84REF_DELTA = 7
86DELTA_TYPES = (OFS_DELTA, REF_DELTA)
89DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
91# Keep pack files under 16Mb in memory, otherwise write them out to disk
92PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
94# Default pack index version to use when none is specified
95DEFAULT_PACK_INDEX_VERSION = 2
98OldUnpackedObject = Union[tuple[Union[bytes, int], list[bytes]], list[bytes]]
99ResolveExtRefFn = Callable[[bytes], tuple[int, OldUnpackedObject]]
100ProgressFn = Callable[[int, str], None]
101PackHint = tuple[int, Optional[bytes]]
104class UnresolvedDeltas(Exception):
105 """Delta objects could not be resolved."""
107 def __init__(self, shas) -> None:
108 self.shas = shas
111class ObjectContainer(Protocol):
112 def add_object(self, obj: ShaFile) -> None:
113 """Add a single object to this object store."""
115 def add_objects(
116 self,
117 objects: Sequence[tuple[ShaFile, Optional[str]]],
118 progress: Optional[Callable[[str], None]] = None,
119 ) -> None:
120 """Add a set of objects to this object store.
122 Args:
123 objects: Iterable over a list of (object, path) tuples
124 """
126 def __contains__(self, sha1: bytes) -> bool:
127 """Check if a hex sha is present."""
129 def __getitem__(self, sha1: bytes) -> ShaFile:
130 """Retrieve an object."""
133class PackedObjectContainer(ObjectContainer):
134 def get_unpacked_object(
135 self, sha1: bytes, *, include_comp: bool = False
136 ) -> "UnpackedObject":
137 """Get a raw unresolved object."""
138 raise NotImplementedError(self.get_unpacked_object)
140 def iterobjects_subset(
141 self, shas: Iterable[bytes], *, allow_missing: bool = False
142 ) -> Iterator[ShaFile]:
143 raise NotImplementedError(self.iterobjects_subset)
145 def iter_unpacked_subset(
146 self,
147 shas: set[bytes],
148 include_comp: bool = False,
149 allow_missing: bool = False,
150 convert_ofs_delta: bool = True,
151 ) -> Iterator["UnpackedObject"]:
152 raise NotImplementedError(self.iter_unpacked_subset)
155class UnpackedObjectStream:
156 def __iter__(self) -> Iterator["UnpackedObject"]:
157 raise NotImplementedError(self.__iter__)
159 def __len__(self) -> int:
160 raise NotImplementedError(self.__len__)
163def take_msb_bytes(
164 read: Callable[[int], bytes], crc32: Optional[int] = None
165) -> tuple[list[int], Optional[int]]:
166 """Read bytes marked with most significant bit.
168 Args:
169 read: Read function
170 """
171 ret: list[int] = []
172 while len(ret) == 0 or ret[-1] & 0x80:
173 b = read(1)
174 if crc32 is not None:
175 crc32 = binascii.crc32(b, crc32)
176 ret.append(ord(b[:1]))
177 return ret, crc32
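# For example (an illustrative sketch, not from the original source): the bytes
# b"\xb5\x0b" decode to two values, since the MSB of 0xB5 marks a continuation:
#
#     from io import BytesIO
#     take_msb_bytes(BytesIO(b"\xb5\x0b").read)   # -> ([0xB5, 0x0B], None)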
180class PackFileDisappeared(Exception):
181 def __init__(self, obj) -> None:
182 self.obj = obj
185class UnpackedObject:
186 """Class encapsulating an object unpacked from a pack file.
188 These objects should only be created from within unpack_object. Most
189 members start out as empty and are filled in at various points by
190 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
192 End users of this object should take care that the function they're getting
193 this object from is guaranteed to set the members they need.
194 """
196 __slots__ = [
197 "_sha", # Cached binary SHA.
198 "comp_chunks", # Compressed object chunks.
199 "crc32", # CRC32.
200 "decomp_chunks", # Decompressed object chunks.
201 "decomp_len", # Decompressed length of this object.
202 "delta_base", # Delta base offset or SHA.
203 "obj_chunks", # Decompressed and delta-resolved chunks.
204 "obj_type_num", # Type of this object.
205 "offset", # Offset in its pack.
206 "pack_type_num", # Type of this object in the pack (may be a delta).
207 ]
209 obj_type_num: Optional[int]
210 obj_chunks: Optional[list[bytes]]
211 delta_base: Union[None, bytes, int]
212 decomp_chunks: list[bytes]
213 comp_chunks: Optional[list[bytes]]
215 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
216 # methods of this object.
217 def __init__(
218 self,
219 pack_type_num,
220 *,
221 delta_base=None,
222 decomp_len=None,
223 crc32=None,
224 sha=None,
225 decomp_chunks=None,
226 offset=None,
227 ) -> None:
228 self.offset = offset
229 self._sha = sha
230 self.pack_type_num = pack_type_num
231 self.delta_base = delta_base
232 self.comp_chunks = None
233 self.decomp_chunks: list[bytes] = decomp_chunks or []
234 if decomp_chunks is not None and decomp_len is None:
235 self.decomp_len = sum(map(len, decomp_chunks))
236 else:
237 self.decomp_len = decomp_len
238 self.crc32 = crc32
240 if pack_type_num in DELTA_TYPES:
241 self.obj_type_num = None
242 self.obj_chunks = None
243 else:
244 self.obj_type_num = pack_type_num
245 self.obj_chunks = self.decomp_chunks
246 self.delta_base = delta_base
248 def sha(self):
249 """Return the binary SHA of this object."""
250 if self._sha is None:
251 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
252 return self._sha
254 def sha_file(self):
255 """Return a ShaFile from this object."""
256 assert self.obj_type_num is not None and self.obj_chunks is not None
257 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
259 # Only provided for backwards compatibility with code that expects either
260 # chunks or a delta tuple.
261 def _obj(self) -> OldUnpackedObject:
262 """Return the decompressed chunks, or (delta base, delta chunks)."""
263 if self.pack_type_num in DELTA_TYPES:
264 assert isinstance(self.delta_base, (bytes, int))
265 return (self.delta_base, self.decomp_chunks)
266 else:
267 return self.decomp_chunks
269 def __eq__(self, other):
270 if not isinstance(other, UnpackedObject):
271 return False
272 for slot in self.__slots__:
273 if getattr(self, slot) != getattr(other, slot):
274 return False
275 return True
277 def __ne__(self, other):
278 return not (self == other)
280 def __repr__(self) -> str:
281 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
282 return "{}({})".format(self.__class__.__name__, ", ".join(data))
285_ZLIB_BUFSIZE = 65536 # 64KB buffer for better I/O performance
288def read_zlib_chunks(
289 read_some: Callable[[int], bytes],
290 unpacked: UnpackedObject,
291 include_comp: bool = False,
292 buffer_size: int = _ZLIB_BUFSIZE,
293) -> bytes:
294 """Read zlib data from a buffer.
296 This function requires that the buffer have additional data following the
297 compressed data, which is guaranteed to be the case for git pack files.
299 Args:
300 read_some: Read function that returns at least one byte, but may
301 return less than the requested size.
302 unpacked: An UnpackedObject to write result data to. If its crc32
303 attr is not None, the CRC32 of the compressed bytes will be computed
304 using this starting CRC32.
305 After this function, will have the following attrs set:
306 * comp_chunks (if include_comp is True)
307 * decomp_chunks
308 * decomp_len
309 * crc32
310 include_comp: If True, include compressed data in the result.
311 buffer_size: Size of the read buffer.
312 Returns: Leftover unused data from the decompression.
314 Raises:
315 zlib.error: if a decompression error occurred.
316 """
317 if unpacked.decomp_len <= -1:
318 raise ValueError("non-negative zlib data stream size expected")
319 decomp_obj = zlib.decompressobj()
321 comp_chunks = []
322 decomp_chunks = unpacked.decomp_chunks
323 decomp_len = 0
324 crc32 = unpacked.crc32
326 while True:
327 add = read_some(buffer_size)
328 if not add:
329 raise zlib.error("EOF before end of zlib stream")
330 comp_chunks.append(add)
331 decomp = decomp_obj.decompress(add)
332 decomp_len += len(decomp)
333 decomp_chunks.append(decomp)
334 unused = decomp_obj.unused_data
335 if unused:
336 left = len(unused)
337 if crc32 is not None:
338 crc32 = binascii.crc32(add[:-left], crc32)
339 if include_comp:
340 comp_chunks[-1] = add[:-left]
341 break
342 elif crc32 is not None:
343 crc32 = binascii.crc32(add, crc32)
344 if crc32 is not None:
345 crc32 &= 0xFFFFFFFF
347 if decomp_len != unpacked.decomp_len:
348 raise zlib.error("decompressed data does not match expected size")
350 unpacked.crc32 = crc32
351 if include_comp:
352 unpacked.comp_chunks = comp_chunks
353 return unused
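# An illustrative sketch (the literals are assumptions, not from the original
# source): decompressing a zlib stream that is followed by further data, as is
# always the case inside a pack file:
#
#     import zlib
#     from io import BytesIO
#     unpacked = UnpackedObject(3, decomp_len=5)            # 3 = blob, 5 bytes
#     buf = BytesIO(zlib.compress(b"hello") + b"rest-of-pack")
#     read_zlib_chunks(buf.read, unpacked)                  # -> b"rest-of-pack"
#     b"".join(unpacked.decomp_chunks)                      # -> b"hello"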
356def iter_sha1(iter):
357 """Return the hexdigest of the SHA1 over a set of names.
359 Args:
360 iter: Iterator over string objects
361 Returns: 40-byte hex sha1 digest
362 """
363 sha = sha1()
364 for name in iter:
365 sha.update(name)
366 return sha.hexdigest().encode("ascii")
369def load_pack_index(path: Union[str, os.PathLike]):
370 """Load an index file by path.
372 Args:
373 path: Path to the index file
374 Returns: A PackIndex loaded from the given path
375 """
376 with GitFile(path, "rb") as f:
377 return load_pack_index_file(path, f)
380def _load_file_contents(f, size=None):
381 try:
382 fd = f.fileno()
383 except (UnsupportedOperation, AttributeError):
384 fd = None
385 # Attempt to use mmap if possible
386 if fd is not None:
387 if size is None:
388 size = os.fstat(fd).st_size
389 if has_mmap:
390 try:
391 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
392 except OSError:
393 # Perhaps a socket?
394 pass
395 else:
396 return contents, size
397 contents = f.read()
398 size = len(contents)
399 return contents, size
402def load_pack_index_file(path: Union[str, os.PathLike], f):
403 """Load an index file from a file-like object.
405 Args:
406 path: Path for the index file
407 f: File-like object
408 Returns: A PackIndex loaded from the given file
409 """
410 contents, size = _load_file_contents(f)
411 if contents[:4] == b"\377tOc":
412 version = struct.unpack(b">L", contents[4:8])[0]
413 if version == 2:
414 return PackIndex2(path, file=f, contents=contents, size=size)
415 elif version == 3:
416 return PackIndex3(path, file=f, contents=contents, size=size)
417 else:
418 raise KeyError(f"Unknown pack index format {version}")
419 else:
420 return PackIndex1(path, file=f, contents=contents, size=size)
423def bisect_find_sha(start, end, sha, unpack_name):
424 """Find a SHA in a data blob with sorted SHAs.
426 Args:
427 start: Start index of range to search
428 end: End index of range to search
429 sha: Sha to find
430 unpack_name: Callback to retrieve SHA by index
431 Returns: Index of the SHA, or None if it wasn't found
432 """
433 assert start <= end
434 while start <= end:
435 i = (start + end) // 2
436 file_sha = unpack_name(i)
437 if file_sha < sha:
438 start = i + 1
439 elif file_sha > sha:
440 end = i - 1
441 else:
442 return i
443 return None
446PackIndexEntry = tuple[bytes, int, Optional[int]]
449class PackIndex:
450 """An index in to a packfile.
452 Given a sha id of an object a pack index can tell you the location in the
453 packfile of that object if it has it.
454 """
456 # Default to SHA-1 for backward compatibility
457 hash_algorithm = 1
458 hash_size = 20
460 def __eq__(self, other):
461 if not isinstance(other, PackIndex):
462 return False
464 for (name1, _, _), (name2, _, _) in zip(
465 self.iterentries(), other.iterentries()
466 ):
467 if name1 != name2:
468 return False
469 return True
471 def __ne__(self, other):
472 return not self.__eq__(other)
474 def __len__(self) -> int:
475 """Return the number of entries in this pack index."""
476 raise NotImplementedError(self.__len__)
478 def __iter__(self) -> Iterator[bytes]:
479 """Iterate over the SHAs in this pack."""
480 return map(sha_to_hex, self._itersha())
482 def iterentries(self) -> Iterator[PackIndexEntry]:
483 """Iterate over the entries in this pack index.
485 Returns: iterator over tuples with object name, offset in packfile and
486 crc32 checksum.
487 """
488 raise NotImplementedError(self.iterentries)
490 def get_pack_checksum(self) -> bytes:
491 """Return the SHA1 checksum stored for the corresponding packfile.
493 Returns: 20-byte binary digest
494 """
495 raise NotImplementedError(self.get_pack_checksum)
497 @replace_me(since="0.21.0", remove_in="0.23.0")
498 def object_index(self, sha: bytes) -> int:
499 return self.object_offset(sha)
501 def object_offset(self, sha: bytes) -> int:
502 """Return the offset in to the corresponding packfile for the object.
504 Given the name of an object it will return the offset that object
505 lives at within the corresponding pack file. If the pack file doesn't
506 have the object then None will be returned.
507 """
508 raise NotImplementedError(self.object_offset)
510 def object_sha1(self, index: int) -> bytes:
511 """Return the SHA1 corresponding to the index in the pack file."""
512 for name, offset, crc32 in self.iterentries():
513 if offset == index:
514 return name
515 else:
516 raise KeyError(index)
518 def _object_offset(self, sha: bytes) -> int:
519 """See object_offset.
521 Args:
522 sha: A *binary* SHA string. (20 bytes long)
523 """
524 raise NotImplementedError(self._object_offset)
526 def objects_sha1(self) -> bytes:
527 """Return the hex SHA1 over all the shas of all objects in this pack.
529 Note: This is used for the filename of the pack.
530 """
531 return iter_sha1(self._itersha())
533 def _itersha(self) -> Iterator[bytes]:
534 """Yield all the SHA1's of the objects in the index, sorted."""
535 raise NotImplementedError(self._itersha)
537 def close(self) -> None:
538 pass
540 def check(self) -> None:
541 pass
544class MemoryPackIndex(PackIndex):
545 """Pack index that is stored entirely in memory."""
547 def __init__(self, entries, pack_checksum=None) -> None:
548 """Create a new MemoryPackIndex.
550 Args:
551 entries: Sequence of name, idx, crc32 (sorted)
552 pack_checksum: Optional pack checksum
553 """
554 self._by_sha = {}
555 self._by_offset = {}
556 for name, offset, crc32 in entries:
557 self._by_sha[name] = offset
558 self._by_offset[offset] = name
559 self._entries = entries
560 self._pack_checksum = pack_checksum
562 def get_pack_checksum(self):
563 return self._pack_checksum
565 def __len__(self) -> int:
566 return len(self._entries)
568 def object_offset(self, sha):
569 if len(sha) == 40:
570 sha = hex_to_sha(sha)
571 return self._by_sha[sha]
573 def object_sha1(self, offset):
574 return self._by_offset[offset]
576 def _itersha(self):
577 return iter(self._by_sha)
579 def iterentries(self):
580 return iter(self._entries)
582 @classmethod
583 def for_pack(cls, pack):
584 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
586 @classmethod
587 def clone(cls, other_index):
588 return cls(other_index.iterentries(), other_index.get_pack_checksum())
591class FilePackIndex(PackIndex):
592 """Pack index that is based on a file.
594 To look up an object the index uses a fan-out table: the first 256
595 4-byte entries, indexed by the first byte of the SHA. The value stored
596 for byte N is the end of the group of entries whose SHAs start with N;
597 the entry at N - 1 gives the start of that group. Within a group the
598 entries are sorted by SHA, so the start and end offsets delimit a
599 range that can be bisected to check whether the value is
600 present.
601 """
603 _fan_out_table: list[int]
605 def __init__(self, filename, file=None, contents=None, size=None) -> None:
606 """Create a pack index object.
608 Provide it with the name of the index file to consider, and it will map
609 it whenever required.
610 """
611 self._filename = filename
612 # Take the size now, so it can be checked each time we map the file to
613 # ensure that it hasn't changed.
614 if file is None:
615 self._file = GitFile(filename, "rb")
616 else:
617 self._file = file
618 if contents is None:
619 self._contents, self._size = _load_file_contents(self._file, size)
620 else:
621 self._contents, self._size = (contents, size)
623 @property
624 def path(self) -> str:
625 return self._filename
627 def __eq__(self, other):
628 # Quick optimization:
629 if (
630 isinstance(other, FilePackIndex)
631 and self._fan_out_table != other._fan_out_table
632 ):
633 return False
635 return super().__eq__(other)
637 def close(self) -> None:
638 self._file.close()
639 if getattr(self._contents, "close", None) is not None:
640 self._contents.close()
642 def __len__(self) -> int:
643 """Return the number of entries in this pack index."""
644 return self._fan_out_table[-1]
646 def _unpack_entry(self, i: int) -> PackIndexEntry:
647 """Unpack the i-th entry in the index file.
649 Returns: Tuple with object name (SHA), offset in pack file and CRC32
650 checksum (if known).
651 """
652 raise NotImplementedError(self._unpack_entry)
654 def _unpack_name(self, i) -> bytes:
655 """Unpack the i-th name from the index file."""
656 raise NotImplementedError(self._unpack_name)
658 def _unpack_offset(self, i) -> int:
659 """Unpack the i-th object offset from the index file."""
660 raise NotImplementedError(self._unpack_offset)
662 def _unpack_crc32_checksum(self, i) -> Optional[int]:
663 """Unpack the crc32 checksum for the ith object from the index file."""
664 raise NotImplementedError(self._unpack_crc32_checksum)
666 def _itersha(self) -> Iterator[bytes]:
667 for i in range(len(self)):
668 yield self._unpack_name(i)
670 def iterentries(self) -> Iterator[PackIndexEntry]:
671 """Iterate over the entries in this pack index.
673 Returns: iterator over tuples with object name, offset in packfile and
674 crc32 checksum.
675 """
676 for i in range(len(self)):
677 yield self._unpack_entry(i)
679 def _read_fan_out_table(self, start_offset: int):
680 ret = []
681 for i in range(0x100):
682 fanout_entry = self._contents[
683 start_offset + i * 4 : start_offset + (i + 1) * 4
684 ]
685 ret.append(struct.unpack(">L", fanout_entry)[0])
686 return ret
688 def check(self) -> None:
689 """Check that the stored checksum matches the actual checksum."""
690 actual = self.calculate_checksum()
691 stored = self.get_stored_checksum()
692 if actual != stored:
693 raise ChecksumMismatch(stored, actual)
695 def calculate_checksum(self) -> bytes:
696 """Calculate the SHA1 checksum over this pack index.
698 Returns: This is a 20-byte binary digest
699 """
700 return sha1(self._contents[:-20]).digest()
702 def get_pack_checksum(self) -> bytes:
703 """Return the SHA1 checksum stored for the corresponding packfile.
705 Returns: 20-byte binary digest
706 """
707 return bytes(self._contents[-40:-20])
709 def get_stored_checksum(self) -> bytes:
710 """Return the SHA1 checksum stored for this index.
712 Returns: 20-byte binary digest
713 """
714 return bytes(self._contents[-20:])
716 def object_offset(self, sha: bytes) -> int:
717 """Return the offset in to the corresponding packfile for the object.
719 Given the name of an object it will return the offset that object
720 lives at within the corresponding pack file. If the pack file doesn't
721 have the object then None will be returned.
722 """
723 if len(sha) == 40:
724 sha = hex_to_sha(sha)
725 try:
726 return self._object_offset(sha)
727 except ValueError as exc:
728 closed = getattr(self._contents, "closed", None)
729 if closed in (None, True):
730 raise PackFileDisappeared(self) from exc
731 raise
733 def _object_offset(self, sha: bytes) -> int:
734 """See object_offset.
736 Args:
737 sha: A *binary* SHA string. (20 bytes long)
738 """
739 assert len(sha) == 20
740 idx = ord(sha[:1])
741 if idx == 0:
742 start = 0
743 else:
744 start = self._fan_out_table[idx - 1]
745 end = self._fan_out_table[idx]
746 i = bisect_find_sha(start, end, sha, self._unpack_name)
747 if i is None:
748 raise KeyError(sha)
749 return self._unpack_offset(i)
751 def iter_prefix(self, prefix: bytes) -> Iterator[bytes]:
752 """Iterate over all SHA1s with the given prefix."""
753 start = ord(prefix[:1])
754 if start == 0:
755 start = 0
756 else:
757 start = self._fan_out_table[start - 1]
758 end = ord(prefix[:1]) + 1
759 if end == 0x100:
760 end = len(self)
761 else:
762 end = self._fan_out_table[end]
763 assert start <= end
764 started = False
765 for i in range(start, end):
766 name: bytes = self._unpack_name(i)
767 if name.startswith(prefix):
768 yield name
769 started = True
770 elif started:
771 break
774class PackIndex1(FilePackIndex):
775 """Version 1 Pack Index file."""
777 def __init__(
778 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
779 ) -> None:
780 super().__init__(filename, file, contents, size)
781 self.version = 1
782 self._fan_out_table = self._read_fan_out_table(0)
784 def _unpack_entry(self, i):
785 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
786 return (name, offset, None)
788 def _unpack_name(self, i):
789 offset = (0x100 * 4) + (i * 24) + 4
790 return self._contents[offset : offset + 20]
792 def _unpack_offset(self, i):
793 offset = (0x100 * 4) + (i * 24)
794 return unpack_from(">L", self._contents, offset)[0]
796 def _unpack_crc32_checksum(self, i) -> None:
797 # Not stored in v1 index files
798 return None
801class PackIndex2(FilePackIndex):
802 """Version 2 Pack Index file."""
804 def __init__(
805 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
806 ) -> None:
807 super().__init__(filename, file, contents, size)
808 if self._contents[:4] != b"\377tOc":
809 raise AssertionError("Not a v2 pack index file")
810 (self.version,) = unpack_from(b">L", self._contents, 4)
811 if self.version != 2:
812 raise AssertionError(f"Version was {self.version}")
813 self._fan_out_table = self._read_fan_out_table(8)
814 self._name_table_offset = 8 + 0x100 * 4
815 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
816 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
817 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
818 self
819 )
821 def _unpack_entry(self, i):
822 return (
823 self._unpack_name(i),
824 self._unpack_offset(i),
825 self._unpack_crc32_checksum(i),
826 )
828 def _unpack_name(self, i):
829 offset = self._name_table_offset + i * 20
830 return self._contents[offset : offset + 20]
832 def _unpack_offset(self, i):
833 offset = self._pack_offset_table_offset + i * 4
834 offset = unpack_from(">L", self._contents, offset)[0]
835 if offset & (2**31):
836 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
837 offset = unpack_from(">Q", self._contents, offset)[0]
838 return offset
840 def _unpack_crc32_checksum(self, i):
841 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
844class PackIndex3(FilePackIndex):
845 """Version 3 Pack Index file.
847 Supports variable hash sizes for SHA-1 (20 bytes) and SHA-256 (32 bytes).
848 """
850 def __init__(
851 self, filename: Union[str, os.PathLike], file=None, contents=None, size=None
852 ) -> None:
853 super().__init__(filename, file, contents, size)
854 if self._contents[:4] != b"\377tOc":
855 raise AssertionError("Not a v3 pack index file")
856 (self.version,) = unpack_from(b">L", self._contents, 4)
857 if self.version != 3:
858 raise AssertionError(f"Version was {self.version}")
860 # Read hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
861 (self.hash_algorithm,) = unpack_from(b">L", self._contents, 8)
862 if self.hash_algorithm == 1:
863 self.hash_size = 20 # SHA-1
864 elif self.hash_algorithm == 2:
865 self.hash_size = 32 # SHA-256
866 else:
867 raise AssertionError(f"Unknown hash algorithm {self.hash_algorithm}")
869 # Read length of shortened object names
870 (self.shortened_oid_len,) = unpack_from(b">L", self._contents, 12)
872 # Calculate offsets based on variable hash size
873 self._fan_out_table = self._read_fan_out_table(
874 16
875 ) # After header (4 + 4 + 4 + 4)
876 self._name_table_offset = 16 + 0x100 * 4
877 self._crc32_table_offset = self._name_table_offset + self.hash_size * len(self)
878 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
879 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
880 self
881 )
883 def _unpack_entry(self, i):
884 return (
885 self._unpack_name(i),
886 self._unpack_offset(i),
887 self._unpack_crc32_checksum(i),
888 )
890 def _unpack_name(self, i):
891 offset = self._name_table_offset + i * self.hash_size
892 return self._contents[offset : offset + self.hash_size]
894 def _unpack_offset(self, i):
895 offset = self._pack_offset_table_offset + i * 4
896 offset = unpack_from(">L", self._contents, offset)[0]
897 if offset & (2**31):
898 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
899 offset = unpack_from(">Q", self._contents, offset)[0]
900 return offset
902 def _unpack_crc32_checksum(self, i):
903 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
906def read_pack_header(read) -> tuple[int, int]:
907 """Read the header of a pack file.
909 Args:
910 read: Read function
911 Returns: Tuple of (pack version, number of objects).
912 Raises: AssertionError if the header is missing or malformed.
913 """
914 header = read(12)
915 if not header:
916 raise AssertionError("file too short to contain pack")
917 if header[:4] != b"PACK":
918 raise AssertionError(f"Invalid pack header {header!r}")
919 (version,) = unpack_from(b">L", header, 4)
920 if version not in (2, 3):
921 raise AssertionError(f"Version was {version}")
922 (num_objects,) = unpack_from(b">L", header, 8)
923 return (version, num_objects)
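# For instance (an illustrative sketch): the 12-byte header of an empty
# version-2 pack parses as (2, 0):
#
#     from io import BytesIO
#     header = b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 0)
#     read_pack_header(BytesIO(header).read)   # -> (2, 0)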
926def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
927 if isinstance(chunks, bytes):
928 return len(chunks)
929 else:
930 return sum(map(len, chunks))
933def unpack_object(
934 read_all: Callable[[int], bytes],
935 read_some: Optional[Callable[[int], bytes]] = None,
936 compute_crc32=False,
937 include_comp=False,
938 zlib_bufsize=_ZLIB_BUFSIZE,
939) -> tuple[UnpackedObject, bytes]:
940 """Unpack a Git object.
942 Args:
943 read_all: Read function that blocks until the number of requested
944 bytes are read.
945 read_some: Read function that returns at least one byte, but may not
946 return the number of bytes requested.
947 compute_crc32: If True, compute the CRC32 of the compressed data. If
948 False, the returned CRC32 will be None.
949 include_comp: If True, include compressed data in the result.
950 zlib_bufsize: An optional buffer size for zlib operations.
951 Returns: A tuple of (unpacked, unused), where unused is the unused data
952 leftover from decompression, and unpacked is an UnpackedObject with
953 the following attrs set:
955 * obj_chunks (for non-delta types)
956 * pack_type_num
957 * delta_base (for delta types)
958 * comp_chunks (if include_comp is True)
959 * decomp_chunks
960 * decomp_len
961 * crc32 (if compute_crc32 is True)
962 """
963 if read_some is None:
964 read_some = read_all
965 if compute_crc32:
966 crc32 = 0
967 else:
968 crc32 = None
970 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
971 type_num = (raw[0] >> 4) & 0x07
972 size = raw[0] & 0x0F
973 for i, byte in enumerate(raw[1:]):
974 size += (byte & 0x7F) << ((i * 7) + 4)
976 delta_base: Union[int, bytes, None]
977 raw_base = len(raw)
978 if type_num == OFS_DELTA:
979 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
980 raw_base += len(raw)
981 if raw[-1] & 0x80:
982 raise AssertionError
983 delta_base_offset = raw[0] & 0x7F
984 for byte in raw[1:]:
985 delta_base_offset += 1
986 delta_base_offset <<= 7
987 delta_base_offset += byte & 0x7F
988 delta_base = delta_base_offset
989 elif type_num == REF_DELTA:
990 delta_base_obj = read_all(20)
991 if crc32 is not None:
992 crc32 = binascii.crc32(delta_base_obj, crc32)
993 delta_base = delta_base_obj
994 raw_base += 20
995 else:
996 delta_base = None
998 unpacked = UnpackedObject(
999 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
1000 )
1001 unused = read_zlib_chunks(
1002 read_some,
1003 unpacked,
1004 buffer_size=zlib_bufsize,
1005 include_comp=include_comp,
1006 )
1007 return unpacked, unused
1010def _compute_object_size(value):
1011 """Compute the size of a unresolved object for use with LRUSizeCache."""
1012 (num, obj) = value
1013 if num in DELTA_TYPES:
1014 return chunks_length(obj[1])
1015 return chunks_length(obj)
1018class PackStreamReader:
1019 """Class to read a pack stream.
1021 The pack is read from a ReceivableProtocol using read() or recv() as
1022 appropriate.
1023 """
1025 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
1026 self.read_all = read_all
1027 if read_some is None:
1028 self.read_some = read_all
1029 else:
1030 self.read_some = read_some
1031 self.sha = sha1()
1032 self._offset = 0
1033 self._rbuf = BytesIO()
1034 # trailer is a deque to avoid memory allocation on small reads
1035 self._trailer: deque[bytes] = deque()
1036 self._zlib_bufsize = zlib_bufsize
1038 def _read(self, read, size):
1039 """Read up to size bytes using the given callback.
1041 As a side effect, update the verifier's hash (excluding the last 20
1042 bytes read).
1044 Args:
1045 read: The read callback to read from.
1046 size: The maximum number of bytes to read; the particular
1047 behavior is callback-specific.
1048 """
1049 data = read(size)
1051 # maintain a trailer of the last 20 bytes we've read
1052 n = len(data)
1053 self._offset += n
1054 tn = len(self._trailer)
1055 if n >= 20:
1056 to_pop = tn
1057 to_add = 20
1058 else:
1059 to_pop = max(n + tn - 20, 0)
1060 to_add = n
1061 self.sha.update(
1062 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
1063 )
1064 self._trailer.extend(data[-to_add:])
1066 # hash everything but the trailer
1067 self.sha.update(data[:-to_add])
1068 return data
1070 def _buf_len(self):
1071 buf = self._rbuf
1072 start = buf.tell()
1073 buf.seek(0, SEEK_END)
1074 end = buf.tell()
1075 buf.seek(start)
1076 return end - start
1078 @property
1079 def offset(self):
1080 return self._offset - self._buf_len()
1082 def read(self, size):
1083 """Read, blocking until size bytes are read."""
1084 buf_len = self._buf_len()
1085 if buf_len >= size:
1086 return self._rbuf.read(size)
1087 buf_data = self._rbuf.read()
1088 self._rbuf = BytesIO()
1089 return buf_data + self._read(self.read_all, size - buf_len)
1091 def recv(self, size):
1092 """Read up to size bytes, blocking until one byte is read."""
1093 buf_len = self._buf_len()
1094 if buf_len:
1095 data = self._rbuf.read(size)
1096 if size >= buf_len:
1097 self._rbuf = BytesIO()
1098 return data
1099 return self._read(self.read_some, size)
1101 def __len__(self) -> int:
1102 return self._num_objects
1104 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
1105 """Read the objects in this pack file.
1107 Args:
1108 compute_crc32: If True, compute the CRC32 of the compressed
1109 data. If False, the returned CRC32 will be None.
1110 Returns: Iterator over UnpackedObjects with the following members set:
1111 offset
1112 obj_type_num
1113 obj_chunks (for non-delta types)
1114 delta_base (for delta types)
1115 decomp_chunks
1116 decomp_len
1117 crc32 (if compute_crc32 is True)
1119 Raises:
1120 ChecksumMismatch: if the checksum of the pack contents does not
1121 match the checksum in the pack trailer.
1122 zlib.error: if an error occurred during zlib decompression.
1123 IOError: if an error occurred writing to the output file.
1124 """
1125 pack_version, self._num_objects = read_pack_header(self.read)
1127 for i in range(self._num_objects):
1128 offset = self.offset
1129 unpacked, unused = unpack_object(
1130 self.read,
1131 read_some=self.recv,
1132 compute_crc32=compute_crc32,
1133 zlib_bufsize=self._zlib_bufsize,
1134 )
1135 unpacked.offset = offset
1137 # prepend any unused data to current read buffer
1138 buf = BytesIO()
1139 buf.write(unused)
1140 buf.write(self._rbuf.read())
1141 buf.seek(0)
1142 self._rbuf = buf
1144 yield unpacked
1146 if self._buf_len() < 20:
1147 # If the read buffer already holds at least 20 bytes, the last read() got
1148 # the whole trailer off the wire. If not, it means there is still some of
1149 # the trailer to read. We need to read() all 20 bytes; N come from the
1150 # read buffer and (20 - N) come from the wire.
1151 self.read(20)
1153 pack_sha = bytearray(self._trailer) # type: ignore
1154 if pack_sha != self.sha.digest():
1155 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
1158class PackStreamCopier(PackStreamReader):
1159 """Class to verify a pack stream as it is being read.
1161 The pack is read from a ReceivableProtocol using read() or recv() as
1162 appropriate and written out to the given file-like object.
1163 """
1165 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
1166 """Initialize the copier.
1168 Args:
1169 read_all: Read function that blocks until the number of
1170 requested bytes are read.
1171 read_some: Read function that returns at least one byte, but may
1172 not return the number of bytes requested.
1173 outfile: File-like object to write output through.
1174 delta_iter: Optional DeltaChainIterator to record deltas as we
1175 read them.
1176 """
1177 super().__init__(read_all, read_some=read_some)
1178 self.outfile = outfile
1179 self._delta_iter = delta_iter
1181 def _read(self, read, size):
1182 """Read data from the read callback and write it to the file."""
1183 data = super()._read(read, size)
1184 self.outfile.write(data)
1185 return data
1187 def verify(self, progress=None) -> None:
1188 """Verify a pack stream and write it to the output file.
1190 See PackStreamReader.read_objects for a list of exceptions this may
1191 raise.
1192 """
1193 i = 0 # default count of entries if read_objects() is empty
1194 for i, unpacked in enumerate(self.read_objects()):
1195 if self._delta_iter:
1196 self._delta_iter.record(unpacked)
1197 if progress is not None:
1198 progress(f"copying pack entries: {i}/{len(self)}\r".encode("ascii"))
1199 if progress is not None:
1200 progress(f"copied {i} pack entries\n".encode("ascii"))
1203def obj_sha(type, chunks):
1204 """Compute the SHA for a numeric type and object chunks."""
1205 sha = sha1()
1206 sha.update(object_header(type, chunks_length(chunks)))
1207 if isinstance(chunks, bytes):
1208 sha.update(chunks)
1209 else:
1210 for chunk in chunks:
1211 sha.update(chunk)
1212 return sha.digest()
1215def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
1216 """Hash a portion of a file into a new SHA.
1218 Args:
1219 f: A file-like object to read from that supports seek().
1220 start_ofs: The offset in the file to start reading at.
1221 end_ofs: The offset in the file to end reading at, relative to the
1222 end of the file.
1223 buffer_size: A buffer size for reading.
1224 Returns: A new SHA object updated with data read from the file.
1225 """
1226 sha = sha1()
1227 f.seek(0, SEEK_END)
1228 length = f.tell()
1229 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1230 raise AssertionError(
1231 f"Attempt to read beyond file length. start_ofs: {start_ofs}, end_ofs: {end_ofs}, file length: {length}"
1232 )
1233 todo = length + end_ofs - start_ofs
1234 f.seek(start_ofs)
1235 while todo:
1236 data = f.read(min(todo, buffer_size))
1237 sha.update(data)
1238 todo -= len(data)
1239 return sha
1242class PackData:
1243 """The data contained in a packfile.
1245 Pack files can be accessed both sequentially for exploding a pack, and
1246 directly with the help of an index to retrieve a specific object.
1248 The objects within are either complete or a delta against another.
1250 The per-object header is variable length. If the MSB of a byte is set, the
1251 subsequent byte is still part of the header.
1252 In the first byte the three bits after the MSB give the type, which tells you
1253 the type of object and whether it is a delta. The low 4 bits are the lowest
1254 bits of the size. In each subsequent byte the low 7 bits are the next more
1255 significant bits of the size, i.e. the last header byte holds the MS bits.
1257 For the complete objects the data is stored as zlib deflated data.
1258 The size in the header is the uncompressed object size, so to uncompress
1259 you need to just keep feeding data to zlib until you get an object back,
1260 or it errors on bad data. This is done here by just giving the complete
1261 buffer from the start of the deflated object on. This is bad, but until I
1262 get mmap sorted out it will have to do.
1264 Currently there are no integrity checks done. Also no attempt is made to
1265 try and detect the delta case, or a request for an object at the wrong
1266 position. It will all just throw a zlib or KeyError.
1267 """
1269 def __init__(self, filename: Union[str, os.PathLike], file=None, size=None) -> None:
1270 """Create a PackData object representing the pack in the given filename.
1272 The file must exist and stay readable until the object is disposed of.
1273 It must also stay the same size. It will be mapped whenever needed.
1275 Currently there is a restriction on the size of the pack as the python
1276 mmap implementation is flawed.
1277 """
1278 self._filename = filename
1279 self._size = size
1280 self._header_size = 12
1281 if file is None:
1282 self._file = GitFile(self._filename, "rb")
1283 else:
1284 self._file = file
1285 (version, self._num_objects) = read_pack_header(self._file.read)
1286 self._offset_cache = LRUSizeCache[int, tuple[int, OldUnpackedObject]](
1287 1024 * 1024 * 20, compute_size=_compute_object_size
1288 )
1290 @property
1291 def filename(self):
1292 return os.path.basename(self._filename)
1294 @property
1295 def path(self):
1296 return self._filename
1298 @classmethod
1299 def from_file(cls, file, size=None):
1300 return cls(str(file), file=file, size=size)
1302 @classmethod
1303 def from_path(cls, path: Union[str, os.PathLike]):
1304 return cls(filename=path)
1306 def close(self) -> None:
1307 self._file.close()
1309 def __enter__(self):
1310 return self
1312 def __exit__(self, exc_type, exc_val, exc_tb):
1313 self.close()
1315 def __eq__(self, other):
1316 if isinstance(other, PackData):
1317 return self.get_stored_checksum() == other.get_stored_checksum()
1318 return False
1320 def _get_size(self):
1321 if self._size is not None:
1322 return self._size
1323 self._size = os.path.getsize(self._filename)
1324 if self._size < self._header_size:
1325 errmsg = f"{self._filename} is too small for a packfile ({self._size} < {self._header_size})"
1326 raise AssertionError(errmsg)
1327 return self._size
1329 def __len__(self) -> int:
1330 """Returns the number of objects in this pack."""
1331 return self._num_objects
1333 def calculate_checksum(self):
1334 """Calculate the checksum for this pack.
1336 Returns: 20-byte binary SHA1 digest
1337 """
1338 return compute_file_sha(self._file, end_ofs=-20).digest()
1340 def iter_unpacked(self, *, include_comp: bool = False):
1341 self._file.seek(self._header_size)
1343 if self._num_objects is None:
1344 return
1346 for _ in range(self._num_objects):
1347 offset = self._file.tell()
1348 unpacked, unused = unpack_object(
1349 self._file.read, compute_crc32=False, include_comp=include_comp
1350 )
1351 unpacked.offset = offset
1352 yield unpacked
1353 # Back up over unused data.
1354 self._file.seek(-len(unused), SEEK_CUR)
1356 def iterentries(
1357 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1358 ):
1359 """Yield entries summarizing the contents of this pack.
1361 Args:
1362 progress: Progress function, called with current and total
1363 object count.
1364 Returns: iterator of tuples with (sha, offset, crc32)
1365 """
1366 num_objects = self._num_objects
1367 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1368 for i, result in enumerate(indexer):
1369 if progress is not None:
1370 progress(i, num_objects)
1371 yield result
1373 def sorted_entries(
1374 self,
1375 progress: Optional[ProgressFn] = None,
1376 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1377 ):
1378 """Return entries in this pack, sorted by SHA.
1380 Args:
1381 progress: Progress function, called with current and total
1382 object count
1383 Returns: Iterator of tuples with (sha, offset, crc32)
1384 """
1385 return sorted(
1386 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
1387 )
1389 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
1390 """Create a version 1 file for this data file.
1392 Args:
1393 filename: Index filename.
1394 progress: Progress report function
1395 Returns: Checksum of index file
1396 """
1397 entries = self.sorted_entries(
1398 progress=progress, resolve_ext_ref=resolve_ext_ref
1399 )
1400 with GitFile(filename, "wb") as f:
1401 return write_pack_index_v1(f, entries, self.calculate_checksum())
1403 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
1404 """Create a version 2 index file for this data file.
1406 Args:
1407 filename: Index filename.
1408 progress: Progress report function
1409 Returns: Checksum of index file
1410 """
1411 entries = self.sorted_entries(
1412 progress=progress, resolve_ext_ref=resolve_ext_ref
1413 )
1414 with GitFile(filename, "wb") as f:
1415 return write_pack_index_v2(f, entries, self.calculate_checksum())
1417 def create_index_v3(
1418 self, filename, progress=None, resolve_ext_ref=None, hash_algorithm=1
1419 ):
1420 """Create a version 3 index file for this data file.
1422 Args:
1423 filename: Index filename.
1424 progress: Progress report function
1425 resolve_ext_ref: Function to resolve external references
1426 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
1427 Returns: Checksum of index file
1428 """
1429 entries = self.sorted_entries(
1430 progress=progress, resolve_ext_ref=resolve_ext_ref
1431 )
1432 with GitFile(filename, "wb") as f:
1433 return write_pack_index_v3(
1434 f, entries, self.calculate_checksum(), hash_algorithm
1435 )
1437 def create_index(
1438 self, filename, progress=None, version=2, resolve_ext_ref=None, hash_algorithm=1
1439 ):
1440 """Create an index file for this data file.
1442 Args:
1443 filename: Index filename.
1444 progress: Progress report function
1445 version: Index version (1, 2, or 3)
1446 resolve_ext_ref: Function to resolve external references
1447 hash_algorithm: Hash algorithm identifier for v3 (1 = SHA-1, 2 = SHA-256)
1448 Returns: Checksum of index file
1449 """
1450 if version == 1:
1451 return self.create_index_v1(
1452 filename, progress, resolve_ext_ref=resolve_ext_ref
1453 )
1454 elif version == 2:
1455 return self.create_index_v2(
1456 filename, progress, resolve_ext_ref=resolve_ext_ref
1457 )
1458 elif version == 3:
1459 return self.create_index_v3(
1460 filename,
1461 progress,
1462 resolve_ext_ref=resolve_ext_ref,
1463 hash_algorithm=hash_algorithm,
1464 )
1465 else:
1466 raise ValueError(f"unknown index format {version}")
1468 def get_stored_checksum(self):
1469 """Return the expected checksum stored in this pack."""
1470 self._file.seek(-20, SEEK_END)
1471 return self._file.read(20)
1473 def check(self) -> None:
1474 """Check the consistency of this pack."""
1475 actual = self.calculate_checksum()
1476 stored = self.get_stored_checksum()
1477 if actual != stored:
1478 raise ChecksumMismatch(stored, actual)
1480 def get_unpacked_object_at(
1481 self, offset: int, *, include_comp: bool = False
1482 ) -> UnpackedObject:
1483 """Given offset in the packfile return a UnpackedObject."""
1484 assert offset >= self._header_size
1485 self._file.seek(offset)
1486 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1487 unpacked.offset = offset
1488 return unpacked
1490 def get_object_at(self, offset: int) -> tuple[int, OldUnpackedObject]:
1491 """Given an offset in to the packfile return the object that is there.
1493 Using the associated index the location of an object can be looked up,
1494 and then the packfile can be asked directly for that object using this
1495 function.
1496 """
1497 try:
1498 return self._offset_cache[offset]
1499 except KeyError:
1500 pass
1501 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1502 return (unpacked.pack_type_num, unpacked._obj())
1505T = TypeVar("T")
1508class DeltaChainIterator(Generic[T]):
1509 """Abstract iterator over pack data based on delta chains.
1511 Each object in the pack is guaranteed to be inflated exactly once,
1512 regardless of how many objects reference it as a delta base. As a result,
1513 memory usage is proportional to the length of the longest delta chain.
1515 Subclasses can override _result to define the result type of the iterator.
1516 By default, results are UnpackedObjects with the following members set:
1518 * offset
1519 * obj_type_num
1520 * obj_chunks
1521 * pack_type_num
1522 * delta_base (for delta types)
1523 * comp_chunks (if _include_comp is True)
1524 * decomp_chunks
1525 * decomp_len
1526 * crc32 (if _compute_crc32 is True)
1527 """
1529 _compute_crc32 = False
1530 _include_comp = False
1532 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
1533 self._file = file_obj
1534 self._resolve_ext_ref = resolve_ext_ref
1535 self._pending_ofs: dict[int, list[int]] = defaultdict(list)
1536 self._pending_ref: dict[bytes, list[int]] = defaultdict(list)
1537 self._full_ofs: list[tuple[int, int]] = []
1538 self._ext_refs: list[bytes] = []
1540 @classmethod
1541 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
1542 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1543 walker.set_pack_data(pack_data)
1544 for unpacked in pack_data.iter_unpacked(include_comp=False):
1545 walker.record(unpacked)
1546 return walker
1548 @classmethod
1549 def for_pack_subset(
1550 cls,
1551 pack: "Pack",
1552 shas: Iterable[bytes],
1553 *,
1554 allow_missing: bool = False,
1555 resolve_ext_ref=None,
1556 ):
1557 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1558 walker.set_pack_data(pack.data)
1559 todo = set()
1560 for sha in shas:
1561 assert isinstance(sha, bytes)
1562 try:
1563 off = pack.index.object_offset(sha)
1564 except KeyError:
1565 if not allow_missing:
1566 raise
1567 else:
1568 todo.add(off)
1569 done = set()
1570 while todo:
1571 off = todo.pop()
1572 unpacked = pack.data.get_unpacked_object_at(off)
1573 walker.record(unpacked)
1574 done.add(off)
1575 base_ofs = None
1576 if unpacked.pack_type_num == OFS_DELTA:
1577 base_ofs = unpacked.offset - unpacked.delta_base
1578 elif unpacked.pack_type_num == REF_DELTA:
1579 with suppress(KeyError):
1580 assert isinstance(unpacked.delta_base, bytes)
1581 base_ofs = pack.index.object_index(unpacked.delta_base)
1582 if base_ofs is not None and base_ofs not in done:
1583 todo.add(base_ofs)
1584 return walker
1586 def record(self, unpacked: UnpackedObject) -> None:
1587 type_num = unpacked.pack_type_num
1588 offset = unpacked.offset
1589 if type_num == OFS_DELTA:
1590 base_offset = offset - unpacked.delta_base
1591 self._pending_ofs[base_offset].append(offset)
1592 elif type_num == REF_DELTA:
1593 assert isinstance(unpacked.delta_base, bytes)
1594 self._pending_ref[unpacked.delta_base].append(offset)
1595 else:
1596 self._full_ofs.append((offset, type_num))
1598 def set_pack_data(self, pack_data: PackData) -> None:
1599 self._file = pack_data._file
1601 def _walk_all_chains(self):
1602 for offset, type_num in self._full_ofs:
1603 yield from self._follow_chain(offset, type_num, None)
1604 yield from self._walk_ref_chains()
1605 assert not self._pending_ofs, repr(self._pending_ofs)
1607 def _ensure_no_pending(self) -> None:
1608 if self._pending_ref:
1609 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
1611 def _walk_ref_chains(self):
1612 if not self._resolve_ext_ref:
1613 self._ensure_no_pending()
1614 return
1616 for base_sha, pending in sorted(self._pending_ref.items()):
1617 if base_sha not in self._pending_ref:
1618 continue
1619 try:
1620 type_num, chunks = self._resolve_ext_ref(base_sha)
1621 except KeyError:
1622 # Not an external ref, but may depend on one. Either it will
1623 # get popped via a _follow_chain call, or we will raise an
1624 # error below.
1625 continue
1626 self._ext_refs.append(base_sha)
1627 self._pending_ref.pop(base_sha)
1628 for new_offset in pending:
1629 yield from self._follow_chain(new_offset, type_num, chunks)
1631 self._ensure_no_pending()
1633 def _result(self, unpacked: UnpackedObject) -> T:
1634 raise NotImplementedError
1636 def _resolve_object(
1637 self, offset: int, obj_type_num: int, base_chunks: list[bytes]
1638 ) -> UnpackedObject:
1639 self._file.seek(offset)
1640 unpacked, _ = unpack_object(
1641 self._file.read,
1642 include_comp=self._include_comp,
1643 compute_crc32=self._compute_crc32,
1644 )
1645 unpacked.offset = offset
1646 if base_chunks is None:
1647 assert unpacked.pack_type_num == obj_type_num
1648 else:
1649 assert unpacked.pack_type_num in DELTA_TYPES
1650 unpacked.obj_type_num = obj_type_num
1651 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
1652 return unpacked
1654 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: list[bytes]):
1655 # Unlike PackData.get_object_at, there is no need to cache offsets as
1656 # this approach by design inflates each object exactly once.
1657 todo = [(offset, obj_type_num, base_chunks)]
1658 while todo:
1659 (offset, obj_type_num, base_chunks) = todo.pop()
1660 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1661 yield self._result(unpacked)
1663 unblocked = chain(
1664 self._pending_ofs.pop(unpacked.offset, []),
1665 self._pending_ref.pop(unpacked.sha(), []),
1666 )
1667 todo.extend(
1668 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
1669 for new_offset in unblocked
1670 )
1672 def __iter__(self) -> Iterator[T]:
1673 return self._walk_all_chains()
1675 def ext_refs(self):
1676 return self._ext_refs
1679class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
1680 """Delta chain iterator that yield unpacked objects."""
1682 def _result(self, unpacked):
1683 return unpacked
1686class PackIndexer(DeltaChainIterator[PackIndexEntry]):
1687 """Delta chain iterator that yields index entries."""
1689 _compute_crc32 = True
1691 def _result(self, unpacked):
1692 return unpacked.sha(), unpacked.offset, unpacked.crc32
1695class PackInflater(DeltaChainIterator[ShaFile]):
1696 """Delta chain iterator that yields ShaFile objects."""
1698 def _result(self, unpacked):
1699 return unpacked.sha_file()
1702class SHA1Reader(BinaryIO):
1703 """Wrapper for file-like object that remembers the SHA1 of its data."""
1705 def __init__(self, f) -> None:
1706 self.f = f
1707 self.sha1 = sha1(b"")
1709 def read(self, size: int = -1) -> bytes:
1710 data = self.f.read(size)
1711 self.sha1.update(data)
1712 return data
1714 def check_sha(self, allow_empty: bool = False) -> None:
1715 stored = self.f.read(20)
1716 # If git option index.skipHash is set the index will be empty
1717 if stored != self.sha1.digest() and (
1718 not allow_empty
1719 or sha_to_hex(stored) != b"0000000000000000000000000000000000000000"
1720 ):
1721 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1723 def close(self):
1724 return self.f.close()
1726 def tell(self) -> int:
1727 return self.f.tell()
1729 # BinaryIO abstract methods
1730 def readable(self) -> bool:
1731 return True
1733 def writable(self) -> bool:
1734 return False
1736 def seekable(self) -> bool:
1737 return getattr(self.f, "seekable", lambda: False)()
1739 def seek(self, offset: int, whence: int = 0) -> int:
1740 return self.f.seek(offset, whence)
1742 def flush(self) -> None:
1743 if hasattr(self.f, "flush"):
1744 self.f.flush()
1746 def readline(self, size: int = -1) -> bytes:
1747 return self.f.readline(size)
1749 def readlines(self, hint: int = -1) -> list[bytes]:
1750 return self.f.readlines(hint)
1752 def writelines(self, lines) -> None:
1753 raise UnsupportedOperation("writelines")
1755 def write(self, data) -> int:
1756 raise UnsupportedOperation("write")
1758 def __enter__(self):
1759 return self
1761 def __exit__(self, type, value, traceback):
1762 self.close()
1764 def __iter__(self):
1765 return self
1767 def __next__(self) -> bytes:
1768 line = self.readline()
1769 if not line:
1770 raise StopIteration
1771 return line
1773 def fileno(self) -> int:
1774 return self.f.fileno()
1776 def isatty(self) -> bool:
1777 return getattr(self.f, "isatty", lambda: False)()
1779 def truncate(self, size: Optional[int] = None) -> int:
1780 raise UnsupportedOperation("truncate")
1783class SHA1Writer(BinaryIO):
1784 """Wrapper for file-like object that remembers the SHA1 of its data."""
1786 def __init__(self, f) -> None:
1787 self.f = f
1788 self.length = 0
1789 self.sha1 = sha1(b"")
1791 def write(self, data) -> int:
1792 self.sha1.update(data)
1793 self.f.write(data)
1794 self.length += len(data)
1795 return len(data)
1797 def write_sha(self):
1798 sha = self.sha1.digest()
1799 assert len(sha) == 20
1800 self.f.write(sha)
1801 self.length += len(sha)
1802 return sha
1804 def close(self):
1805 sha = self.write_sha()
1806 self.f.close()
1807 return sha
1809 def offset(self):
1810 return self.length
1812 def tell(self) -> int:
1813 return self.f.tell()
1815 # BinaryIO abstract methods
1816 def readable(self) -> bool:
1817 return False
1819 def writable(self) -> bool:
1820 return True
1822 def seekable(self) -> bool:
1823 return getattr(self.f, "seekable", lambda: False)()
1825 def seek(self, offset: int, whence: int = 0) -> int:
1826 return self.f.seek(offset, whence)
1828 def flush(self) -> None:
1829 if hasattr(self.f, "flush"):
1830 self.f.flush()
1832 def readline(self, size: int = -1) -> bytes:
1833 raise UnsupportedOperation("readline")
1835 def readlines(self, hint: int = -1) -> list[bytes]:
1836 raise UnsupportedOperation("readlines")
1838 def writelines(self, lines) -> None:
1839 for line in lines:
1840 self.write(line)
1842 def read(self, size: int = -1) -> bytes:
1843 raise UnsupportedOperation("read")
1845 def __enter__(self):
1846 return self
1848 def __exit__(self, type, value, traceback):
1849 self.close()
1851 def __iter__(self):
1852 return self
1854 def __next__(self) -> bytes:
1855 raise UnsupportedOperation("__next__")
1857 def fileno(self) -> int:
1858 return self.f.fileno()
1860 def isatty(self) -> bool:
1861 return getattr(self.f, "isatty", lambda: False)()
1863 def truncate(self, size: Optional[int] = None) -> int:
1864 raise UnsupportedOperation("truncate")
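# Editor's illustrative sketch (not part of the original module): SHA1Writer
# hashes everything written through it, so the trailing 20-byte checksum of a
# pack or index file can be appended with write_sha(). The BytesIO target is
# an assumption of the example.
#
#     out = SHA1Writer(BytesIO())
#     out.write(b"PACK")
#     trailer = out.write_sha()  # SHA-1 digest of everything written so far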
1867def pack_object_header(type_num, delta_base, size):
1868 """Create a pack object header for the given object info.
1870 Args:
1871 type_num: Numeric type of the object.
1872 delta_base: Delta base offset or ref, or None for whole objects.
1873 size: Uncompressed object size.
1874 Returns: A header for a packed object.
1875 """
1876 header = []
1877 c = (type_num << 4) | (size & 15)
1878 size >>= 4
1879 while size:
1880 header.append(c | 0x80)
1881 c = size & 0x7F
1882 size >>= 7
1883 header.append(c)
1884 if type_num == OFS_DELTA:
1885 ret = [delta_base & 0x7F]
1886 delta_base >>= 7
1887 while delta_base:
1888 delta_base -= 1
1889 ret.insert(0, 0x80 | (delta_base & 0x7F))
1890 delta_base >>= 7
1891 header.extend(ret)
1892 elif type_num == REF_DELTA:
1893 assert len(delta_base) == 20
1894 header += delta_base
1895 return bytearray(header)
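# Worked example (editor's note, not part of the original module): per the
# encoding above, a 100-byte non-delta blob (type 3) produces a two-byte
# header: 0xb4 carries the continuation bit, the type and the low four size
# bits; 0x06 carries the remaining size bits.
#
#     bytes(pack_object_header(3, None, 100)) == b"\xb4\x06"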
1898def pack_object_chunks(type, object, compression_level=-1):
1899 """Generate chunks for a pack object.
1901 Args:
1902 type: Numeric type of the object
1903 object: Object to write
1904 compression_level: the zlib compression level
1905 Returns: Chunks
1906 """
1907 if type in DELTA_TYPES:
1908 delta_base, object = object
1909 else:
1910 delta_base = None
1911 if isinstance(object, bytes):
1912 object = [object]
1913 yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
1914 compressor = zlib.compressobj(level=compression_level)
1915 for data in object:
1916 yield compressor.compress(data)
1917 yield compressor.flush()
1920def write_pack_object(write, type, object, sha=None, compression_level=-1):
1921 """Write pack object to a file.
1923 Args:
1924 write: Write function to use
1925 type: Numeric type of the object
1926 object: Object to write
1927 compression_level: the zlib compression level
1928 Returns: CRC32 checksum of the data written
1929 """
1930 crc32 = 0
1931 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
1932 write(chunk)
1933 if sha is not None:
1934 sha.update(chunk)
1935 crc32 = binascii.crc32(chunk, crc32)
1936 return crc32 & 0xFFFFFFFF
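# Editor's illustrative sketch (not part of the original module): serializing
# one full (non-delta) object into an in-memory buffer. The type number 3
# (blob) and the payload are assumptions of the example.
#
#     buf = BytesIO()
#     crc = write_pack_object(buf.write, 3, b"example payload")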
1939def write_pack(
1940 filename,
1941 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
1942 *,
1943 deltify: Optional[bool] = None,
1944 delta_window_size: Optional[int] = None,
1945 compression_level: int = -1,
1946):
1947 """Write a new pack data file.
1949 Args:
1950 filename: Path to the new pack file (without .pack extension)
1951 delta_window_size: Delta window size
1952 deltify: Whether to deltify pack objects
1953 compression_level: the zlib compression level
1954 Returns: Tuple with checksum of pack file and index file
1955 """
1956 with GitFile(filename + ".pack", "wb") as f:
1957 entries, data_sum = write_pack_objects(
1958 f.write,
1959 objects,
1960 delta_window_size=delta_window_size,
1961 deltify=deltify,
1962 compression_level=compression_level,
1963 )
1964 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1965 with GitFile(filename + ".idx", "wb") as f:
1966 return data_sum, write_pack_index(f, entries, data_sum)
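# Editor's illustrative sketch (not part of the original module): writing a
# tiny pack plus its index to disk. The path, the Blob import and its content
# are assumptions of the example.
#
#     from dulwich.objects import Blob
#     blob = Blob.from_string(b"example content")
#     data_sum, idx_sum = write_pack("/tmp/pack-example", [(blob, None)])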
1969def pack_header_chunks(num_objects):
1970 """Yield chunks for a pack header."""
1971 yield b"PACK" # Pack header
1972 yield struct.pack(b">L", 2) # Pack version
1973 yield struct.pack(b">L", num_objects) # Number of objects in pack
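# Worked example (editor's note, not part of the original module): a pack
# holding three objects starts with this fixed 12-byte header.
#
#     b"".join(pack_header_chunks(3)) == b"PACK\x00\x00\x00\x02\x00\x00\x00\x03"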
1976def write_pack_header(write, num_objects) -> None:
1977 """Write a pack header for the given number of objects."""
1978 if hasattr(write, "write"):
1979 write = write.write
1980 warnings.warn(
1981 "write_pack_header() now takes a write rather than file argument",
1982 DeprecationWarning,
1983 stacklevel=2,
1984 )
1985 for chunk in pack_header_chunks(num_objects):
1986 write(chunk)
1989def find_reusable_deltas(
1990 container: PackedObjectContainer,
1991 object_ids: set[bytes],
1992 *,
1993 other_haves: Optional[set[bytes]] = None,
1994 progress=None,
1995) -> Iterator[UnpackedObject]:
1996 if other_haves is None:
1997 other_haves = set()
1998 reused = 0
1999 for i, unpacked in enumerate(
2000 container.iter_unpacked_subset(
2001 object_ids, allow_missing=True, convert_ofs_delta=True
2002 )
2003 ):
2004 if progress is not None and i % 1000 == 0:
2005 progress(f"checking for reusable deltas: {i}/{len(object_ids)}\r".encode())
2006 if unpacked.pack_type_num == REF_DELTA:
2007 hexsha = sha_to_hex(unpacked.delta_base) # type: ignore
2008 if hexsha in object_ids or hexsha in other_haves:
2009 yield unpacked
2010 reused += 1
2011 if progress is not None:
2012 progress((f"found {reused} deltas to reuse\n").encode())
2015def deltify_pack_objects(
2016 objects: Union[Iterator[bytes], Iterator[tuple[ShaFile, Optional[bytes]]]],
2017 *,
2018 window_size: Optional[int] = None,
2019 progress=None,
2020) -> Iterator[UnpackedObject]:
2021 """Generate deltas for pack objects.
2023 Args:
2024 objects: An iterable of ShaFile objects or (object, path) tuples to deltify.
2025 window_size: Window size; None for default
2026 Returns: Iterator over UnpackedObject entries;
2027 delta_base is None for full-text entries
2028 """
2030 def objects_with_hints():
2031 for e in objects:
2032 if isinstance(e, ShaFile):
2033 yield (e, (e.type_num, None))
2034 else:
2035 yield (e[0], (e[0].type_num, e[1]))
2037 yield from deltas_from_sorted_objects(
2038 sort_objects_for_delta(objects_with_hints()),
2039 window_size=window_size,
2040 progress=progress,
2041 )
2044def sort_objects_for_delta(
2045 objects: Union[Iterator[ShaFile], Iterator[tuple[ShaFile, Optional[PackHint]]]],
2046) -> Iterator[ShaFile]:
2047 magic = []
2048 for entry in objects:
2049 if isinstance(entry, tuple):
2050 obj, hint = entry
2051 if hint is None:
2052 type_num = None
2053 path = None
2054 else:
2055 (type_num, path) = hint
2056 else:
2057 obj, type_num, path = entry, None, None  # bare object: no hint to carry over
2058 magic.append((type_num, path, -obj.raw_length(), obj))
2059 # Build a list of objects ordered by the magic Linus heuristic
2060 # This helps us find good objects to diff against
2061 magic.sort()
2062 return (x[3] for x in magic)
2065def deltas_from_sorted_objects(
2066 objects, window_size: Optional[int] = None, progress=None
2067):
2068 # TODO(jelmer): Use threads
2069 if window_size is None:
2070 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
2072 possible_bases: deque[tuple[bytes, int, list[bytes]]] = deque()
2073 for i, o in enumerate(objects):
2074 if progress is not None and i % 1000 == 0:
2075 progress((f"generating deltas: {i}\r").encode())
2076 raw = o.as_raw_chunks()
2077 winner = raw
2078 winner_len = sum(map(len, winner))
2079 winner_base = None
2080 for base_id, base_type_num, base in possible_bases:
2081 if base_type_num != o.type_num:
2082 continue
2083 delta_len = 0
2084 delta = []
2085 for chunk in create_delta(base, raw):
2086 delta_len += len(chunk)
2087 if delta_len >= winner_len:
2088 break
2089 delta.append(chunk)
2090 else:
2091 winner_base = base_id
2092 winner = delta
2093 winner_len = sum(map(len, winner))
2094 yield UnpackedObject(
2095 o.type_num,
2096 sha=o.sha().digest(),
2097 delta_base=winner_base,
2098 decomp_len=winner_len,
2099 decomp_chunks=winner,
2100 )
2101 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
2102 while len(possible_bases) > window_size:
2103 possible_bases.pop()
2106def pack_objects_to_data(
2107 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2108 *,
2109 deltify: Optional[bool] = None,
2110 delta_window_size: Optional[int] = None,
2111 ofs_delta: bool = True,
2112 progress=None,
2113) -> tuple[int, Iterator[UnpackedObject]]:
2114 """Create pack data from objects.
2116 Args:
2117 objects: Sequence of ShaFile objects or (object, path) tuples to pack
2118 Returns: Tuple of (number of objects, iterator over UnpackedObject entries)
2119 """
2120 # TODO(jelmer): support deltaifying
2121 count = len(objects)
2122 if deltify is None:
2123 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2124 # slow at the moment.
2125 deltify = False
2126 if deltify:
2127 return (
2128 count,
2129 deltify_pack_objects(
2130 iter(objects), # type: ignore
2131 window_size=delta_window_size,
2132 progress=progress,
2133 ),
2134 )
2135 else:
2137 def iter_without_path():
2138 for o in objects:
2139 if isinstance(o, tuple):
2140 yield full_unpacked_object(o[0])
2141 else:
2142 yield full_unpacked_object(o)
2144 return (count, iter_without_path())
2147def generate_unpacked_objects(
2148 container: PackedObjectContainer,
2149 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2150 delta_window_size: Optional[int] = None,
2151 deltify: Optional[bool] = None,
2152 reuse_deltas: bool = True,
2153 ofs_delta: bool = True,
2154 other_haves: Optional[set[bytes]] = None,
2155 progress=None,
2156) -> Iterator[UnpackedObject]:
2157 """Create pack data from objects.
2159 Returns: Iterator over UnpackedObject entries
2160 """
2161 todo = dict(object_ids)
2162 if reuse_deltas:
2163 for unpack in find_reusable_deltas(
2164 container, set(todo), other_haves=other_haves, progress=progress
2165 ):
2166 del todo[sha_to_hex(unpack.sha())]
2167 yield unpack
2168 if deltify is None:
2169 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
2170 # slow at the moment.
2171 deltify = False
2172 if deltify:
2173 objects_to_delta = container.iterobjects_subset(
2174 todo.keys(), allow_missing=False
2175 )
2176 yield from deltas_from_sorted_objects(
2177 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
2178 window_size=delta_window_size,
2179 progress=progress,
2180 )
2181 else:
2182 for oid in todo:
2183 yield full_unpacked_object(container[oid])
2186def full_unpacked_object(o: ShaFile) -> UnpackedObject:
2187 return UnpackedObject(
2188 o.type_num,
2189 delta_base=None,
2190 crc32=None,
2191 decomp_chunks=o.as_raw_chunks(),
2192 sha=o.sha().digest(),
2193 )
2196def write_pack_from_container(
2197 write,
2198 container: PackedObjectContainer,
2199 object_ids: Sequence[tuple[ObjectID, Optional[PackHint]]],
2200 delta_window_size: Optional[int] = None,
2201 deltify: Optional[bool] = None,
2202 reuse_deltas: bool = True,
2203 compression_level: int = -1,
2204 other_haves: Optional[set[bytes]] = None,
2205):
2206 """Write a new pack data file.
2208 Args:
2209 write: write function to use
2210 container: PackedObjectContainer
2211 delta_window_size: Sliding window size for searching for deltas;
2212 Set to None for default window size.
2213 deltify: Whether to deltify objects
2214 compression_level: the zlib compression level to use
2215 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2216 """
2217 pack_contents_count = len(object_ids)
2218 pack_contents = generate_unpacked_objects(
2219 container,
2220 object_ids,
2221 delta_window_size=delta_window_size,
2222 deltify=deltify,
2223 reuse_deltas=reuse_deltas,
2224 other_haves=other_haves,
2225 )
2227 return write_pack_data(
2228 write,
2229 pack_contents,
2230 num_records=pack_contents_count,
2231 compression_level=compression_level,
2232 )
2235def write_pack_objects(
2236 write,
2237 objects: Union[Sequence[ShaFile], Sequence[tuple[ShaFile, Optional[bytes]]]],
2238 *,
2239 delta_window_size: Optional[int] = None,
2240 deltify: Optional[bool] = None,
2241 compression_level: int = -1,
2242):
2243 """Write a new pack data file.
2245 Args:
2246 write: write function to use
2247 objects: Sequence of ShaFile objects or (object, path) tuples to write
2248 delta_window_size: Sliding window size for searching for deltas;
2249 Set to None for default window size.
2250 deltify: Whether to deltify objects
2251 compression_level: the zlib compression level to use
2252 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2253 """
2254 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2256 return write_pack_data(
2257 write,
2258 pack_contents,
2259 num_records=pack_contents_count,
2260 compression_level=compression_level,
2261 )
2264class PackChunkGenerator:
    """Generate chunks for a pack data file, recording per-object offsets, CRC32s and the pack checksum."""
2265 def __init__(
2266 self,
2267 num_records=None,
2268 records=None,
2269 progress=None,
2270 compression_level=-1,
2271 reuse_compressed=True,
2272 ) -> None:
2273 self.cs = sha1(b"")
2274 self.entries: dict[Union[int, bytes], tuple[int, int]] = {}
2275 self._it = self._pack_data_chunks(
2276 num_records=num_records,
2277 records=records,
2278 progress=progress,
2279 compression_level=compression_level,
2280 reuse_compressed=reuse_compressed,
2281 )
2283 def sha1digest(self):
2284 return self.cs.digest()
2286 def __iter__(self):
2287 return self._it
2289 def _pack_data_chunks(
2290 self,
2291 records: Iterator[UnpackedObject],
2292 *,
2293 num_records=None,
2294 progress=None,
2295 compression_level: int = -1,
2296 reuse_compressed: bool = True,
2297 ) -> Iterator[bytes]:
2298 """Iterate pack data file chunks.
2300 Args:
2301 records: Iterator over UnpackedObject
2302 num_records: Number of records (defaults to len(records) if not specified)
2303 progress: Function to report progress to
2304 compression_level: the zlib compression level
2305 Yields: Chunks of the pack data file; per-object (offset, crc32) pairs are recorded in self.entries
2306 """
2307 # Write the pack
2308 if num_records is None:
2309 num_records = len(records) # type: ignore
2310 offset = 0
2311 for chunk in pack_header_chunks(num_records):
2312 yield chunk
2313 self.cs.update(chunk)
2314 offset += len(chunk)
2315 actual_num_records = 0
2316 for i, unpacked in enumerate(records):
2317 type_num = unpacked.pack_type_num
2318 if progress is not None and i % 1000 == 0:
2319 progress((f"writing pack data: {i}/{num_records}\r").encode("ascii"))
2320 raw: Union[list[bytes], tuple[int, list[bytes]], tuple[bytes, list[bytes]]]
2321 if unpacked.delta_base is not None:
2322 try:
2323 base_offset, base_crc32 = self.entries[unpacked.delta_base]
2324 except KeyError:
2325 type_num = REF_DELTA
2326 assert isinstance(unpacked.delta_base, bytes)
2327 raw = (unpacked.delta_base, unpacked.decomp_chunks)
2328 else:
2329 type_num = OFS_DELTA
2330 raw = (offset - base_offset, unpacked.decomp_chunks)
2331 else:
2332 raw = unpacked.decomp_chunks
2333 if unpacked.comp_chunks is not None and reuse_compressed:
2334 chunks = unpacked.comp_chunks
2335 else:
2336 chunks = pack_object_chunks(
2337 type_num, raw, compression_level=compression_level
2338 )
2339 crc32 = 0
2340 object_size = 0
2341 for chunk in chunks:
2342 yield chunk
2343 crc32 = binascii.crc32(chunk, crc32)
2344 self.cs.update(chunk)
2345 object_size += len(chunk)
2346 actual_num_records += 1
2347 self.entries[unpacked.sha()] = (offset, crc32)
2348 offset += object_size
2349 if actual_num_records != num_records:
2350 raise AssertionError(
2351 f"actual records written differs: {actual_num_records} != {num_records}"
2352 )
2354 yield self.cs.digest()
2357def write_pack_data(
2358 write,
2359 records: Iterator[UnpackedObject],
2360 *,
2361 num_records=None,
2362 progress=None,
2363 compression_level=-1,
2364):
2365 """Write a new pack data file.
2367 Args:
2368 write: Write function to use
2369 num_records: Number of records (defaults to len(records) if None)
2370 records: Iterator over type_num, object_id, delta_base, raw
2371 progress: Function to report progress to
2372 compression_level: the zlib compression level
2373 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2374 """
2375 chunk_generator = PackChunkGenerator(
2376 num_records=num_records,
2377 records=records,
2378 progress=progress,
2379 compression_level=compression_level,
2380 )
2381 for chunk in chunk_generator:
2382 write(chunk)
2383 return chunk_generator.entries, chunk_generator.sha1digest()
2386def write_pack_index_v1(f, entries, pack_checksum):
2387 """Write a new pack index file.
2389 Args:
2390 f: A file-like object to write to
2391 entries: List of tuples with object name (sha), offset_in_pack,
2392 and crc32_checksum.
2393 pack_checksum: Checksum of the pack file.
2394 Returns: The SHA of the written index file
2395 """
2396 f = SHA1Writer(f)
2397 fan_out_table = defaultdict(lambda: 0)
2398 for name, offset, entry_checksum in entries:
2399 fan_out_table[ord(name[:1])] += 1
2400 # Fan-out table
2401 for i in range(0x100):
2402 f.write(struct.pack(">L", fan_out_table[i]))
2403 fan_out_table[i + 1] += fan_out_table[i]
2404 for name, offset, entry_checksum in entries:
2405 if not (offset <= 0xFFFFFFFF):
2406 raise TypeError("pack format 1 only supports offsets < 4 GiB")
2407 f.write(struct.pack(">L20s", offset, name))
2408 assert len(pack_checksum) == 20
2409 f.write(pack_checksum)
2410 return f.write_sha()
2413def _delta_encode_size(size) -> bytes:
2414 ret = bytearray()
2415 c = size & 0x7F
2416 size >>= 7
2417 while size:
2418 ret.append(c | 0x80)
2419 c = size & 0x7F
2420 size >>= 7
2421 ret.append(c)
2422 return bytes(ret)
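# Worked example (editor's note, not part of the original module): sizes are
# encoded as little-endian base-128 varints, seven bits per byte with the high
# bit as a continuation flag.
#
#     _delta_encode_size(100) == b"\x64"
#     _delta_encode_size(200) == b"\xc8\x01"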
2425# The length of delta compression copy operations in version 2 packs is limited
2426# to 64K. To copy more, we use several copy operations. Version 3 packs allow
2427# 24-bit lengths in copy operations, but we always make version 2 packs.
2428_MAX_COPY_LEN = 0xFFFF
2431def _encode_copy_operation(start, length):
2432 scratch = bytearray([0x80])
2433 for i in range(4):
2434 if start & 0xFF << i * 8:
2435 scratch.append((start >> i * 8) & 0xFF)
2436 scratch[0] |= 1 << i
2437 for i in range(2):
2438 if length & 0xFF << i * 8:
2439 scratch.append((length >> i * 8) & 0xFF)
2440 scratch[0] |= 1 << (4 + i)
2441 return bytes(scratch)
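# Worked example (editor's note, not part of the original module): copying 16
# bytes from offset 0 of the base needs only the command byte plus one length
# byte; copying from offset 0x1234 adds two offset bytes, flagged in the low
# bits of the command byte.
#
#     _encode_copy_operation(0, 16) == b"\x90\x10"
#     _encode_copy_operation(0x1234, 16) == b"\x93\x34\x12\x10"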
2444def create_delta(base_buf, target_buf):
2445 """Use python difflib to work out how to transform base_buf to target_buf.
2447 Args:
2448 base_buf: Base buffer
2449 target_buf: Target buffer
2450 """
2451 if isinstance(base_buf, list):
2452 base_buf = b"".join(base_buf)
2453 if isinstance(target_buf, list):
2454 target_buf = b"".join(target_buf)
2455 assert isinstance(base_buf, bytes)
2456 assert isinstance(target_buf, bytes)
2457 # write delta header
2458 yield _delta_encode_size(len(base_buf))
2459 yield _delta_encode_size(len(target_buf))
2460 # write out delta opcodes
2461 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
2462 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
2463 # Git patch opcodes don't care about deletes!
2464 # if opcode == 'replace' or opcode == 'delete':
2465 # pass
2466 if opcode == "equal":
2467 # If they are equal, unpacker will use data from base_buf
2468 # Write out an opcode that says what range to use
2469 copy_start = i1
2470 copy_len = i2 - i1
2471 while copy_len > 0:
2472 to_copy = min(copy_len, _MAX_COPY_LEN)
2473 yield _encode_copy_operation(copy_start, to_copy)
2474 copy_start += to_copy
2475 copy_len -= to_copy
2476 if opcode == "replace" or opcode == "insert":
2477 # If we are replacing a range or adding one, then we just
2478 # output it to the stream (prefixed by its size)
2479 s = j2 - j1
2480 o = j1
2481 while s > 127:
2482 yield bytes([127])
2483 yield memoryview(target_buf)[o : o + 127]
2484 s -= 127
2485 o += 127
2486 yield bytes([s])
2487 yield memoryview(target_buf)[o : o + s]
2490def apply_delta(src_buf, delta):
2491 """Based on the similar function in git's patch-delta.c.
2493 Args:
2494 src_buf: Source buffer
2495 delta: Delta instructions
2496 """
2497 if not isinstance(src_buf, bytes):
2498 src_buf = b"".join(src_buf)
2499 if not isinstance(delta, bytes):
2500 delta = b"".join(delta)
2501 out = []
2502 index = 0
2503 delta_length = len(delta)
2505 def get_delta_header_size(delta, index):
2506 size = 0
2507 i = 0
2508 while delta:
2509 cmd = ord(delta[index : index + 1])
2510 index += 1
2511 size |= (cmd & ~0x80) << i
2512 i += 7
2513 if not cmd & 0x80:
2514 break
2515 return size, index
2517 src_size, index = get_delta_header_size(delta, index)
2518 dest_size, index = get_delta_header_size(delta, index)
2519 if src_size != len(src_buf):
2520 raise ApplyDeltaError(
2521 f"Unexpected source buffer size: {src_size} vs {len(src_buf)}"
2522 )
2523 while index < delta_length:
2524 cmd = ord(delta[index : index + 1])
2525 index += 1
2526 if cmd & 0x80:
2527 cp_off = 0
2528 for i in range(4):
2529 if cmd & (1 << i):
2530 x = ord(delta[index : index + 1])
2531 index += 1
2532 cp_off |= x << (i * 8)
2533 cp_size = 0
2534 # Version 3 packs can contain copy sizes larger than 64K.
2535 for i in range(3):
2536 if cmd & (1 << (4 + i)):
2537 x = ord(delta[index : index + 1])
2538 index += 1
2539 cp_size |= x << (i * 8)
2540 if cp_size == 0:
2541 cp_size = 0x10000
2542 if (
2543 cp_off + cp_size < cp_size
2544 or cp_off + cp_size > src_size
2545 or cp_size > dest_size
2546 ):
2547 break
2548 out.append(src_buf[cp_off : cp_off + cp_size])
2549 elif cmd != 0:
2550 out.append(delta[index : index + cmd])
2551 index += cmd
2552 else:
2553 raise ApplyDeltaError("Invalid opcode 0")
2555 if index != delta_length:
2556 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
2558 if dest_size != chunks_length(out):
2559 raise ApplyDeltaError("dest size incorrect")
2561 return out
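# Editor's illustrative sketch (not part of the original module): create_delta
# and apply_delta round-trip; the two buffers are assumptions of the example.
#
#     base = b"the quick brown fox jumps over the lazy dog"
#     target = b"the quick brown fox jumps over the lazy cat"
#     delta = b"".join(create_delta(base, target))
#     assert b"".join(apply_delta(base, delta)) == target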
2564def write_pack_index_v2(
2565 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
2566) -> bytes:
2567 """Write a new pack index file.
2569 Args:
2570 f: File-like object to write to
2571 entries: List of tuples with object name (sha), offset_in_pack, and
2572 crc32_checksum.
2573 pack_checksum: Checksum of the pack file.
2574 Returns: The SHA of the index file written
2575 """
2576 f = SHA1Writer(f)
2577 f.write(b"\377tOc") # Magic!
2578 f.write(struct.pack(">L", 2))
2579 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2580 for name, offset, entry_checksum in entries:
2581 fan_out_table[ord(name[:1])] += 1
2582 # Fan-out table
2583 largetable: list[int] = []
2584 for i in range(0x100):
2585 f.write(struct.pack(b">L", fan_out_table[i]))
2586 fan_out_table[i + 1] += fan_out_table[i]
2587 for name, offset, entry_checksum in entries:
2588 f.write(name)
2589 for name, offset, entry_checksum in entries:
2590 f.write(struct.pack(b">L", entry_checksum))
2591 for name, offset, entry_checksum in entries:
2592 if offset < 2**31:
2593 f.write(struct.pack(b">L", offset))
2594 else:
2595 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2596 largetable.append(offset)
2597 for offset in largetable:
2598 f.write(struct.pack(b">Q", offset))
2599 assert len(pack_checksum) == 20
2600 f.write(pack_checksum)
2601 return f.write_sha()
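# Editor's illustrative sketch (not part of the original module): writing a
# one-entry v2 index into memory. Entries are assumed to be pre-sorted by
# object name (write_pack() above sorts them); the object id, offset and
# checksums are made up for the example.
#
#     buf = BytesIO()
#     entries = [(hex_to_sha(b"aa" * 20), 12, 0)]
#     write_pack_index_v2(buf, entries, b"\x00" * 20)
#     len(buf.getvalue())  # 8 + 1024 + 20 + 4 + 4 + 20 + 20 = 1100 bytes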
2604def write_pack_index_v3(
2605 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes, hash_algorithm: int = 1
2606) -> bytes:
2607 """Write a new pack index file in v3 format.
2609 Args:
2610 f: File-like object to write to
2611 entries: List of tuples with object name (sha), offset_in_pack, and
2612 crc32_checksum.
2613 pack_checksum: Checksum of the pack file.
2614 hash_algorithm: Hash algorithm identifier (1 = SHA-1, 2 = SHA-256)
2615 Returns: The SHA of the index file written
2616 """
2617 if hash_algorithm == 1:
2618 hash_size = 20 # SHA-1
2619 writer_cls = SHA1Writer
2620 elif hash_algorithm == 2:
2621 hash_size = 32 # SHA-256
2622 # TODO: Add SHA256Writer when SHA-256 support is implemented
2623 raise NotImplementedError("SHA-256 support not yet implemented")
2624 else:
2625 raise ValueError(f"Unknown hash algorithm {hash_algorithm}")
2627 # Convert entries to list to allow multiple iterations
2628 entries_list = list(entries)
2630 # Calculate shortest unambiguous prefix length for object names
2631 # For now, use full hash size (this could be optimized)
2632 shortened_oid_len = hash_size
2634 f = writer_cls(f)
2635 f.write(b"\377tOc") # Magic!
2636 f.write(struct.pack(">L", 3)) # Version 3
2637 f.write(struct.pack(">L", hash_algorithm)) # Hash algorithm
2638 f.write(struct.pack(">L", shortened_oid_len)) # Shortened OID length
2640 fan_out_table: dict[int, int] = defaultdict(lambda: 0)
2641 for name, offset, entry_checksum in entries_list:
2642 if len(name) != hash_size:
2643 raise ValueError(
2644 f"Object name has wrong length: expected {hash_size}, got {len(name)}"
2645 )
2646 fan_out_table[ord(name[:1])] += 1
2648 # Fan-out table
2649 largetable: list[int] = []
2650 for i in range(0x100):
2651 f.write(struct.pack(b">L", fan_out_table[i]))
2652 fan_out_table[i + 1] += fan_out_table[i]
2654 # Object names table
2655 for name, offset, entry_checksum in entries_list:
2656 f.write(name)
2658 # CRC32 checksums table
2659 for name, offset, entry_checksum in entries_list:
2660 f.write(struct.pack(b">L", entry_checksum))
2662 # Offset table
2663 for name, offset, entry_checksum in entries_list:
2664 if offset < 2**31:
2665 f.write(struct.pack(b">L", offset))
2666 else:
2667 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2668 largetable.append(offset)
2670 # Large offset table
2671 for offset in largetable:
2672 f.write(struct.pack(b">Q", offset))
2674 assert len(pack_checksum) == hash_size, (
2675 f"Pack checksum has wrong length: expected {hash_size}, got {len(pack_checksum)}"
2676 )
2677 f.write(pack_checksum)
2678 return f.write_sha()
2681def write_pack_index(
2682 index_filename, entries, pack_checksum, progress=None, version=None
2683):
2684 """Write a pack index file.
2686 Args:
2687 index_filename: Index filename.
2688 entries: List of (checksum, offset, crc32) tuples
2689 pack_checksum: Checksum of the pack file.
2690 progress: Progress function (not currently used)
2691 version: Pack index version to use (1, 2, or 3). If None, defaults to DEFAULT_PACK_INDEX_VERSION.
2693 Returns:
2694 SHA of the written index file
2695 """
2696 if version is None:
2697 version = DEFAULT_PACK_INDEX_VERSION
2699 if version == 1:
2700 return write_pack_index_v1(index_filename, entries, pack_checksum)
2701 elif version == 2:
2702 return write_pack_index_v2(index_filename, entries, pack_checksum)
2703 elif version == 3:
2704 return write_pack_index_v3(index_filename, entries, pack_checksum)
2705 else:
2706 raise ValueError(f"Unsupported pack index version: {version}")
2709class Pack:
2710 """A Git pack object."""
2712 _data_load: Optional[Callable[[], PackData]]
2713 _idx_load: Optional[Callable[[], PackIndex]]
2715 _data: Optional[PackData]
2716 _idx: Optional[PackIndex]
2718 def __init__(
2719 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None
2720 ) -> None:
2721 self._basename = basename
2722 self._data = None
2723 self._idx = None
2724 self._idx_path = self._basename + ".idx"
2725 self._data_path = self._basename + ".pack"
2726 self._data_load = lambda: PackData(self._data_path)
2727 self._idx_load = lambda: load_pack_index(self._idx_path)
2728 self.resolve_ext_ref = resolve_ext_ref
2730 @classmethod
2731 def from_lazy_objects(cls, data_fn, idx_fn):
2732 """Create a new pack object from callables to load pack data and
2733 index objects.
2734 """
2735 ret = cls("")
2736 ret._data_load = data_fn
2737 ret._idx_load = idx_fn
2738 return ret
2740 @classmethod
2741 def from_objects(cls, data, idx):
2742 """Create a new pack object from pack data and index objects."""
2743 ret = cls("")
2744 ret._data = data
2745 ret._data_load = None
2746 ret._idx = idx
2747 ret._idx_load = None
2748 ret.check_length_and_checksum()
2749 return ret
2751 def name(self):
2752 """The SHA over the SHAs of the objects in this pack."""
2753 return self.index.objects_sha1()
2755 @property
2756 def data(self) -> PackData:
2757 """The pack data object being used."""
2758 if self._data is None:
2759 assert self._data_load
2760 self._data = self._data_load()
2761 self.check_length_and_checksum()
2762 return self._data
2764 @property
2765 def index(self) -> PackIndex:
2766 """The index being used.
2768 Note: This may be an in-memory index
2769 """
2770 if self._idx is None:
2771 assert self._idx_load
2772 self._idx = self._idx_load()
2773 return self._idx
2775 def close(self) -> None:
2776 if self._data is not None:
2777 self._data.close()
2778 if self._idx is not None:
2779 self._idx.close()
2781 def __enter__(self):
2782 return self
2784 def __exit__(self, exc_type, exc_val, exc_tb):
2785 self.close()
2787 def __eq__(self, other):
2788 return isinstance(self, type(other)) and self.index == other.index
2790 def __len__(self) -> int:
2791 """Number of entries in this pack."""
2792 return len(self.index)
2794 def __repr__(self) -> str:
2795 return f"{self.__class__.__name__}({self._basename!r})"
2797 def __iter__(self):
2798 """Iterate over all the sha1s of the objects in this pack."""
2799 return iter(self.index)
2801 def check_length_and_checksum(self) -> None:
2802 """Sanity check the length and checksum of the pack index and data."""
2803 assert len(self.index) == len(self.data), (
2804 f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
2805 )
2806 idx_stored_checksum = self.index.get_pack_checksum()
2807 data_stored_checksum = self.data.get_stored_checksum()
2808 if idx_stored_checksum != data_stored_checksum:
2809 raise ChecksumMismatch(
2810 sha_to_hex(idx_stored_checksum),
2811 sha_to_hex(data_stored_checksum),
2812 )
2814 def check(self) -> None:
2815 """Check the integrity of this pack.
2817 Raises:
2818 ChecksumMismatch: if a checksum for the index or data is wrong
2819 """
2820 self.index.check()
2821 self.data.check()
2822 for obj in self.iterobjects():
2823 obj.check()
2824 # TODO: object connectivity checks
2826 def get_stored_checksum(self) -> bytes:
2827 return self.data.get_stored_checksum()
2829 def pack_tuples(self):
2830 return [(o, None) for o in self.iterobjects()]
2832 def __contains__(self, sha1: bytes) -> bool:
2833 """Check whether this pack contains a particular SHA1."""
2834 try:
2835 self.index.object_offset(sha1)
2836 return True
2837 except KeyError:
2838 return False
2840 def get_raw(self, sha1: bytes) -> tuple[int, bytes]:
2841 offset = self.index.object_offset(sha1)
2842 obj_type, obj = self.data.get_object_at(offset)
2843 type_num, chunks = self.resolve_object(offset, obj_type, obj)
2844 return type_num, b"".join(chunks)
2846 def __getitem__(self, sha1: bytes) -> ShaFile:
2847 """Retrieve the specified SHA1."""
2848 type, uncomp = self.get_raw(sha1)
2849 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2851 def iterobjects(self) -> Iterator[ShaFile]:
2852 """Iterate over the objects in this pack."""
2853 return iter(
2854 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
2855 )
2857 def iterobjects_subset(
2858 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
2859 ) -> Iterator[ShaFile]:
2860 return (
2861 uo
2862 for uo in PackInflater.for_pack_subset(
2863 self,
2864 shas,
2865 allow_missing=allow_missing,
2866 resolve_ext_ref=self.resolve_ext_ref,
2867 )
2868 if uo.id in shas
2869 )
2871 def iter_unpacked_subset(
2872 self,
2873 shas: Iterable[ObjectID],
2874 *,
2875 include_comp: bool = False,
2876 allow_missing: bool = False,
2877 convert_ofs_delta: bool = False,
2878 ) -> Iterator[UnpackedObject]:
2879 ofs_pending: dict[int, list[UnpackedObject]] = defaultdict(list)
2880 ofs: dict[bytes, int] = {}
2881 todo = set(shas)
2882 for unpacked in self.iter_unpacked(include_comp=include_comp):
2883 sha = unpacked.sha()
2884 ofs[unpacked.offset] = sha
2885 hexsha = sha_to_hex(sha)
2886 if hexsha in todo:
2887 if unpacked.pack_type_num == OFS_DELTA:
2888 assert isinstance(unpacked.delta_base, int)
2889 base_offset = unpacked.offset - unpacked.delta_base
2890 try:
2891 unpacked.delta_base = ofs[base_offset]
2892 except KeyError:
2893 ofs_pending[base_offset].append(unpacked)
2894 continue
2895 else:
2896 unpacked.pack_type_num = REF_DELTA
2897 yield unpacked
2898 todo.remove(hexsha)
2899 for child in ofs_pending.pop(unpacked.offset, []):
2900 child.pack_type_num = REF_DELTA
2901 child.delta_base = sha
2902 yield child
2903 assert not ofs_pending
2904 if not allow_missing and todo:
2905 raise UnresolvedDeltas(todo)
2907 def iter_unpacked(self, include_comp=False):
2908 ofs_to_entries = {
2909 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
2910 }
2911 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
2912 (sha, crc32) = ofs_to_entries[unpacked.offset]
2913 unpacked._sha = sha
2914 unpacked.crc32 = crc32
2915 yield unpacked
2917 def keep(self, msg: Optional[bytes] = None) -> str:
2918 """Add a .keep file for the pack, preventing git from garbage collecting it.
2920 Args:
2921 msg: A message written inside the .keep file; can be used later
2922 to determine whether or not a .keep file is obsolete.
2923 Returns: The path of the .keep file, as a string.
2924 """
2925 keepfile_name = f"{self._basename}.keep"
2926 with GitFile(keepfile_name, "wb") as keepfile:
2927 if msg:
2928 keepfile.write(msg)
2929 keepfile.write(b"\n")
2930 return keepfile_name
2932 def get_ref(self, sha: bytes) -> tuple[Optional[int], int, OldUnpackedObject]:
2933 """Get the object for a ref SHA, only looking in this pack."""
2934 # TODO: cache these results
2935 try:
2936 offset = self.index.object_offset(sha)
2937 except KeyError:
2938 offset = None
2939 if offset:
2940 type, obj = self.data.get_object_at(offset)
2941 elif self.resolve_ext_ref:
2942 type, obj = self.resolve_ext_ref(sha)
2943 else:
2944 raise KeyError(sha)
2945 return offset, type, obj
2947 def resolve_object(
2948 self, offset: int, type: int, obj, get_ref=None
2949 ) -> tuple[int, Iterable[bytes]]:
2950 """Resolve an object, possibly resolving deltas when necessary.
2952 Returns: Tuple with object type and contents.
2953 """
2954 # Walk down the delta chain, building a stack of deltas to reach
2955 # the requested object.
2956 base_offset = offset
2957 base_type = type
2958 base_obj = obj
2959 delta_stack = []
2960 while base_type in DELTA_TYPES:
2961 prev_offset = base_offset
2962 if get_ref is None:
2963 get_ref = self.get_ref
2964 if base_type == OFS_DELTA:
2965 (delta_offset, delta) = base_obj
2966 # TODO: clean up asserts and replace with nicer error messages
2967 base_offset = base_offset - delta_offset
2968 base_type, base_obj = self.data.get_object_at(base_offset)
2969 assert isinstance(base_type, int)
2970 elif base_type == REF_DELTA:
2971 (basename, delta) = base_obj
2972 assert isinstance(basename, bytes) and len(basename) == 20
2973 base_offset, base_type, base_obj = get_ref(basename)
2974 assert isinstance(base_type, int)
2975 if base_offset == prev_offset: # object is based on itself
2976 raise UnresolvedDeltas(sha_to_hex(basename))
2977 delta_stack.append((prev_offset, base_type, delta))
2979 # Now grab the base object (mustn't be a delta) and apply the
2980 # deltas all the way up the stack.
2981 chunks = base_obj
2982 for prev_offset, delta_type, delta in reversed(delta_stack):
2983 chunks = apply_delta(chunks, delta)
2984 # TODO(dborowitz): This can result in poor performance if
2985 # large base objects are separated from deltas in the pack.
2986 # We should reorganize so that we apply deltas to all
2987 # objects in a chain one after the other to optimize cache
2988 # performance.
2989 if prev_offset is not None:
2990 self.data._offset_cache[prev_offset] = base_type, chunks
2991 return base_type, chunks
2993 def entries(
2994 self, progress: Optional[ProgressFn] = None
2995 ) -> Iterator[PackIndexEntry]:
2996 """Yield entries summarizing the contents of this pack.
2998 Args:
2999 progress: Progress function, called with current and total
3000 object count.
3001 Returns: iterator of tuples with (sha, offset, crc32)
3002 """
3003 return self.data.iterentries(
3004 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3005 )
3007 def sorted_entries(
3008 self, progress: Optional[ProgressFn] = None
3009 ) -> Iterator[PackIndexEntry]:
3010 """Return entries in this pack, sorted by SHA.
3012 Args:
3013 progress: Progress function, called with current and total
3014 object count
3015 Returns: Iterator of tuples with (sha, offset, crc32)
3016 """
3017 return self.data.sorted_entries(
3018 progress=progress, resolve_ext_ref=self.resolve_ext_ref
3019 )
3021 def get_unpacked_object(
3022 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
3023 ) -> UnpackedObject:
3024 """Get the unpacked object for a sha.
3026 Args:
3027 sha: SHA of object to fetch
3028 include_comp: Whether to include compression data in UnpackedObject
3029 """
3030 offset = self.index.object_offset(sha)
3031 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
3032 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
3033 assert isinstance(unpacked.delta_base, int)
3034 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
3035 unpacked.pack_type_num = REF_DELTA
3036 return unpacked
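# Editor's illustrative sketch (not part of the original module): opening an
# on-disk pack by its basename and reading one object through the index. The
# path and the hex object id are assumptions of the example.
#
#     with Pack("objects/pack/pack-1234") as p:
#         hexsha = b"aa" * 20  # 40-byte hex object id
#         if hexsha in p:
#             obj = p[hexsha]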
3039def extend_pack(
3040 f: BinaryIO,
3041 object_ids: set[ObjectID],
3042 get_raw,
3043 *,
3044 compression_level=-1,
3045 progress=None,
3046) -> tuple[bytes, list]:
3047 """Extend a pack file with more objects.
3049 The caller should make sure that object_ids does not contain any objects
3050 that are already in the pack
3051 """
3052 # Update the header with the new number of objects.
3053 f.seek(0)
3054 _version, num_objects = read_pack_header(f.read)
3056 if object_ids:
3057 f.seek(0)
3058 write_pack_header(f.write, num_objects + len(object_ids))
3060 # Must flush before reading (http://bugs.python.org/issue3207)
3061 f.flush()
3063 # Rescan the rest of the pack, computing the SHA with the new header.
3064 new_sha = compute_file_sha(f, end_ofs=-20)
3066 # Must reposition before writing (http://bugs.python.org/issue3207)
3067 f.seek(0, os.SEEK_CUR)
3069 extra_entries = []
3071 # Complete the pack.
3072 for i, object_id in enumerate(object_ids):
3073 if progress is not None:
3074 progress(
3075 (f"writing extra base objects: {i}/{len(object_ids)}\r").encode("ascii")
3076 )
3077 assert len(object_id) == 20
3078 type_num, data = get_raw(object_id)
3079 offset = f.tell()
3080 crc32 = write_pack_object(
3081 f.write,
3082 type_num,
3083 data,
3084 sha=new_sha,
3085 compression_level=compression_level,
3086 )
3087 extra_entries.append((object_id, offset, crc32))
3088 pack_sha = new_sha.digest()
3089 f.write(pack_sha)
3090 return pack_sha, extra_entries
3093try:
3094 from dulwich._pack import ( # type: ignore
3095 apply_delta, # type: ignore
3096 bisect_find_sha, # type: ignore
3097 )
3098except ImportError:
3099 pass