1# pack.py -- For dealing with packed git objects.
2# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6# General Public License as published by the Free Software Foundation; version 2.0
7# or (at your option) any later version. You can redistribute it and/or
8# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
22"""Classes for dealing with packed git objects.
24A pack is a compact representation of a collection of objects, stored
25using deltas where possible.
27A pack has two parts: the pack file, which stores the data, and an index
28that tells you where the data is.
30To find an object you look in each of the index files until you find a
31match for the object name. The offset obtained from the index is then
32used to read the object from the corresponding pack file.
33"""
35import binascii
36from collections import defaultdict, deque
37from contextlib import suppress
38from io import BytesIO, UnsupportedOperation
40try:
41 from cdifflib import CSequenceMatcher as SequenceMatcher
42except ModuleNotFoundError:
43 from difflib import SequenceMatcher
45import os
46import struct
47import sys
48import warnings
49import zlib
50from hashlib import sha1
51from itertools import chain
52from os import SEEK_CUR, SEEK_END
53from struct import unpack_from
54from typing import (
55 BinaryIO,
56 Callable,
57 Deque,
58 Dict,
59 Generic,
60 Iterable,
61 Iterator,
62 List,
63 Optional,
64 Protocol,
65 Sequence,
66 Set,
67 Tuple,
68 TypeVar,
69 Union,
70)
72try:
73 import mmap
74except ImportError:
75 has_mmap = False
76else:
77 has_mmap = True
79# For some reason the above try/except fails to set has_mmap = False for Plan9
80if sys.platform == "Plan9":
81 has_mmap = False
83from .errors import ApplyDeltaError, ChecksumMismatch
84from .file import GitFile
85from .lru_cache import LRUSizeCache
86from .objects import ObjectID, ShaFile, hex_to_sha, object_header, sha_to_hex
88OFS_DELTA = 6
89REF_DELTA = 7
91DELTA_TYPES = (OFS_DELTA, REF_DELTA)
94DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
96# Keep pack files under 16 MB in memory; otherwise write them out to disk
97PACK_SPOOL_FILE_MAX_SIZE = 16 * 1024 * 1024
100OldUnpackedObject = Union[Tuple[Union[bytes, int], List[bytes]], List[bytes]]
101ResolveExtRefFn = Callable[[bytes], Tuple[int, OldUnpackedObject]]
102ProgressFn = Callable[[int, str], None]
103PackHint = Tuple[int, Optional[bytes]]
106class UnresolvedDeltas(Exception):
107 """Delta objects could not be resolved."""
109 def __init__(self, shas):
110 self.shas = shas
113class ObjectContainer(Protocol):
114 def add_object(self, obj: ShaFile) -> None:
115 """Add a single object to this object store."""
117 def add_objects(
118 self,
119 objects: Sequence[Tuple[ShaFile, Optional[str]]],
120 progress: Optional[Callable[[str], None]] = None,
121 ) -> None:
122 """Add a set of objects to this object store.
124 Args:
125 objects: Iterable of (object, path) tuples
126 """
128 def __contains__(self, sha1: bytes) -> bool:
129 """Check if a hex sha is present."""
131 def __getitem__(self, sha1: bytes) -> ShaFile:
132 """Retrieve an object."""
135class PackedObjectContainer(ObjectContainer):
136 def get_unpacked_object(
137 self, sha1: bytes, *, include_comp: bool = False
138 ) -> "UnpackedObject":
139 """Get a raw unresolved object."""
140 raise NotImplementedError(self.get_unpacked_object)
142 def iterobjects_subset(
143 self, shas: Iterable[bytes], *, allow_missing: bool = False
144 ) -> Iterator[ShaFile]:
145 raise NotImplementedError(self.iterobjects_subset)
147 def iter_unpacked_subset(
148 self,
149 shas: Set[bytes],
150 include_comp: bool = False,
151 allow_missing: bool = False,
152 convert_ofs_delta: bool = True,
153 ) -> Iterator["UnpackedObject"]:
154 raise NotImplementedError(self.iter_unpacked_subset)
157class UnpackedObjectStream:
158 def __iter__(self) -> Iterator["UnpackedObject"]:
159 raise NotImplementedError(self.__iter__)
161 def __len__(self) -> int:
162 raise NotImplementedError(self.__len__)
165def take_msb_bytes(
166 read: Callable[[int], bytes], crc32: Optional[int] = None
167) -> Tuple[List[int], Optional[int]]:
168 """Read bytes marked with most significant bit.
170 Args:
171 read: Read function
172 """
173 ret: List[int] = []
174 while len(ret) == 0 or ret[-1] & 0x80:
175 b = read(1)
176 if crc32 is not None:
177 crc32 = binascii.crc32(b, crc32)
178 ret.append(ord(b[:1]))
179 return ret, crc32
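# Illustrative sketch (not part of dulwich's API): pack headers encode
# variable-length integers by setting the most significant bit of every byte
# except the last; take_msb_bytes() reads bytes until that terminating byte.
# The input bytes below are made up for the example.
def _example_take_msb_bytes() -> None:
    buf = BytesIO(bytes([0x91, 0x2E, 0xFF]))  # 0x91 has the MSB set, 0x2E does not
    ret, crc = take_msb_bytes(buf.read)
    assert ret == [0x91, 0x2E]  # reading stops after the first MSB-clear byte
    assert crc is None  # no starting CRC32 was supplied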
182class PackFileDisappeared(Exception):
183 def __init__(self, obj) -> None:
184 self.obj = obj
187class UnpackedObject:
188 """Class encapsulating an object unpacked from a pack file.
190 These objects should only be created from within unpack_object. Most
191 members start out as empty and are filled in at various points by
192 read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
194 End users of this object should take care that the function they're getting
195 this object from is guaranteed to set the members they need.
196 """
198 __slots__ = [
199 "offset", # Offset in its pack.
200 "_sha", # Cached binary SHA.
201 "obj_type_num", # Type of this object.
202 "obj_chunks", # Decompressed and delta-resolved chunks.
203 "pack_type_num", # Type of this object in the pack (may be a delta).
204 "delta_base", # Delta base offset or SHA.
205 "comp_chunks", # Compressed object chunks.
206 "decomp_chunks", # Decompressed object chunks.
207 "decomp_len", # Decompressed length of this object.
208 "crc32", # CRC32.
209 ]
211 obj_type_num: Optional[int]
212 obj_chunks: Optional[List[bytes]]
213 delta_base: Union[None, bytes, int]
214 decomp_chunks: List[bytes]
215 comp_chunks: Optional[List[bytes]]
217 # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
218 # methods of this object.
219 def __init__(
220 self,
221 pack_type_num,
222 *,
223 delta_base=None,
224 decomp_len=None,
225 crc32=None,
226 sha=None,
227 decomp_chunks=None,
228 offset=None,
229 ) -> None:
230 self.offset = offset
231 self._sha = sha
232 self.pack_type_num = pack_type_num
233 self.delta_base = delta_base
234 self.comp_chunks = None
235 self.decomp_chunks: List[bytes] = decomp_chunks or []
236 if decomp_chunks is not None and decomp_len is None:
237 self.decomp_len = sum(map(len, decomp_chunks))
238 else:
239 self.decomp_len = decomp_len
240 self.crc32 = crc32
242 if pack_type_num in DELTA_TYPES:
243 self.obj_type_num = None
244 self.obj_chunks = None
245 else:
246 self.obj_type_num = pack_type_num
247 self.obj_chunks = self.decomp_chunks
248 self.delta_base = delta_base
250 def sha(self):
251 """Return the binary SHA of this object."""
252 if self._sha is None:
253 self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
254 return self._sha
256 def sha_file(self):
257 """Return a ShaFile from this object."""
258 assert self.obj_type_num is not None and self.obj_chunks is not None
259 return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
261 # Only provided for backwards compatibility with code that expects either
262 # chunks or a delta tuple.
263 def _obj(self) -> OldUnpackedObject:
264 """Return the decompressed chunks, or (delta base, delta chunks)."""
265 if self.pack_type_num in DELTA_TYPES:
266 assert isinstance(self.delta_base, (bytes, int))
267 return (self.delta_base, self.decomp_chunks)
268 else:
269 return self.decomp_chunks
271 def __eq__(self, other):
272 if not isinstance(other, UnpackedObject):
273 return False
274 for slot in self.__slots__:
275 if getattr(self, slot) != getattr(other, slot):
276 return False
277 return True
279 def __ne__(self, other):
280 return not (self == other)
282 def __repr__(self) -> str:
283 data = [f"{s}={getattr(self, s)!r}" for s in self.__slots__]
284 return "{}({})".format(self.__class__.__name__, ", ".join(data))
287_ZLIB_BUFSIZE = 4096
290def read_zlib_chunks(
291 read_some: Callable[[int], bytes],
292 unpacked: UnpackedObject,
293 include_comp: bool = False,
294 buffer_size: int = _ZLIB_BUFSIZE,
295) -> bytes:
296 """Read zlib data from a buffer.
298 This function requires that the buffer have additional data following the
299 compressed data, which is guaranteed to be the case for git pack files.
301 Args:
302 read_some: Read function that returns at least one byte, but may
303 return less than the requested size.
304 unpacked: An UnpackedObject to write result data to. If its crc32
305 attr is not None, the CRC32 of the compressed bytes will be computed
306 using this starting CRC32.
307 After this function, will have the following attrs set:
308 * comp_chunks (if include_comp is True)
309 * decomp_chunks
310 * decomp_len
311 * crc32
312 include_comp: If True, include compressed data in the result.
313 buffer_size: Size of the read buffer.
314 Returns: Leftover unused data from the decompression.
316 Raises:
317 zlib.error: if a decompression error occurred.
318 """
319 if unpacked.decomp_len <= -1:
320 raise ValueError("non-negative zlib data stream size expected")
321 decomp_obj = zlib.decompressobj()
323 comp_chunks = []
324 decomp_chunks = unpacked.decomp_chunks
325 decomp_len = 0
326 crc32 = unpacked.crc32
328 while True:
329 add = read_some(buffer_size)
330 if not add:
331 raise zlib.error("EOF before end of zlib stream")
332 comp_chunks.append(add)
333 decomp = decomp_obj.decompress(add)
334 decomp_len += len(decomp)
335 decomp_chunks.append(decomp)
336 unused = decomp_obj.unused_data
337 if unused:
338 left = len(unused)
339 if crc32 is not None:
340 crc32 = binascii.crc32(add[:-left], crc32)
341 if include_comp:
342 comp_chunks[-1] = add[:-left]
343 break
344 elif crc32 is not None:
345 crc32 = binascii.crc32(add, crc32)
346 if crc32 is not None:
347 crc32 &= 0xFFFFFFFF
349 if decomp_len != unpacked.decomp_len:
350 raise zlib.error("decompressed data does not match expected size")
352 unpacked.crc32 = crc32
353 if include_comp:
354 unpacked.comp_chunks = comp_chunks
355 return unused
358def iter_sha1(iter):
359 """Return the hexdigest of the SHA1 over a set of names.
361 Args:
362 iter: Iterator over bytestrings
363 Returns: 40-byte hex SHA1 digest
364 """
365 sha = sha1()
366 for name in iter:
367 sha.update(name)
368 return sha.hexdigest().encode("ascii")
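# Illustrative sketch: iter_sha1() hashes the concatenation of the given
# bytestrings; PackIndex.objects_sha1() below uses it to derive the pack's
# file name.
def _example_iter_sha1() -> bytes:
    return iter_sha1(iter([b"abc", b"def"]))  # hex digest of sha1(b"abcdef"), as ASCII bytes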
371def load_pack_index(path):
372 """Load an index file by path.
374 Args:
375 path: Path to the index file
376 Returns: A PackIndex loaded from the given path
377 """
378 with GitFile(path, "rb") as f:
379 return load_pack_index_file(path, f)
382def _load_file_contents(f, size=None):
383 try:
384 fd = f.fileno()
385 except (UnsupportedOperation, AttributeError):
386 fd = None
387 # Attempt to use mmap if possible
388 if fd is not None:
389 if size is None:
390 size = os.fstat(fd).st_size
391 if has_mmap:
392 try:
393 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
394 except OSError:
395 # Perhaps a socket?
396 pass
397 else:
398 return contents, size
399 contents = f.read()
400 size = len(contents)
401 return contents, size
404def load_pack_index_file(path, f):
405 """Load an index file from a file-like object.
407 Args:
408 path: Path for the index file
409 f: File-like object
410 Returns: A PackIndex loaded from the given file
411 """
412 contents, size = _load_file_contents(f)
413 if contents[:4] == b"\377tOc":
414 version = struct.unpack(b">L", contents[4:8])[0]
415 if version == 2:
416 return PackIndex2(path, file=f, contents=contents, size=size)
417 else:
418 raise KeyError("Unknown pack index format %d" % version)
419 else:
420 return PackIndex1(path, file=f, contents=contents, size=size)
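# Illustrative sketch: load_pack_index() returns a PackIndex for an existing
# "pack-<sha>.idx" file; object_offset() then maps a (hex or binary) object
# SHA to its offset in the matching ".pack" file. The path argument is
# hypothetical.
def _example_lookup_offset(idx_path, hex_sha: bytes) -> int:
    index = load_pack_index(idx_path)
    return index.object_offset(hex_sha)  # raises KeyError if not in this pack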
423def bisect_find_sha(start, end, sha, unpack_name):
424 """Find a SHA in a data blob with sorted SHAs.
426 Args:
427 start: Start index of range to search
428 end: End index of range to search
429 sha: Sha to find
430 unpack_name: Callback to retrieve SHA by index
431 Returns: Index of the SHA, or None if it wasn't found
432 """
433 assert start <= end
434 while start <= end:
435 i = (start + end) // 2
436 file_sha = unpack_name(i)
437 if file_sha < sha:
438 start = i + 1
439 elif file_sha > sha:
440 end = i - 1
441 else:
442 return i
443 return None
446PackIndexEntry = Tuple[bytes, int, Optional[int]]
449class PackIndex:
450 """An index in to a packfile.
452 Given a sha id of an object a pack index can tell you the location in the
453 packfile of that object if it has it.
454 """
456 def __eq__(self, other):
457 if not isinstance(other, PackIndex):
458 return False
460 for (name1, _, _), (name2, _, _) in zip(
461 self.iterentries(), other.iterentries()
462 ):
463 if name1 != name2:
464 return False
465 return True
467 def __ne__(self, other):
468 return not self.__eq__(other)
470 def __len__(self) -> int:
471 """Return the number of entries in this pack index."""
472 raise NotImplementedError(self.__len__)
474 def __iter__(self) -> Iterator[bytes]:
475 """Iterate over the SHAs in this pack."""
476 return map(sha_to_hex, self._itersha())
478 def iterentries(self) -> Iterator[PackIndexEntry]:
479 """Iterate over the entries in this pack index.
481 Returns: iterator over tuples with object name, offset in packfile and
482 crc32 checksum.
483 """
484 raise NotImplementedError(self.iterentries)
486 def get_pack_checksum(self) -> bytes:
487 """Return the SHA1 checksum stored for the corresponding packfile.
489 Returns: 20-byte binary digest
490 """
491 raise NotImplementedError(self.get_pack_checksum)
493 def object_index(self, sha: bytes) -> int:
494 warnings.warn(
495 "Please use object_offset instead", DeprecationWarning, stacklevel=2
496 )
497 return self.object_offset(sha)
499 def object_offset(self, sha: bytes) -> int:
500 """Return the offset in to the corresponding packfile for the object.
502 Given the name of an object it will return the offset that object
503 lives at within the corresponding pack file. If the pack file doesn't
504 have the object then None will be returned.
505 """
506 raise NotImplementedError(self.object_offset)
508 def object_sha1(self, index: int) -> bytes:
509 """Return the SHA1 corresponding to the index in the pack file."""
510 for name, offset, crc32 in self.iterentries():
511 if offset == index:
512 return name
513 else:
514 raise KeyError(index)
516 def _object_offset(self, sha: bytes) -> int:
517 """See object_offset.
519 Args:
520 sha: A *binary* SHA string. (20 characters long)_
521 """
522 raise NotImplementedError(self._object_offset)
524 def objects_sha1(self) -> bytes:
525 """Return the hex SHA1 over all the shas of all objects in this pack.
527 Note: This is used for the filename of the pack.
528 """
529 return iter_sha1(self._itersha())
531 def _itersha(self) -> Iterator[bytes]:
532 """Yield all the SHA1's of the objects in the index, sorted."""
533 raise NotImplementedError(self._itersha)
535 def close(self):
536 pass
538 def check(self) -> None:
539 pass
542class MemoryPackIndex(PackIndex):
543 """Pack index that is stored entirely in memory."""
545 def __init__(self, entries, pack_checksum=None) -> None:
546 """Create a new MemoryPackIndex.
548 Args:
549 entries: Sequence of (name, offset, crc32) tuples, sorted by name
550 pack_checksum: Optional pack checksum
551 """
552 self._by_sha = {}
553 self._by_offset = {}
554 for name, offset, crc32 in entries:
555 self._by_sha[name] = offset
556 self._by_offset[offset] = name
557 self._entries = entries
558 self._pack_checksum = pack_checksum
560 def get_pack_checksum(self):
561 return self._pack_checksum
563 def __len__(self) -> int:
564 return len(self._entries)
566 def object_offset(self, sha):
567 if len(sha) == 40:
568 sha = hex_to_sha(sha)
569 return self._by_sha[sha]
571 def object_sha1(self, offset):
572 return self._by_offset[offset]
574 def _itersha(self):
575 return iter(self._by_sha)
577 def iterentries(self):
578 return iter(self._entries)
580 @classmethod
581 def for_pack(cls, pack):
582 return MemoryPackIndex(pack.sorted_entries(), pack.calculate_checksum())
584 @classmethod
585 def clone(cls, other_index):
586 return cls(other_index.iterentries(), other_index.get_pack_checksum())
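# Illustrative sketch: a MemoryPackIndex can be built directly from
# (name, offset, crc32) entries sorted by name. The SHAs here are fabricated
# placeholders rather than real object ids.
def _example_memory_pack_index() -> int:
    entries = [(b"\x01" * 20, 12, None), (b"\x02" * 20, 345, None)]
    index = MemoryPackIndex(entries)
    return index.object_offset(b"\x02" * 20)  # -> 345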
589class FilePackIndex(PackIndex):
590 """Pack index that is based on a file.
592 To do the loop it opens the file, and indexes first 256 4 byte groups
593 with the first byte of the sha id. The value in the four byte group indexed
594 is the end of the group that shares the same starting byte. Subtract one
595 from the starting byte and index again to find the start of the group.
596 The values are sorted by sha id within the group, so do the math to find
597 the start and end offset and then bisect in to find if the value is
598 present.
599 """
601 _fan_out_table: List[int]
603 def __init__(self, filename, file=None, contents=None, size=None) -> None:
604 """Create a pack index object.
606 Provide it with the name of the index file to consider, and it will map
607 it whenever required.
608 """
609 self._filename = filename
610 # Take the size now, so it can be checked each time we map the file to
611 # ensure that it hasn't changed.
612 if file is None:
613 self._file = GitFile(filename, "rb")
614 else:
615 self._file = file
616 if contents is None:
617 self._contents, self._size = _load_file_contents(self._file, size)
618 else:
619 self._contents, self._size = (contents, size)
621 @property
622 def path(self) -> str:
623 return self._filename
625 def __eq__(self, other):
626 # Quick optimization:
627 if (
628 isinstance(other, FilePackIndex)
629 and self._fan_out_table != other._fan_out_table
630 ):
631 return False
633 return super().__eq__(other)
635 def close(self) -> None:
636 self._file.close()
637 if getattr(self._contents, "close", None) is not None:
638 self._contents.close()
640 def __len__(self) -> int:
641 """Return the number of entries in this pack index."""
642 return self._fan_out_table[-1]
644 def _unpack_entry(self, i: int) -> PackIndexEntry:
645 """Unpack the i-th entry in the index file.
647 Returns: Tuple with object name (SHA), offset in pack file and CRC32
648 checksum (if known).
649 """
650 raise NotImplementedError(self._unpack_entry)
652 def _unpack_name(self, i):
653 """Unpack the i-th name from the index file."""
654 raise NotImplementedError(self._unpack_name)
656 def _unpack_offset(self, i):
657 """Unpack the i-th object offset from the index file."""
658 raise NotImplementedError(self._unpack_offset)
660 def _unpack_crc32_checksum(self, i):
661 """Unpack the crc32 checksum for the ith object from the index file."""
662 raise NotImplementedError(self._unpack_crc32_checksum)
664 def _itersha(self) -> Iterator[bytes]:
665 for i in range(len(self)):
666 yield self._unpack_name(i)
668 def iterentries(self) -> Iterator[PackIndexEntry]:
669 """Iterate over the entries in this pack index.
671 Returns: iterator over tuples with object name, offset in packfile and
672 crc32 checksum.
673 """
674 for i in range(len(self)):
675 yield self._unpack_entry(i)
677 def _read_fan_out_table(self, start_offset: int):
678 ret = []
679 for i in range(0x100):
680 fanout_entry = self._contents[
681 start_offset + i * 4 : start_offset + (i + 1) * 4
682 ]
683 ret.append(struct.unpack(">L", fanout_entry)[0])
684 return ret
686 def check(self) -> None:
687 """Check that the stored checksum matches the actual checksum."""
688 actual = self.calculate_checksum()
689 stored = self.get_stored_checksum()
690 if actual != stored:
691 raise ChecksumMismatch(stored, actual)
693 def calculate_checksum(self) -> bytes:
694 """Calculate the SHA1 checksum over this pack index.
696 Returns: 20-byte binary digest
697 """
698 return sha1(self._contents[:-20]).digest()
700 def get_pack_checksum(self) -> bytes:
701 """Return the SHA1 checksum stored for the corresponding packfile.
703 Returns: 20-byte binary digest
704 """
705 return bytes(self._contents[-40:-20])
707 def get_stored_checksum(self) -> bytes:
708 """Return the SHA1 checksum stored for this index.
710 Returns: 20-byte binary digest
711 """
712 return bytes(self._contents[-20:])
714 def object_offset(self, sha: bytes) -> int:
715 """Return the offset in to the corresponding packfile for the object.
717 Given the name of an object it will return the offset that object
718 lives at within the corresponding pack file. If the pack file doesn't
719 have the object then None will be returned.
720 """
721 if len(sha) == 40:
722 sha = hex_to_sha(sha)
723 try:
724 return self._object_offset(sha)
725 except ValueError as exc:
726 closed = getattr(self._contents, "closed", None)
727 if closed in (None, True):
728 raise PackFileDisappeared(self) from exc
729 raise
731 def _object_offset(self, sha: bytes) -> int:
732 """See object_offset.
734 Args:
735 sha: A *binary* SHA string (20 bytes long).
736 """
737 assert len(sha) == 20
738 idx = ord(sha[:1])
739 if idx == 0:
740 start = 0
741 else:
742 start = self._fan_out_table[idx - 1]
743 end = self._fan_out_table[idx]
744 i = bisect_find_sha(start, end, sha, self._unpack_name)
745 if i is None:
746 raise KeyError(sha)
747 return self._unpack_offset(i)
750class PackIndex1(FilePackIndex):
751 """Version 1 Pack Index file."""
753 def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
754 super().__init__(filename, file, contents, size)
755 self.version = 1
756 self._fan_out_table = self._read_fan_out_table(0)
758 def _unpack_entry(self, i):
759 (offset, name) = unpack_from(">L20s", self._contents, (0x100 * 4) + (i * 24))
760 return (name, offset, None)
762 def _unpack_name(self, i):
763 offset = (0x100 * 4) + (i * 24) + 4
764 return self._contents[offset : offset + 20]
766 def _unpack_offset(self, i):
767 offset = (0x100 * 4) + (i * 24)
768 return unpack_from(">L", self._contents, offset)[0]
770 def _unpack_crc32_checksum(self, i):
771 # Not stored in v1 index files
772 return None
775class PackIndex2(FilePackIndex):
776 """Version 2 Pack Index file."""
778 def __init__(self, filename: str, file=None, contents=None, size=None) -> None:
779 super().__init__(filename, file, contents, size)
780 if self._contents[:4] != b"\377tOc":
781 raise AssertionError("Not a v2 pack index file")
782 (self.version,) = unpack_from(b">L", self._contents, 4)
783 if self.version != 2:
784 raise AssertionError("Version was %d" % self.version)
785 self._fan_out_table = self._read_fan_out_table(8)
786 self._name_table_offset = 8 + 0x100 * 4
787 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
788 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
789 self._pack_offset_largetable_offset = self._pack_offset_table_offset + 4 * len(
790 self
791 )
793 def _unpack_entry(self, i):
794 return (
795 self._unpack_name(i),
796 self._unpack_offset(i),
797 self._unpack_crc32_checksum(i),
798 )
800 def _unpack_name(self, i):
801 offset = self._name_table_offset + i * 20
802 return self._contents[offset : offset + 20]
804 def _unpack_offset(self, i):
805 offset = self._pack_offset_table_offset + i * 4
806 offset = unpack_from(">L", self._contents, offset)[0]
807 if offset & (2**31):
808 offset = self._pack_offset_largetable_offset + (offset & (2**31 - 1)) * 8
809 offset = unpack_from(">Q", self._contents, offset)[0]
810 return offset
812 def _unpack_crc32_checksum(self, i):
813 return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0]
816def read_pack_header(read) -> Tuple[int, int]:
817 """Read the header of a pack file.
819 Args:
820 read: Read function
821 Returns: Tuple of (pack version, number of objects). Raises an
822 AssertionError if the header is missing or not a valid pack header.
823 """
824 header = read(12)
825 if not header:
826 raise AssertionError("file too short to contain pack")
827 if header[:4] != b"PACK":
828 raise AssertionError(f"Invalid pack header {header!r}")
829 (version,) = unpack_from(b">L", header, 4)
830 if version not in (2, 3):
831 raise AssertionError("Version was %d" % version)
832 (num_objects,) = unpack_from(b">L", header, 8)
833 return (version, num_objects)
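# Illustrative sketch: a pack starts with b"PACK", a 4-byte big-endian version
# (2 or 3) and a 4-byte big-endian object count; read_pack_header() parses
# exactly those 12 bytes. The header below is constructed just for the example.
def _example_read_pack_header() -> None:
    header = b"PACK" + struct.pack(">L", 2) + struct.pack(">L", 3)
    version, num_objects = read_pack_header(BytesIO(header).read)
    assert (version, num_objects) == (2, 3)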
836def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int:
837 if isinstance(chunks, bytes):
838 return len(chunks)
839 else:
840 return sum(map(len, chunks))
843def unpack_object(
844 read_all: Callable[[int], bytes],
845 read_some: Optional[Callable[[int], bytes]] = None,
846 compute_crc32=False,
847 include_comp=False,
848 zlib_bufsize=_ZLIB_BUFSIZE,
849) -> Tuple[UnpackedObject, bytes]:
850 """Unpack a Git object.
852 Args:
853 read_all: Read function that blocks until the number of requested
854 bytes are read.
855 read_some: Read function that returns at least one byte, but may not
856 return the number of bytes requested.
857 compute_crc32: If True, compute the CRC32 of the compressed data. If
858 False, the returned CRC32 will be None.
859 include_comp: If True, include compressed data in the result.
860 zlib_bufsize: An optional buffer size for zlib operations.
861 Returns: A tuple of (unpacked, unused), where unused is the unused data
862 leftover from decompression, and unpacked in an UnpackedObject with
863 the following attrs set:
865 * obj_chunks (for non-delta types)
866 * pack_type_num
867 * delta_base (for delta types)
868 * comp_chunks (if include_comp is True)
869 * decomp_chunks
870 * decomp_len
871 * crc32 (if compute_crc32 is True)
872 """
873 if read_some is None:
874 read_some = read_all
875 if compute_crc32:
876 crc32 = 0
877 else:
878 crc32 = None
880 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
881 type_num = (raw[0] >> 4) & 0x07
882 size = raw[0] & 0x0F
883 for i, byte in enumerate(raw[1:]):
884 size += (byte & 0x7F) << ((i * 7) + 4)
886 delta_base: Union[int, bytes, None]
887 raw_base = len(raw)
888 if type_num == OFS_DELTA:
889 raw, crc32 = take_msb_bytes(read_all, crc32=crc32)
890 raw_base += len(raw)
891 if raw[-1] & 0x80:
892 raise AssertionError
893 delta_base_offset = raw[0] & 0x7F
894 for byte in raw[1:]:
895 delta_base_offset += 1
896 delta_base_offset <<= 7
897 delta_base_offset += byte & 0x7F
898 delta_base = delta_base_offset
899 elif type_num == REF_DELTA:
900 delta_base_obj = read_all(20)
901 if crc32 is not None:
902 crc32 = binascii.crc32(delta_base_obj, crc32)
903 delta_base = delta_base_obj
904 raw_base += 20
905 else:
906 delta_base = None
908 unpacked = UnpackedObject(
909 type_num, delta_base=delta_base, decomp_len=size, crc32=crc32
910 )
911 unused = read_zlib_chunks(
912 read_some,
913 unpacked,
914 buffer_size=zlib_bufsize,
915 include_comp=include_comp,
916 )
917 return unpacked, unused
920def _compute_object_size(value):
921 """Compute the size of a unresolved object for use with LRUSizeCache."""
922 (num, obj) = value
923 if num in DELTA_TYPES:
924 return chunks_length(obj[1])
925 return chunks_length(obj)
928class PackStreamReader:
929 """Class to read a pack stream.
931 The pack is read from a ReceivableProtocol using read() or recv() as
932 appropriate.
933 """
935 def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE) -> None:
936 self.read_all = read_all
937 if read_some is None:
938 self.read_some = read_all
939 else:
940 self.read_some = read_some
941 self.sha = sha1()
942 self._offset = 0
943 self._rbuf = BytesIO()
944 # trailer is a deque to avoid memory allocation on small reads
945 self._trailer: Deque[bytes] = deque()
946 self._zlib_bufsize = zlib_bufsize
948 def _read(self, read, size):
949 """Read up to size bytes using the given callback.
951 As a side effect, update the verifier's hash (excluding the last 20
952 bytes read).
954 Args:
955 read: The read callback to read from.
956 size: The maximum number of bytes to read; the particular
957 behavior is callback-specific.
958 """
959 data = read(size)
961 # maintain a trailer of the last 20 bytes we've read
962 n = len(data)
963 self._offset += n
964 tn = len(self._trailer)
965 if n >= 20:
966 to_pop = tn
967 to_add = 20
968 else:
969 to_pop = max(n + tn - 20, 0)
970 to_add = n
971 self.sha.update(
972 bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)]))
973 )
974 self._trailer.extend(data[-to_add:])
976 # hash everything but the trailer
977 self.sha.update(data[:-to_add])
978 return data
980 def _buf_len(self):
981 buf = self._rbuf
982 start = buf.tell()
983 buf.seek(0, SEEK_END)
984 end = buf.tell()
985 buf.seek(start)
986 return end - start
988 @property
989 def offset(self):
990 return self._offset - self._buf_len()
992 def read(self, size):
993 """Read, blocking until size bytes are read."""
994 buf_len = self._buf_len()
995 if buf_len >= size:
996 return self._rbuf.read(size)
997 buf_data = self._rbuf.read()
998 self._rbuf = BytesIO()
999 return buf_data + self._read(self.read_all, size - buf_len)
1001 def recv(self, size):
1002 """Read up to size bytes, blocking until one byte is read."""
1003 buf_len = self._buf_len()
1004 if buf_len:
1005 data = self._rbuf.read(size)
1006 if size >= buf_len:
1007 self._rbuf = BytesIO()
1008 return data
1009 return self._read(self.read_some, size)
1011 def __len__(self) -> int:
1012 return self._num_objects
1014 def read_objects(self, compute_crc32=False) -> Iterator[UnpackedObject]:
1015 """Read the objects in this pack file.
1017 Args:
1018 compute_crc32: If True, compute the CRC32 of the compressed
1019 data. If False, the returned CRC32 will be None.
1020 Returns: Iterator over UnpackedObjects with the following members set:
1021 offset
1022 obj_type_num
1023 obj_chunks (for non-delta types)
1024 delta_base (for delta types)
1025 decomp_chunks
1026 decomp_len
1027 crc32 (if compute_crc32 is True)
1029 Raises:
1030 ChecksumMismatch: if the checksum of the pack contents does not
1031 match the checksum in the pack trailer.
1032 zlib.error: if an error occurred during zlib decompression.
1033 IOError: if an error occurred writing to the output file.
1034 """
1035 pack_version, self._num_objects = read_pack_header(self.read)
1037 for i in range(self._num_objects):
1038 offset = self.offset
1039 unpacked, unused = unpack_object(
1040 self.read,
1041 read_some=self.recv,
1042 compute_crc32=compute_crc32,
1043 zlib_bufsize=self._zlib_bufsize,
1044 )
1045 unpacked.offset = offset
1047 # prepend any unused data to current read buffer
1048 buf = BytesIO()
1049 buf.write(unused)
1050 buf.write(self._rbuf.read())
1051 buf.seek(0)
1052 self._rbuf = buf
1054 yield unpacked
1056 if self._buf_len() < 20:
1057 # If the read buffer is full, then the last read() got the whole
1058 # trailer off the wire. If not, it means there is still some of the
1059 # trailer to read. We need to read() all 20 bytes; N come from the
1060 # read buffer and (20 - N) come from the wire.
1061 self.read(20)
1063 pack_sha = bytearray(self._trailer) # type: ignore
1064 if pack_sha != self.sha.digest():
1065 raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
1068class PackStreamCopier(PackStreamReader):
1069 """Class to verify a pack stream as it is being read.
1071 The pack is read from a ReceivableProtocol using read() or recv() as
1072 appropriate and written out to the given file-like object.
1073 """
1075 def __init__(self, read_all, read_some, outfile, delta_iter=None) -> None:
1076 """Initialize the copier.
1078 Args:
1079 read_all: Read function that blocks until the number of
1080 requested bytes are read.
1081 read_some: Read function that returns at least one byte, but may
1082 not return the number of bytes requested.
1083 outfile: File-like object to write output through.
1084 delta_iter: Optional DeltaChainIterator to record deltas as we
1085 read them.
1086 """
1087 super().__init__(read_all, read_some=read_some)
1088 self.outfile = outfile
1089 self._delta_iter = delta_iter
1091 def _read(self, read, size):
1092 """Read data from the read callback and write it to the file."""
1093 data = super()._read(read, size)
1094 self.outfile.write(data)
1095 return data
1097 def verify(self, progress=None):
1098 """Verify a pack stream and write it to the output file.
1100 See PackStreamReader.read_objects for a list of exceptions this may
1101 throw.
1102 """
1103 i = 0 # default count of entries if read_objects() is empty
1104 for i, unpacked in enumerate(self.read_objects()):
1105 if self._delta_iter:
1106 self._delta_iter.record(unpacked)
1107 if progress is not None:
1108 progress(
1109 ("copying pack entries: %d/%d\r" % (i, len(self))).encode("ascii")
1110 )
1111 if progress is not None:
1112 progress(("copied %d pack entries\n" % i).encode("ascii"))
1115def obj_sha(type, chunks):
1116 """Compute the SHA for a numeric type and object chunks."""
1117 sha = sha1()
1118 sha.update(object_header(type, chunks_length(chunks)))
1119 if isinstance(chunks, bytes):
1120 sha.update(chunks)
1121 else:
1122 for chunk in chunks:
1123 sha.update(chunk)
1124 return sha.digest()
1127def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
1128 """Hash a portion of a file into a new SHA.
1130 Args:
1131 f: A file-like object to read from that supports seek().
1132 start_ofs: The offset in the file to start reading at.
1133 end_ofs: The offset in the file to end reading at, relative to the
1134 end of the file.
1135 buffer_size: A buffer size for reading.
1136 Returns: A new SHA object updated with data read from the file.
1137 """
1138 sha = sha1()
1139 f.seek(0, SEEK_END)
1140 length = f.tell()
1141 if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
1142 raise AssertionError(
1143 "Attempt to read beyond file length. "
1144 "start_ofs: %d, end_ofs: %d, file length: %d" % (start_ofs, end_ofs, length)
1145 )
1146 todo = length + end_ofs - start_ofs
1147 f.seek(start_ofs)
1148 while todo:
1149 data = f.read(min(todo, buffer_size))
1150 sha.update(data)
1151 todo -= len(data)
1152 return sha
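# Illustrative sketch: a negative end_ofs excludes trailing bytes from the
# hash, which is how pack checksums skip the 20-byte SHA-1 trailer (see
# PackData.calculate_checksum below). The payload and trailer are made up.
def _example_compute_file_sha() -> bytes:
    f = BytesIO(b"payload" + b"\x00" * 20)  # pretend the zeros are a trailer
    return compute_file_sha(f, end_ofs=-20).digest()  # SHA-1 of b"payload"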
1155class PackData:
1156 """The data contained in a packfile.
1158 Pack files can be accessed both sequentially for exploding a pack, and
1159 directly with the help of an index to retrieve a specific object.
1161 The objects within are either complete or a delta against another.
1163 The header is variable length. If the MSB of a byte is set, the next
1164 byte is also part of the header.
1165 In the first byte the next three bits give the type, which tells you the
1166 kind of object and whether it is a delta; the low four bits are the lowest
1167 bits of the size. In each subsequent byte the low 7 bits are the next more
1168 significant bits of the size, so the last header byte holds the MS bits.
1170 For the complete objects the data is stored as zlib deflated data.
1171 The size in the header is the uncompressed object size, so to uncompress
1172 you need to just keep feeding data to zlib until you get an object back,
1173 or it errors on bad data. This is done here by just giving the complete
1174 buffer from the start of the deflated object on. This is bad, but until I
1175 get mmap sorted out it will have to do.
1177 Currently there are no integrity checks done. Also no attempt is made to
1178 try and detect the delta case, or a request for an object at the wrong
1179 position. It will all just throw a zlib or KeyError.
1180 """
1182 def __init__(self, filename, file=None, size=None) -> None:
1183 """Create a PackData object representing the pack in the given filename.
1185 The file must exist and stay readable until the object is disposed of.
1186 It must also stay the same size. It will be mapped whenever needed.
1188 Currently there is a restriction on the size of the pack as the python
1189 mmap implementation is flawed.
1190 """
1191 self._filename = filename
1192 self._size = size
1193 self._header_size = 12
1194 if file is None:
1195 self._file = GitFile(self._filename, "rb")
1196 else:
1197 self._file = file
1198 (version, self._num_objects) = read_pack_header(self._file.read)
1199 self._offset_cache = LRUSizeCache[int, Tuple[int, OldUnpackedObject]](
1200 1024 * 1024 * 20, compute_size=_compute_object_size
1201 )
1203 @property
1204 def filename(self):
1205 return os.path.basename(self._filename)
1207 @property
1208 def path(self):
1209 return self._filename
1211 @classmethod
1212 def from_file(cls, file, size=None):
1213 return cls(str(file), file=file, size=size)
1215 @classmethod
1216 def from_path(cls, path):
1217 return cls(filename=path)
1219 def close(self):
1220 self._file.close()
1222 def __enter__(self):
1223 return self
1225 def __exit__(self, exc_type, exc_val, exc_tb):
1226 self.close()
1228 def __eq__(self, other):
1229 if isinstance(other, PackData):
1230 return self.get_stored_checksum() == other.get_stored_checksum()
1231 return False
1233 def _get_size(self):
1234 if self._size is not None:
1235 return self._size
1236 self._size = os.path.getsize(self._filename)
1237 if self._size < self._header_size:
1238 errmsg = "%s is too small for a packfile (%d < %d)" % (
1239 self._filename,
1240 self._size,
1241 self._header_size,
1242 )
1243 raise AssertionError(errmsg)
1244 return self._size
1246 def __len__(self) -> int:
1247 """Returns the number of objects in this pack."""
1248 return self._num_objects
1250 def calculate_checksum(self):
1251 """Calculate the checksum for this pack.
1253 Returns: 20-byte binary SHA1 digest
1254 """
1255 return compute_file_sha(self._file, end_ofs=-20).digest()
1257 def iter_unpacked(self, *, include_comp: bool = False):
1258 self._file.seek(self._header_size)
1260 if self._num_objects is None:
1261 return
1263 for _ in range(self._num_objects):
1264 offset = self._file.tell()
1265 unpacked, unused = unpack_object(
1266 self._file.read, compute_crc32=False, include_comp=include_comp
1267 )
1268 unpacked.offset = offset
1269 yield unpacked
1270 # Back up over unused data.
1271 self._file.seek(-len(unused), SEEK_CUR)
1273 def iterentries(
1274 self, progress=None, resolve_ext_ref: Optional[ResolveExtRefFn] = None
1275 ):
1276 """Yield entries summarizing the contents of this pack.
1278 Args:
1279 progress: Progress function, called with current and total
1280 object count.
1281 Returns: iterator of tuples with (sha, offset, crc32)
1282 """
1283 num_objects = self._num_objects
1284 indexer = PackIndexer.for_pack_data(self, resolve_ext_ref=resolve_ext_ref)
1285 for i, result in enumerate(indexer):
1286 if progress is not None:
1287 progress(i, num_objects)
1288 yield result
1290 def sorted_entries(
1291 self,
1292 progress: Optional[ProgressFn] = None,
1293 resolve_ext_ref: Optional[ResolveExtRefFn] = None,
1294 ):
1295 """Return entries in this pack, sorted by SHA.
1297 Args:
1298 progress: Progress function, called with current and total
1299 object count
1300 Returns: Iterator of tuples with (sha, offset, crc32)
1301 """
1302 return sorted(
1303 self.iterentries(progress=progress, resolve_ext_ref=resolve_ext_ref)
1304 )
1306 def create_index_v1(self, filename, progress=None, resolve_ext_ref=None):
1307 """Create a version 1 file for this data file.
1309 Args:
1310 filename: Index filename.
1311 progress: Progress report function
1312 Returns: Checksum of index file
1313 """
1314 entries = self.sorted_entries(
1315 progress=progress, resolve_ext_ref=resolve_ext_ref
1316 )
1317 with GitFile(filename, "wb") as f:
1318 return write_pack_index_v1(f, entries, self.calculate_checksum())
1320 def create_index_v2(self, filename, progress=None, resolve_ext_ref=None):
1321 """Create a version 2 index file for this data file.
1323 Args:
1324 filename: Index filename.
1325 progress: Progress report function
1326 Returns: Checksum of index file
1327 """
1328 entries = self.sorted_entries(
1329 progress=progress, resolve_ext_ref=resolve_ext_ref
1330 )
1331 with GitFile(filename, "wb") as f:
1332 return write_pack_index_v2(f, entries, self.calculate_checksum())
1334 def create_index(self, filename, progress=None, version=2, resolve_ext_ref=None):
1335 """Create an index file for this data file.
1337 Args:
1338 filename: Index filename.
1339 progress: Progress report function
1340 Returns: Checksum of index file
1341 """
1342 if version == 1:
1343 return self.create_index_v1(
1344 filename, progress, resolve_ext_ref=resolve_ext_ref
1345 )
1346 elif version == 2:
1347 return self.create_index_v2(
1348 filename, progress, resolve_ext_ref=resolve_ext_ref
1349 )
1350 else:
1351 raise ValueError("unknown index format %d" % version)
1353 def get_stored_checksum(self):
1354 """Return the expected checksum stored in this pack."""
1355 self._file.seek(-20, SEEK_END)
1356 return self._file.read(20)
1358 def check(self):
1359 """Check the consistency of this pack."""
1360 actual = self.calculate_checksum()
1361 stored = self.get_stored_checksum()
1362 if actual != stored:
1363 raise ChecksumMismatch(stored, actual)
1365 def get_unpacked_object_at(
1366 self, offset: int, *, include_comp: bool = False
1367 ) -> UnpackedObject:
1368 """Given offset in the packfile return a UnpackedObject."""
1369 assert offset >= self._header_size
1370 self._file.seek(offset)
1371 unpacked, _ = unpack_object(self._file.read, include_comp=include_comp)
1372 unpacked.offset = offset
1373 return unpacked
1375 def get_object_at(self, offset: int) -> Tuple[int, OldUnpackedObject]:
1376 """Given an offset in to the packfile return the object that is there.
1378 Using the associated index the location of an object can be looked up,
1379 and then the packfile can be asked directly for that object using this
1380 function.
1381 """
1382 try:
1383 return self._offset_cache[offset]
1384 except KeyError:
1385 pass
1386 unpacked = self.get_unpacked_object_at(offset, include_comp=False)
1387 return (unpacked.pack_type_num, unpacked._obj())
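# Illustrative sketch: combining a pack index with PackData to fetch the raw
# contents of one object. The paths are hypothetical, and the object is
# assumed not to be stored as a delta (deltas need their base resolved first,
# e.g. via the DeltaChainIterator subclasses below).
def _example_read_object(idx_path, pack_path, hex_sha: bytes) -> bytes:
    index = load_pack_index(idx_path)
    with PackData(pack_path) as data:
        type_num, chunks = data.get_object_at(index.object_offset(hex_sha))
        assert type_num not in DELTA_TYPES
        return b"".join(chunks)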
1390T = TypeVar("T")
1393class DeltaChainIterator(Generic[T]):
1394 """Abstract iterator over pack data based on delta chains.
1396 Each object in the pack is guaranteed to be inflated exactly once,
1397 regardless of how many objects reference it as a delta base. As a result,
1398 memory usage is proportional to the length of the longest delta chain.
1400 Subclasses can override _result to define the result type of the iterator.
1401 By default, results are UnpackedObjects with the following members set:
1403 * offset
1404 * obj_type_num
1405 * obj_chunks
1406 * pack_type_num
1407 * delta_base (for delta types)
1408 * comp_chunks (if _include_comp is True)
1409 * decomp_chunks
1410 * decomp_len
1411 * crc32 (if _compute_crc32 is True)
1412 """
1414 _compute_crc32 = False
1415 _include_comp = False
1417 def __init__(self, file_obj, *, resolve_ext_ref=None) -> None:
1418 self._file = file_obj
1419 self._resolve_ext_ref = resolve_ext_ref
1420 self._pending_ofs: Dict[int, List[int]] = defaultdict(list)
1421 self._pending_ref: Dict[bytes, List[int]] = defaultdict(list)
1422 self._full_ofs: List[Tuple[int, int]] = []
1423 self._ext_refs: List[bytes] = []
1425 @classmethod
1426 def for_pack_data(cls, pack_data: PackData, resolve_ext_ref=None):
1427 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1428 walker.set_pack_data(pack_data)
1429 for unpacked in pack_data.iter_unpacked(include_comp=False):
1430 walker.record(unpacked)
1431 return walker
1433 @classmethod
1434 def for_pack_subset(
1435 cls,
1436 pack: "Pack",
1437 shas: Iterable[bytes],
1438 *,
1439 allow_missing: bool = False,
1440 resolve_ext_ref=None,
1441 ):
1442 walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1443 walker.set_pack_data(pack.data)
1444 todo = set()
1445 for sha in shas:
1446 assert isinstance(sha, bytes)
1447 try:
1448 off = pack.index.object_offset(sha)
1449 except KeyError:
1450 if not allow_missing:
1451 raise
1452 else:
1453 todo.add(off)
1454 done = set()
1455 while todo:
1456 off = todo.pop()
1457 unpacked = pack.data.get_unpacked_object_at(off)
1458 walker.record(unpacked)
1459 done.add(off)
1460 base_ofs = None
1461 if unpacked.pack_type_num == OFS_DELTA:
1462 base_ofs = unpacked.offset - unpacked.delta_base
1463 elif unpacked.pack_type_num == REF_DELTA:
1464 with suppress(KeyError):
1465 assert isinstance(unpacked.delta_base, bytes)
1466 base_ofs = pack.index.object_index(unpacked.delta_base)
1467 if base_ofs is not None and base_ofs not in done:
1468 todo.add(base_ofs)
1469 return walker
1471 def record(self, unpacked: UnpackedObject) -> None:
1472 type_num = unpacked.pack_type_num
1473 offset = unpacked.offset
1474 if type_num == OFS_DELTA:
1475 base_offset = offset - unpacked.delta_base
1476 self._pending_ofs[base_offset].append(offset)
1477 elif type_num == REF_DELTA:
1478 assert isinstance(unpacked.delta_base, bytes)
1479 self._pending_ref[unpacked.delta_base].append(offset)
1480 else:
1481 self._full_ofs.append((offset, type_num))
1483 def set_pack_data(self, pack_data: PackData) -> None:
1484 self._file = pack_data._file
1486 def _walk_all_chains(self):
1487 for offset, type_num in self._full_ofs:
1488 yield from self._follow_chain(offset, type_num, None)
1489 yield from self._walk_ref_chains()
1490 assert not self._pending_ofs, repr(self._pending_ofs)
1492 def _ensure_no_pending(self) -> None:
1493 if self._pending_ref:
1494 raise UnresolvedDeltas([sha_to_hex(s) for s in self._pending_ref])
1496 def _walk_ref_chains(self):
1497 if not self._resolve_ext_ref:
1498 self._ensure_no_pending()
1499 return
1501 for base_sha, pending in sorted(self._pending_ref.items()):
1502 if base_sha not in self._pending_ref:
1503 continue
1504 try:
1505 type_num, chunks = self._resolve_ext_ref(base_sha)
1506 except KeyError:
1507 # Not an external ref, but may depend on one. Either it will
1508 # get popped via a _follow_chain call, or we will raise an
1509 # error below.
1510 continue
1511 self._ext_refs.append(base_sha)
1512 self._pending_ref.pop(base_sha)
1513 for new_offset in pending:
1514 yield from self._follow_chain(new_offset, type_num, chunks)
1516 self._ensure_no_pending()
1518 def _result(self, unpacked: UnpackedObject) -> T:
1519 raise NotImplementedError
1521 def _resolve_object(
1522 self, offset: int, obj_type_num: int, base_chunks: List[bytes]
1523 ) -> UnpackedObject:
1524 self._file.seek(offset)
1525 unpacked, _ = unpack_object(
1526 self._file.read,
1527 include_comp=self._include_comp,
1528 compute_crc32=self._compute_crc32,
1529 )
1530 unpacked.offset = offset
1531 if base_chunks is None:
1532 assert unpacked.pack_type_num == obj_type_num
1533 else:
1534 assert unpacked.pack_type_num in DELTA_TYPES
1535 unpacked.obj_type_num = obj_type_num
1536 unpacked.obj_chunks = apply_delta(base_chunks, unpacked.decomp_chunks)
1537 return unpacked
1539 def _follow_chain(self, offset: int, obj_type_num: int, base_chunks: List[bytes]):
1540 # Unlike PackData.get_object_at, there is no need to cache offsets as
1541 # this approach by design inflates each object exactly once.
1542 todo = [(offset, obj_type_num, base_chunks)]
1543 while todo:
1544 (offset, obj_type_num, base_chunks) = todo.pop()
1545 unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1546 yield self._result(unpacked)
1548 unblocked = chain(
1549 self._pending_ofs.pop(unpacked.offset, []),
1550 self._pending_ref.pop(unpacked.sha(), []),
1551 )
1552 todo.extend(
1553 (new_offset, unpacked.obj_type_num, unpacked.obj_chunks) # type: ignore
1554 for new_offset in unblocked
1555 )
1557 def __iter__(self) -> Iterator[T]:
1558 return self._walk_all_chains()
1560 def ext_refs(self):
1561 return self._ext_refs
1564class UnpackedObjectIterator(DeltaChainIterator[UnpackedObject]):
1565 """Delta chain iterator that yield unpacked objects."""
1567 def _result(self, unpacked):
1568 return unpacked
1571class PackIndexer(DeltaChainIterator[PackIndexEntry]):
1572 """Delta chain iterator that yields index entries."""
1574 _compute_crc32 = True
1576 def _result(self, unpacked):
1577 return unpacked.sha(), unpacked.offset, unpacked.crc32
1580class PackInflater(DeltaChainIterator[ShaFile]):
1581 """Delta chain iterator that yields ShaFile objects."""
1583 def _result(self, unpacked):
1584 return unpacked.sha_file()
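# Illustrative sketch: PackInflater walks a PackData file and yields fully
# resolved ShaFile objects, inflating each delta chain exactly once. The path
# is hypothetical and the pack is assumed to be self-contained (no external
# ref deltas, which would otherwise raise UnresolvedDeltas).
def _example_inflate_pack(pack_path) -> List[ShaFile]:
    with PackData(pack_path) as data:
        return list(PackInflater.for_pack_data(data))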
1587class SHA1Reader:
1588 """Wrapper for file-like object that remembers the SHA1 of its data."""
1590 def __init__(self, f) -> None:
1591 self.f = f
1592 self.sha1 = sha1(b"")
1594 def read(self, num=None):
1595 data = self.f.read(num)
1596 self.sha1.update(data)
1597 return data
1599 def check_sha(self):
1600 stored = self.f.read(20)
1601 if stored != self.sha1.digest():
1602 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1604 def close(self):
1605 return self.f.close()
1607 def tell(self):
1608 return self.f.tell()
1611class SHA1Writer:
1612 """Wrapper for file-like object that remembers the SHA1 of its data."""
1614 def __init__(self, f) -> None:
1615 self.f = f
1616 self.length = 0
1617 self.sha1 = sha1(b"")
1619 def write(self, data):
1620 self.sha1.update(data)
1621 self.f.write(data)
1622 self.length += len(data)
1624 def write_sha(self):
1625 sha = self.sha1.digest()
1626 assert len(sha) == 20
1627 self.f.write(sha)
1628 self.length += len(sha)
1629 return sha
1631 def close(self):
1632 sha = self.write_sha()
1633 self.f.close()
1634 return sha
1636 def offset(self):
1637 return self.length
1639 def tell(self):
1640 return self.f.tell()
1643def pack_object_header(type_num, delta_base, size):
1644 """Create a pack object header for the given object info.
1646 Args:
1647 type_num: Numeric type of the object.
1648 delta_base: Delta base offset or ref, or None for whole objects.
1649 size: Uncompressed object size.
1650 Returns: A header for a packed object.
1651 """
1652 header = []
1653 c = (type_num << 4) | (size & 15)
1654 size >>= 4
1655 while size:
1656 header.append(c | 0x80)
1657 c = size & 0x7F
1658 size >>= 7
1659 header.append(c)
1660 if type_num == OFS_DELTA:
1661 ret = [delta_base & 0x7F]
1662 delta_base >>= 7
1663 while delta_base:
1664 delta_base -= 1
1665 ret.insert(0, 0x80 | (delta_base & 0x7F))
1666 delta_base >>= 7
1667 header.extend(ret)
1668 elif type_num == REF_DELTA:
1669 assert len(delta_base) == 20
1670 header += delta_base
1671 return bytearray(header)
1674def pack_object_chunks(type, object, compression_level=-1):
1675 """Generate chunks for a pack object.
1677 Args:
1678 type: Numeric type of the object
1679 object: Object to write
1680 compression_level: the zlib compression level
1681 Returns: Chunks
1682 """
1683 if type in DELTA_TYPES:
1684 delta_base, object = object
1685 else:
1686 delta_base = None
1687 if isinstance(object, bytes):
1688 object = [object]
1689 yield bytes(pack_object_header(type, delta_base, sum(map(len, object))))
1690 compressor = zlib.compressobj(level=compression_level)
1691 for data in object:
1692 yield compressor.compress(data)
1693 yield compressor.flush()
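# Illustrative sketch: pack_object_chunks() yields the object header followed
# by the zlib-compressed body, and unpack_object() can read it back. Blob type
# number 3 and the trailing padding (unpack_object expects data after the
# compressed stream, as a real pack trailer provides) are illustrative.
def _example_pack_roundtrip() -> None:
    data = b"".join(pack_object_chunks(3, b"hello world")) + b"\x00" * 20
    unpacked, _unused = unpack_object(BytesIO(data).read)
    assert unpacked.obj_type_num == 3
    assert b"".join(unpacked.decomp_chunks) == b"hello world"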
1696def write_pack_object(write, type, object, sha=None, compression_level=-1):
1697 """Write pack object to a file.
1699 Args:
1700 write: Write function to use
1701 type: Numeric type of the object
1702 object: Object to write
1703 compression_level: the zlib compression level
1704 Returns: CRC32 checksum of the data written
1705 """
1706 crc32 = 0
1707 for chunk in pack_object_chunks(type, object, compression_level=compression_level):
1708 write(chunk)
1709 if sha is not None:
1710 sha.update(chunk)
1711 crc32 = binascii.crc32(chunk, crc32)
1712 return crc32 & 0xFFFFFFFF
1715def write_pack(
1716 filename,
1717 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]],
1718 *,
1719 deltify: Optional[bool] = None,
1720 delta_window_size: Optional[int] = None,
1721 compression_level: int = -1,
1722):
1723 """Write a new pack data file.
1725 Args:
1726 filename: Path to the new pack file (without .pack extension)
1727 objects: Sequence of objects to write, either ShaFile instances or
1728 (ShaFile, path) tuples
1729 delta_window_size: Delta window size
1730 deltify: Whether to deltify pack objects
1731 compression_level: the zlib compression level
1732 Returns: Tuple with checksum of pack file and index file
1733 """
1734 with GitFile(filename + ".pack", "wb") as f:
1735 entries, data_sum = write_pack_objects(
1736 f.write,
1737 objects,
1738 delta_window_size=delta_window_size,
1739 deltify=deltify,
1740 compression_level=compression_level,
1741 )
1742 entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()])
1743 with GitFile(filename + ".idx", "wb") as f:
1744 return data_sum, write_pack_index_v2(f, entries, data_sum)
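# Illustrative sketch: write_pack() writes "<basename>.pack" and
# "<basename>.idx" for a sequence of (object, path) tuples. The blob and the
# basename are made up for the example.
def _example_write_pack(basename) -> None:
    from dulwich.objects import Blob
    blob = Blob.from_string(b"example contents")
    write_pack(basename, [(blob, None)])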
1747def pack_header_chunks(num_objects):
1748 """Yield chunks for a pack header."""
1749 yield b"PACK" # Pack header
1750 yield struct.pack(b">L", 2) # Pack version
1751 yield struct.pack(b">L", num_objects) # Number of objects in pack
1754def write_pack_header(write, num_objects):
1755 """Write a pack header for the given number of objects."""
1756 if hasattr(write, "write"):
1757 write = write.write
1758 warnings.warn(
1759 "write_pack_header() now takes a write rather than file argument",
1760 DeprecationWarning,
1761 stacklevel=2,
1762 )
1763 for chunk in pack_header_chunks(num_objects):
1764 write(chunk)
1767def find_reusable_deltas(
1768 container: PackedObjectContainer,
1769 object_ids: Set[bytes],
1770 *,
1771 other_haves: Optional[Set[bytes]] = None,
1772 progress=None,
1773) -> Iterator[UnpackedObject]:
1774 if other_haves is None:
1775 other_haves = set()
1776 reused = 0
1777 for i, unpacked in enumerate(
1778 container.iter_unpacked_subset(
1779 object_ids, allow_missing=True, convert_ofs_delta=True
1780 )
1781 ):
1782 if progress is not None and i % 1000 == 0:
1783 progress(
1784 ("checking for reusable deltas: %d/%d\r" % (i, len(object_ids))).encode(
1785 "utf-8"
1786 )
1787 )
1788 if unpacked.pack_type_num == REF_DELTA:
1789 hexsha = sha_to_hex(unpacked.delta_base)
1790 if hexsha in object_ids or hexsha in other_haves:
1791 yield unpacked
1792 reused += 1
1793 if progress is not None:
1794 progress(("found %d deltas to reuse\n" % (reused,)).encode("utf-8"))
1797def deltify_pack_objects(
1798 objects: Union[Iterator[bytes], Iterator[Tuple[ShaFile, Optional[bytes]]]],
1799 *,
1800 window_size: Optional[int] = None,
1801 progress=None,
1802) -> Iterator[UnpackedObject]:
1803 """Generate deltas for pack objects.
1805 Args:
1806 objects: An iterable of (object, path) tuples to deltify.
1807 window_size: Window size; None for default
1808 Returns: Iterator of UnpackedObject entries; delta_base is None for
1809 full-text entries
1810 """
1812 def objects_with_hints():
1813 for e in objects:
1814 if isinstance(e, ShaFile):
1815 yield (e, (e.type_num, None))
1816 else:
1817 yield (e[0], (e[0].type_num, e[1]))
1819 yield from deltas_from_sorted_objects(
1820 sort_objects_for_delta(objects_with_hints()),
1821 window_size=window_size,
1822 progress=progress,
1823 )
1826def sort_objects_for_delta(
1827 objects: Union[Iterator[ShaFile], Iterator[Tuple[ShaFile, Optional[PackHint]]]],
1828) -> Iterator[ShaFile]:
1829 magic = []
1830 for entry in objects:
1831 if isinstance(entry, tuple):
1832 obj, hint = entry
1833 if hint is None:
1834 type_num = None
1835 path = None
1836 else:
1837 (type_num, path) = hint
1838 else:
1839 obj = entry
1840 magic.append((type_num, path, -obj.raw_length(), obj))
1841 # Build a list of objects ordered by the magic Linus heuristic
1842 # This helps us find good objects to diff against us
1843 magic.sort()
1844 return (x[3] for x in magic)
1847def deltas_from_sorted_objects(
1848 objects, window_size: Optional[int] = None, progress=None
1849):
1850 # TODO(jelmer): Use threads
1851 if window_size is None:
1852 window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
1854 possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque()
1855 for i, o in enumerate(objects):
1856 if progress is not None and i % 1000 == 0:
1857 progress(("generating deltas: %d\r" % (i,)).encode("utf-8"))
1858 raw = o.as_raw_chunks()
1859 winner = raw
1860 winner_len = sum(map(len, winner))
1861 winner_base = None
1862 for base_id, base_type_num, base in possible_bases:
1863 if base_type_num != o.type_num:
1864 continue
1865 delta_len = 0
1866 delta = []
1867 for chunk in create_delta(base, raw):
1868 delta_len += len(chunk)
1869 if delta_len >= winner_len:
1870 break
1871 delta.append(chunk)
1872 else:
1873 winner_base = base_id
1874 winner = delta
1875 winner_len = sum(map(len, winner))
1876 yield UnpackedObject(
1877 o.type_num,
1878 sha=o.sha().digest(),
1879 delta_base=winner_base,
1880 decomp_len=winner_len,
1881 decomp_chunks=winner,
1882 )
1883 possible_bases.appendleft((o.sha().digest(), o.type_num, raw))
1884 while len(possible_bases) > window_size:
1885 possible_bases.pop()
1888def pack_objects_to_data(
1889 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]],
1890 *,
1891 deltify: Optional[bool] = None,
1892 delta_window_size: Optional[int] = None,
1893 ofs_delta: bool = True,
1894 progress=None,
1895) -> Tuple[int, Iterator[UnpackedObject]]:
1896 """Create pack data from objects.
1898 Args:
1899 objects: Sequence of ShaFile objects or (object, path) tuples to pack
1900 Returns: Tuple of (object count, iterator over UnpackedObject entries)
1901 """
1902 # TODO(jelmer): support deltaifying
1903 count = len(objects)
1904 if deltify is None:
1905 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
1906 # slow at the moment.
1907 deltify = False
1908 if deltify:
1909 return (
1910 count,
1911 deltify_pack_objects(
1912 iter(objects), # type: ignore
1913 window_size=delta_window_size,
1914 progress=progress,
1915 ),
1916 )
1917 else:
1919 def iter_without_path():
1920 for o in objects:
1921 if isinstance(o, tuple):
1922 yield full_unpacked_object(o[0])
1923 else:
1924 yield full_unpacked_object(o)
1926 return (count, iter_without_path())
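# Illustrative sketch (not part of dulwich): without deltification every
# record comes back as a full-text UnpackedObject, paired with the object
# count that the pack header needs.
def _example_pack_objects_to_data():
    from dulwich.objects import Blob
    blobs = [Blob.from_string(b"hello"), Blob.from_string(b"world")]
    count, records = pack_objects_to_data(blobs)
    assert count == 2
    assert all(rec.delta_base is None for rec in records)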
1929def generate_unpacked_objects(
1930 container: PackedObjectContainer,
1931 object_ids: Sequence[Tuple[ObjectID, Optional[PackHint]]],
1932 delta_window_size: Optional[int] = None,
1933 deltify: Optional[bool] = None,
1934 reuse_deltas: bool = True,
1935 ofs_delta: bool = True,
1936 other_haves: Optional[Set[bytes]] = None,
1937 progress=None,
1938) -> Iterator[UnpackedObject]:
1939 """Create pack data from the objects in a container.
1941 Args:
1942 object_ids: Sequence of (object id, pack hint) tuples to include
1943 Returns: Iterator over UnpackedObject entries
1944 """
1945 todo = dict(object_ids)
1946 if reuse_deltas:
1947 for unpack in find_reusable_deltas(
1948 container, set(todo), other_haves=other_haves, progress=progress
1949 ):
1950 del todo[sha_to_hex(unpack.sha())]
1951 yield unpack
1952 if deltify is None:
1953 # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too
1954 # slow at the moment.
1955 deltify = False
1956 if deltify:
1957 objects_to_delta = container.iterobjects_subset(
1958 todo.keys(), allow_missing=False
1959 )
1960 yield from deltas_from_sorted_objects(
1961 sort_objects_for_delta((o, todo[o.id]) for o in objects_to_delta),
1962 window_size=delta_window_size,
1963 progress=progress,
1964 )
1965 else:
1966 for oid in todo:
1967 yield full_unpacked_object(container[oid])
1970def full_unpacked_object(o: ShaFile) -> UnpackedObject:
1971 return UnpackedObject(
1972 o.type_num,
1973 delta_base=None,
1974 crc32=None,
1975 decomp_chunks=o.as_raw_chunks(),
1976 sha=o.sha().digest(),
1977 )
1980def write_pack_from_container(
1981 write,
1982 container: PackedObjectContainer,
1983 object_ids: Sequence[Tuple[ObjectID, Optional[PackHint]]],
1984 delta_window_size: Optional[int] = None,
1985 deltify: Optional[bool] = None,
1986 reuse_deltas: bool = True,
1987 compression_level: int = -1,
1988 other_haves: Optional[Set[bytes]] = None,
1989):
1990 """Write a new pack data file.
1992 Args:
1993 write: write function to use
1994 container: PackedObjectContainer
1995 object_ids: Sequence of (object id, pack hint) tuples to write
1996 delta_window_size: Sliding window size for searching for deltas;
1997 Set to None for default window size.
1998 deltify: Whether to deltify objects
1999 compression_level: the zlib compression level to use
2000 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2001 """
2002 pack_contents_count = len(object_ids)
2003 pack_contents = generate_unpacked_objects(
2004 container,
2005 object_ids,
2006 delta_window_size=delta_window_size,
2007 deltify=deltify,
2008 reuse_deltas=reuse_deltas,
2009 other_haves=other_haves,
2010 )
2012 return write_pack_data(
2013 write,
2014 pack_contents,
2015 num_records=pack_contents_count,
2016 compression_level=compression_level,
2017 )
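# Hedged sketch (not part of dulwich): stream a pack for a set of objects out
# of any PackedObjectContainer, e.g. an existing Pack or an object store pack.
# `container`, `object_ids` and `path` are hypothetical stand-ins supplied by
# the caller; object_ids is a sequence of (object id, pack hint) tuples.
def _example_write_pack_from_container(container, object_ids, path):
    with open(path, "wb") as f:
        entries, checksum = write_pack_from_container(f.write, container, object_ids)
    return entries, checksum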
2020def write_pack_objects(
2021 write,
2022 objects: Union[Sequence[ShaFile], Sequence[Tuple[ShaFile, Optional[bytes]]]],
2023 *,
2024 delta_window_size: Optional[int] = None,
2025 deltify: Optional[bool] = None,
2026 compression_level: int = -1,
2027):
2028 """Write a new pack data file.
2030 Args:
2031 write: write function to use
2032 objects: Sequence of ShaFile objects or (object, path) tuples to write
2033 delta_window_size: Sliding window size for searching for deltas;
2034 Set to None for default window size.
2035 deltify: Whether to deltify objects
2036 compression_level: the zlib compression level to use
2037 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2038 """
2039 pack_contents_count, pack_contents = pack_objects_to_data(objects, deltify=deltify)
2041 return write_pack_data(
2042 write,
2043 pack_contents,
2044 num_records=pack_contents_count,
2045 compression_level=compression_level,
2046 )
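# Illustrative sketch (not part of dulwich): write a one-object pack into a
# buffer. The returned entries map each binary SHA to (offset, crc32) and the
# returned checksum is the trailing pack SHA-1 that ends the stream.
def _example_write_pack_objects():
    from io import BytesIO
    from dulwich.objects import Blob
    buf = BytesIO()
    entries, checksum = write_pack_objects(
        buf.write, [(Blob.from_string(b"hello"), None)]
    )
    assert buf.getvalue().endswith(checksum)
    assert all(len(sha) == 20 for sha in entries)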
2049class PackChunkGenerator:
2050 def __init__(
2051 self,
2052 num_records=None,
2053 records=None,
2054 progress=None,
2055 compression_level=-1,
2056 reuse_compressed=True,
2057 ) -> None:
2058 self.cs = sha1(b"")
2059 self.entries: Dict[Union[int, bytes], Tuple[int, int]] = {}
2060 self._it = self._pack_data_chunks(
2061 num_records=num_records,
2062 records=records,
2063 progress=progress,
2064 compression_level=compression_level,
2065 reuse_compressed=reuse_compressed,
2066 )
2068 def sha1digest(self):
2069 return self.cs.digest()
2071 def __iter__(self):
2072 return self._it
2074 def _pack_data_chunks(
2075 self,
2076 records: Iterator[UnpackedObject],
2077 *,
2078 num_records=None,
2079 progress=None,
2080 compression_level: int = -1,
2081 reuse_compressed: bool = True,
2082 ) -> Iterator[bytes]:
2083 """Iterate pack data file chunks.
2085 Args:
2086 records: Iterator over UnpackedObject
2087 num_records: Number of records (defaults to len(records) if not specified)
2088 progress: Function to report progress to
2089 compression_level: the zlib compression level
2090 Returns: Iterator over pack file chunks; self.entries and the pack checksum are filled in as the chunks are consumed.
2091 """
2092 # Write the pack
2093 if num_records is None:
2094 num_records = len(records) # type: ignore
2095 offset = 0
2096 for chunk in pack_header_chunks(num_records):
2097 yield chunk
2098 self.cs.update(chunk)
2099 offset += len(chunk)
2100 actual_num_records = 0
2101 for i, unpacked in enumerate(records):
2102 type_num = unpacked.pack_type_num
2103 if progress is not None and i % 1000 == 0:
2104 progress(
2105 ("writing pack data: %d/%d\r" % (i, num_records)).encode("ascii")
2106 )
2107 raw: Union[List[bytes], Tuple[int, List[bytes]], Tuple[bytes, List[bytes]]]
2108 if unpacked.delta_base is not None:
2109 try:
2110 base_offset, base_crc32 = self.entries[unpacked.delta_base]
2111 except KeyError:
2112 type_num = REF_DELTA
2113 assert isinstance(unpacked.delta_base, bytes)
2114 raw = (unpacked.delta_base, unpacked.decomp_chunks)
2115 else:
2116 type_num = OFS_DELTA
2117 raw = (offset - base_offset, unpacked.decomp_chunks)
2118 else:
2119 raw = unpacked.decomp_chunks
2120 if unpacked.comp_chunks is not None and reuse_compressed:
2121 chunks = unpacked.comp_chunks
2122 else:
2123 chunks = pack_object_chunks(
2124 type_num, raw, compression_level=compression_level
2125 )
2126 crc32 = 0
2127 object_size = 0
2128 for chunk in chunks:
2129 yield chunk
2130 crc32 = binascii.crc32(chunk, crc32)
2131 self.cs.update(chunk)
2132 object_size += len(chunk)
2133 actual_num_records += 1
2134 self.entries[unpacked.sha()] = (offset, crc32)
2135 offset += object_size
2136 if actual_num_records != num_records:
2137 raise AssertionError(
2138 "actual records written differs: %d != %d"
2139 % (actual_num_records, num_records)
2140 )
2142 yield self.cs.digest()
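# Illustrative sketch (not part of dulwich): PackChunkGenerator is the
# streaming core behind write_pack_data(); a consumer can forward each chunk
# (e.g. to a socket) and read entries / the pack checksum once exhausted.
def _example_pack_chunk_generator():
    from dulwich.objects import Blob
    records = [full_unpacked_object(Blob.from_string(b"streamed"))]
    gen = PackChunkGenerator(num_records=len(records), records=iter(records))
    total = sum(len(chunk) for chunk in gen)  # a real consumer would send(chunk)
    assert total > 12 and len(gen.sha1digest()) == 20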
2145def write_pack_data(
2146 write,
2147 records: Iterator[UnpackedObject],
2148 *,
2149 num_records=None,
2150 progress=None,
2151 compression_level=-1,
2152):
2153 """Write a new pack data file.
2155 Args:
2156 write: Write function to use
2157 num_records: Number of records (defaults to len(records) if None)
2158 records: Iterator over UnpackedObject entries to write
2159 progress: Function to report progress to
2160 compression_level: the zlib compression level
2161 Returns: Dict mapping id -> (offset, crc32 checksum), pack checksum
2162 """
2163 chunk_generator = PackChunkGenerator(
2164 num_records=num_records,
2165 records=records,
2166 progress=progress,
2167 compression_level=compression_level,
2168 )
2169 for chunk in chunk_generator:
2170 write(chunk)
2171 return chunk_generator.entries, chunk_generator.sha1digest()
2174def write_pack_index_v1(f, entries, pack_checksum):
2175 """Write a new pack index file.
2177 Args:
2178 f: A file-like object to write to
2179 entries: List of tuples with object name (sha), offset_in_pack,
2180 and crc32_checksum.
2181 pack_checksum: Checksum of the pack file.
2182 Returns: The SHA of the written index file
2183 """
2184 f = SHA1Writer(f)
2185 fan_out_table = defaultdict(lambda: 0)
2186 for name, offset, entry_checksum in entries:
2187 fan_out_table[ord(name[:1])] += 1
2188 # Fan-out table
2189 for i in range(0x100):
2190 f.write(struct.pack(">L", fan_out_table[i]))
2191 fan_out_table[i + 1] += fan_out_table[i]
2192 for name, offset, entry_checksum in entries:
2193 if not (offset <= 0xFFFFFFFF):
2194 raise TypeError("pack index format 1 only supports offsets that fit in 32 bits")
2195 f.write(struct.pack(">L20s", offset, name))
2196 assert len(pack_checksum) == 20
2197 f.write(pack_checksum)
2198 return f.write_sha()
2201def _delta_encode_size(size) -> bytes:
2202 ret = bytearray()
2203 c = size & 0x7F
2204 size >>= 7
2205 while size:
2206 ret.append(c | 0x80)
2207 c = size & 0x7F
2208 size >>= 7
2209 ret.append(c)
2210 return bytes(ret)
2213# The length of delta compression copy operations in version 2 packs is limited
2214# to 64K. To copy more, we use several copy operations. Version 3 packs allow
2215# 24-bit lengths in copy operations, but we always make version 2 packs.
2216_MAX_COPY_LEN = 0xFFFF
2219def _encode_copy_operation(start, length):
2220 scratch = bytearray([0x80])
2221 for i in range(4):
2222 if start & 0xFF << i * 8:
2223 scratch.append((start >> i * 8) & 0xFF)
2224 scratch[0] |= 1 << i
2225 for i in range(2):
2226 if length & 0xFF << i * 8:
2227 scratch.append((length >> i * 8) & 0xFF)
2228 scratch[0] |= 1 << (4 + i)
2229 return bytes(scratch)
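# Illustrative sketch (not part of dulwich): the two small encodings above.
# Sizes are 7-bit little-endian varints with the high bit as a continuation
# flag; copy operations set the high bit of the opcode and use its low bits to
# say which offset/length bytes follow (zero bytes are simply omitted).
def _example_delta_opcode_encoding():
    assert _delta_encode_size(0x7F) == b"\x7f"
    assert _delta_encode_size(0x80) == b"\x80\x01"
    # copy 0x56 bytes starting at offset 0x1234 of the base object
    assert _encode_copy_operation(0x1234, 0x56) == b"\x93\x34\x12\x56"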
2232def create_delta(base_buf, target_buf):
2233 """Use python difflib to work out how to transform base_buf to target_buf.
2235 Args:
2236 base_buf: Base buffer
2237 target_buf: Target buffer
2238 """
2239 if isinstance(base_buf, list):
2240 base_buf = b"".join(base_buf)
2241 if isinstance(target_buf, list):
2242 target_buf = b"".join(target_buf)
2243 assert isinstance(base_buf, bytes)
2244 assert isinstance(target_buf, bytes)
2245 # write delta header
2246 yield _delta_encode_size(len(base_buf))
2247 yield _delta_encode_size(len(target_buf))
2248 # write out delta opcodes
2249 seq = SequenceMatcher(isjunk=None, a=base_buf, b=target_buf)
2250 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
2251 # Git patch opcodes don't care about deletes!
2252 # if opcode == 'replace' or opcode == 'delete':
2253 # pass
2254 if opcode == "equal":
2255 # If they are equal, unpacker will use data from base_buf
2256 # Write out an opcode that says what range to use
2257 copy_start = i1
2258 copy_len = i2 - i1
2259 while copy_len > 0:
2260 to_copy = min(copy_len, _MAX_COPY_LEN)
2261 yield _encode_copy_operation(copy_start, to_copy)
2262 copy_start += to_copy
2263 copy_len -= to_copy
2264 if opcode == "replace" or opcode == "insert":
2265 # If we are replacing a range or adding one, then we just
2266 # output it to the stream (prefixed by its size)
2267 s = j2 - j1
2268 o = j1
2269 while s > 127:
2270 yield bytes([127])
2271 yield memoryview(target_buf)[o : o + 127]
2272 s -= 127
2273 o += 127
2274 yield bytes([s])
2275 yield memoryview(target_buf)[o : o + s]
2278def apply_delta(src_buf, delta):
2279 """Based on the similar function in git's patch-delta.c.
2281 Args:
2282 src_buf: Source buffer
2283 delta: Delta instructions
2284 """
2285 if not isinstance(src_buf, bytes):
2286 src_buf = b"".join(src_buf)
2287 if not isinstance(delta, bytes):
2288 delta = b"".join(delta)
2289 out = []
2290 index = 0
2291 delta_length = len(delta)
2293 def get_delta_header_size(delta, index):
2294 size = 0
2295 i = 0
2296 while delta:
2297 cmd = ord(delta[index : index + 1])
2298 index += 1
2299 size |= (cmd & ~0x80) << i
2300 i += 7
2301 if not cmd & 0x80:
2302 break
2303 return size, index
2305 src_size, index = get_delta_header_size(delta, index)
2306 dest_size, index = get_delta_header_size(delta, index)
2307 assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
2308 while index < delta_length:
2309 cmd = ord(delta[index : index + 1])
2310 index += 1
2311 if cmd & 0x80:
2312 cp_off = 0
2313 for i in range(4):
2314 if cmd & (1 << i):
2315 x = ord(delta[index : index + 1])
2316 index += 1
2317 cp_off |= x << (i * 8)
2318 cp_size = 0
2319 # Version 3 packs can contain copy sizes larger than 64K.
2320 for i in range(3):
2321 if cmd & (1 << (4 + i)):
2322 x = ord(delta[index : index + 1])
2323 index += 1
2324 cp_size |= x << (i * 8)
2325 if cp_size == 0:
2326 cp_size = 0x10000
2327 if (
2328 cp_off + cp_size < cp_size
2329 or cp_off + cp_size > src_size
2330 or cp_size > dest_size
2331 ):
2332 break
2333 out.append(src_buf[cp_off : cp_off + cp_size])
2334 elif cmd != 0:
2335 out.append(delta[index : index + cmd])
2336 index += cmd
2337 else:
2338 raise ApplyDeltaError("Invalid opcode 0")
2340 if index != delta_length:
2341 raise ApplyDeltaError(f"delta not empty: {delta[index:]!r}")
2343 if dest_size != chunks_length(out):
2344 raise ApplyDeltaError("dest size incorrect")
2346 return out
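# Illustrative sketch (not part of dulwich): round-trip of the two functions
# above; a delta produced from (base, target) must rebuild target when applied
# back to base, whatever opcodes the SequenceMatcher happened to pick.
def _example_delta_roundtrip():
    base = b"the quick brown fox\n" * 10
    target = base + b"jumps over the lazy dog\n"
    delta = b"".join(create_delta(base, target))
    assert b"".join(apply_delta(base, delta)) == target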
2349def write_pack_index_v2(
2350 f, entries: Iterable[PackIndexEntry], pack_checksum: bytes
2351) -> bytes:
2352 """Write a new pack index file.
2354 Args:
2355 f: File-like object to write to
2356 entries: List of tuples with object name (sha), offset_in_pack, and
2357 crc32_checksum.
2358 pack_checksum: Checksum of the pack file.
2359 Returns: The SHA of the index file written
2360 """
2361 f = SHA1Writer(f)
2362 f.write(b"\377tOc") # Magic!
2363 f.write(struct.pack(">L", 2))
2364 fan_out_table: Dict[int, int] = defaultdict(lambda: 0)
2365 for name, offset, entry_checksum in entries:
2366 fan_out_table[ord(name[:1])] += 1
2367 # Fan-out table
2368 largetable: List[int] = []
2369 for i in range(0x100):
2370 f.write(struct.pack(b">L", fan_out_table[i]))
2371 fan_out_table[i + 1] += fan_out_table[i]
2372 for name, offset, entry_checksum in entries:
2373 f.write(name)
2374 for name, offset, entry_checksum in entries:
2375 f.write(struct.pack(b">L", entry_checksum))
2376 for name, offset, entry_checksum in entries:
2377 if offset < 2**31:
2378 f.write(struct.pack(b">L", offset))
2379 else:
2380 f.write(struct.pack(b">L", 2**31 + len(largetable)))
2381 largetable.append(offset)
2382 for offset in largetable:
2383 f.write(struct.pack(b">Q", offset))
2384 assert len(pack_checksum) == 20
2385 f.write(pack_checksum)
2386 return f.write_sha()
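# Illustrative sketch (not part of dulwich): pair write_pack_objects() with a
# v2 index. Index entries must be (sha, offset, crc32) tuples sorted by SHA,
# and the pack checksum returned by the pack writer ties the two files together.
def _example_write_pack_with_index():
    from io import BytesIO
    from dulwich.objects import Blob
    pack_buf, idx_buf = BytesIO(), BytesIO()
    entries, pack_checksum = write_pack_objects(
        pack_buf.write, [(Blob.from_string(b"indexed"), None)]
    )
    index_entries = sorted((sha, ofs, crc) for sha, (ofs, crc) in entries.items())
    write_pack_index_v2(idx_buf, index_entries, pack_checksum)
    assert idx_buf.getvalue()[:4] == b"\377tOc"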
2389write_pack_index = write_pack_index_v2
2392class Pack:
2393 """A Git pack object."""
2395 _data_load: Optional[Callable[[], PackData]]
2396 _idx_load: Optional[Callable[[], PackIndex]]
2398 _data: Optional[PackData]
2399 _idx: Optional[PackIndex]
2401 def __init__(
2402 self, basename, resolve_ext_ref: Optional[ResolveExtRefFn] = None
2403 ) -> None:
2404 self._basename = basename
2405 self._data = None
2406 self._idx = None
2407 self._idx_path = self._basename + ".idx"
2408 self._data_path = self._basename + ".pack"
2409 self._data_load = lambda: PackData(self._data_path)
2410 self._idx_load = lambda: load_pack_index(self._idx_path)
2411 self.resolve_ext_ref = resolve_ext_ref
2413 @classmethod
2414 def from_lazy_objects(cls, data_fn, idx_fn):
2415 """Create a new pack object from callables to load pack data and
2416 index objects.
2417 """
2418 ret = cls("")
2419 ret._data_load = data_fn
2420 ret._idx_load = idx_fn
2421 return ret
2423 @classmethod
2424 def from_objects(cls, data, idx):
2425 """Create a new pack object from pack data and index objects."""
2426 ret = cls("")
2427 ret._data = data
2428 ret._data_load = None
2429 ret._idx = idx
2430 ret._idx_load = None
2431 ret.check_length_and_checksum()
2432 return ret
2434 def name(self):
2435 """The SHA over the SHAs of the objects in this pack."""
2436 return self.index.objects_sha1()
2438 @property
2439 def data(self) -> PackData:
2440 """The pack data object being used."""
2441 if self._data is None:
2442 assert self._data_load
2443 self._data = self._data_load()
2444 self.check_length_and_checksum()
2445 return self._data
2447 @property
2448 def index(self) -> PackIndex:
2449 """The index being used.
2451 Note: This may be an in-memory index
2452 """
2453 if self._idx is None:
2454 assert self._idx_load
2455 self._idx = self._idx_load()
2456 return self._idx
2458 def close(self):
2459 if self._data is not None:
2460 self._data.close()
2461 if self._idx is not None:
2462 self._idx.close()
2464 def __enter__(self):
2465 return self
2467 def __exit__(self, exc_type, exc_val, exc_tb):
2468 self.close()
2470 def __eq__(self, other):
2471 return isinstance(self, type(other)) and self.index == other.index
2473 def __len__(self) -> int:
2474 """Number of entries in this pack."""
2475 return len(self.index)
2477 def __repr__(self) -> str:
2478 return f"{self.__class__.__name__}({self._basename!r})"
2480 def __iter__(self):
2481 """Iterate over all the sha1s of the objects in this pack."""
2482 return iter(self.index)
2484 def check_length_and_checksum(self) -> None:
2485 """Sanity check the length and checksum of the pack index and data."""
2486 assert len(self.index) == len(
2487 self.data
2488 ), f"Length mismatch: {len(self.index)} (index) != {len(self.data)} (data)"
2489 idx_stored_checksum = self.index.get_pack_checksum()
2490 data_stored_checksum = self.data.get_stored_checksum()
2491 if idx_stored_checksum != data_stored_checksum:
2492 raise ChecksumMismatch(
2493 sha_to_hex(idx_stored_checksum),
2494 sha_to_hex(data_stored_checksum),
2495 )
2497 def check(self) -> None:
2498 """Check the integrity of this pack.
2500 Raises:
2501 ChecksumMismatch: if a checksum for the index or data is wrong
2502 """
2503 self.index.check()
2504 self.data.check()
2505 for obj in self.iterobjects():
2506 obj.check()
2507 # TODO: object connectivity checks
2509 def get_stored_checksum(self) -> bytes:
2510 return self.data.get_stored_checksum()
2512 def pack_tuples(self):
2513 return [(o, None) for o in self.iterobjects()]
2515 def __contains__(self, sha1: bytes) -> bool:
2516 """Check whether this pack contains a particular SHA1."""
2517 try:
2518 self.index.object_offset(sha1)
2519 return True
2520 except KeyError:
2521 return False
2523 def get_raw(self, sha1: bytes) -> Tuple[int, bytes]:
2524 offset = self.index.object_offset(sha1)
2525 obj_type, obj = self.data.get_object_at(offset)
2526 type_num, chunks = self.resolve_object(offset, obj_type, obj)
2527 return type_num, b"".join(chunks)
2529 def __getitem__(self, sha1: bytes) -> ShaFile:
2530 """Retrieve the specified SHA1."""
2531 type, uncomp = self.get_raw(sha1)
2532 return ShaFile.from_raw_string(type, uncomp, sha=sha1)
2534 def iterobjects(self) -> Iterator[ShaFile]:
2535 """Iterate over the objects in this pack."""
2536 return iter(
2537 PackInflater.for_pack_data(self.data, resolve_ext_ref=self.resolve_ext_ref)
2538 )
2540 def iterobjects_subset(
2541 self, shas: Iterable[ObjectID], *, allow_missing: bool = False
2542 ) -> Iterator[ShaFile]:
2543 return (
2544 uo
2545 for uo in PackInflater.for_pack_subset(
2546 self,
2547 shas,
2548 allow_missing=allow_missing,
2549 resolve_ext_ref=self.resolve_ext_ref,
2550 )
2551 if uo.id in shas
2552 )
2554 def iter_unpacked_subset(
2555 self,
2556 shas: Iterable[ObjectID],
2557 *,
2558 include_comp: bool = False,
2559 allow_missing: bool = False,
2560 convert_ofs_delta: bool = False,
2561 ) -> Iterator[UnpackedObject]:
2562 ofs_pending: Dict[int, List[UnpackedObject]] = defaultdict(list)
2563 ofs: Dict[bytes, int] = {}
2564 todo = set(shas)
2565 for unpacked in self.iter_unpacked(include_comp=include_comp):
2566 sha = unpacked.sha()
2567 ofs[unpacked.offset] = sha
2568 hexsha = sha_to_hex(sha)
2569 if hexsha in todo:
2570 if unpacked.pack_type_num == OFS_DELTA:
2571 assert isinstance(unpacked.delta_base, int)
2572 base_offset = unpacked.offset - unpacked.delta_base
2573 try:
2574 unpacked.delta_base = ofs[base_offset]
2575 except KeyError:
2576 ofs_pending[base_offset].append(unpacked)
2577 continue
2578 else:
2579 unpacked.pack_type_num = REF_DELTA
2580 yield unpacked
2581 todo.remove(hexsha)
2582 for child in ofs_pending.pop(unpacked.offset, []):
2583 child.pack_type_num = REF_DELTA
2584 child.delta_base = sha
2585 yield child
2586 assert not ofs_pending
2587 if not allow_missing and todo:
2588 raise UnresolvedDeltas(todo)
2590 def iter_unpacked(self, include_comp=False):
2591 ofs_to_entries = {
2592 ofs: (sha, crc32) for (sha, ofs, crc32) in self.index.iterentries()
2593 }
2594 for unpacked in self.data.iter_unpacked(include_comp=include_comp):
2595 (sha, crc32) = ofs_to_entries[unpacked.offset]
2596 unpacked._sha = sha
2597 unpacked.crc32 = crc32
2598 yield unpacked
2600 def keep(self, msg: Optional[bytes] = None) -> str:
2601 """Add a .keep file for the pack, preventing git from garbage collecting it.
2603 Args:
2604 msg: A message written inside the .keep file; can be used later
2605 to determine whether or not a .keep file is obsolete.
2606 Returns: The path of the .keep file, as a string.
2607 """
2608 keepfile_name = f"{self._basename}.keep"
2609 with GitFile(keepfile_name, "wb") as keepfile:
2610 if msg:
2611 keepfile.write(msg)
2612 keepfile.write(b"\n")
2613 return keepfile_name
2615 def get_ref(self, sha: bytes) -> Tuple[Optional[int], int, OldUnpackedObject]:
2616 """Get the object for a ref SHA, only looking in this pack."""
2617 # TODO: cache these results
2618 try:
2619 offset = self.index.object_offset(sha)
2620 except KeyError:
2621 offset = None
2622 if offset:
2623 type, obj = self.data.get_object_at(offset)
2624 elif self.resolve_ext_ref:
2625 type, obj = self.resolve_ext_ref(sha)
2626 else:
2627 raise KeyError(sha)
2628 return offset, type, obj
2630 def resolve_object(
2631 self, offset: int, type: int, obj, get_ref=None
2632 ) -> Tuple[int, Iterable[bytes]]:
2633 """Resolve an object, following delta chains where necessary.
2635 Returns: Tuple with object type and contents.
2636 """
2637 # Walk down the delta chain, building a stack of deltas to reach
2638 # the requested object.
2639 base_offset = offset
2640 base_type = type
2641 base_obj = obj
2642 delta_stack = []
2643 while base_type in DELTA_TYPES:
2644 prev_offset = base_offset
2645 if get_ref is None:
2646 get_ref = self.get_ref
2647 if base_type == OFS_DELTA:
2648 (delta_offset, delta) = base_obj
2649 # TODO: clean up asserts and replace with nicer error messages
2650 base_offset = base_offset - delta_offset
2651 base_type, base_obj = self.data.get_object_at(base_offset)
2652 assert isinstance(base_type, int)
2653 elif base_type == REF_DELTA:
2654 (basename, delta) = base_obj
2655 assert isinstance(basename, bytes) and len(basename) == 20
2656 base_offset, base_type, base_obj = get_ref(basename)
2657 assert isinstance(base_type, int)
2658 if base_offset == prev_offset: # object is based on itself
2659 raise UnresolvedDeltas(sha_to_hex(basename))
2660 delta_stack.append((prev_offset, base_type, delta))
2662 # Now grab the base object (mustn't be a delta) and apply the
2663 # deltas all the way up the stack.
2664 chunks = base_obj
2665 for prev_offset, delta_type, delta in reversed(delta_stack):
2666 chunks = apply_delta(chunks, delta)
2667 # TODO(dborowitz): This can result in poor performance if
2668 # large base objects are separated from deltas in the pack.
2669 # We should reorganize so that we apply deltas to all
2670 # objects in a chain one after the other to optimize cache
2671 # performance.
2672 if prev_offset is not None:
2673 self.data._offset_cache[prev_offset] = base_type, chunks
2674 return base_type, chunks
2676 def entries(
2677 self, progress: Optional[ProgressFn] = None
2678 ) -> Iterator[PackIndexEntry]:
2679 """Yield entries summarizing the contents of this pack.
2681 Args:
2682 progress: Progress function, called with current and total
2683 object count.
2684 Returns: iterator of tuples with (sha, offset, crc32)
2685 """
2686 return self.data.iterentries(
2687 progress=progress, resolve_ext_ref=self.resolve_ext_ref
2688 )
2690 def sorted_entries(
2691 self, progress: Optional[ProgressFn] = None
2692 ) -> Iterator[PackIndexEntry]:
2693 """Return entries in this pack, sorted by SHA.
2695 Args:
2696 progress: Progress function, called with current and total
2697 object count
2698 Returns: Iterator of tuples with (sha, offset, crc32)
2699 """
2700 return self.data.sorted_entries(
2701 progress=progress, resolve_ext_ref=self.resolve_ext_ref
2702 )
2704 def get_unpacked_object(
2705 self, sha: bytes, *, include_comp: bool = False, convert_ofs_delta: bool = True
2706 ) -> UnpackedObject:
2707 """Get the unpacked object for a sha.
2709 Args:
2710 sha: SHA of object to fetch
2711 include_comp: Whether to include compression data in UnpackedObject
2712 """
2713 offset = self.index.object_offset(sha)
2714 unpacked = self.data.get_unpacked_object_at(offset, include_comp=include_comp)
2715 if unpacked.pack_type_num == OFS_DELTA and convert_ofs_delta:
2716 assert isinstance(unpacked.delta_base, int)
2717 unpacked.delta_base = self.index.object_sha1(offset - unpacked.delta_base)
2718 unpacked.pack_type_num = REF_DELTA
2719 return unpacked
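# Hedged sketch (not part of dulwich): typical read access through the Pack
# class. The basename is a hypothetical path; a matching .pack and .idx file
# are assumed to exist next to each other.
def _example_open_pack():
    basename = "/tmp/example/pack-deadbeef"  # hypothetical, without extension
    with Pack(basename) as pack:
        pack.check_length_and_checksum()
        for hexsha in pack:            # the index iterates hex object ids
            obj = pack[hexsha]         # resolves deltas and inflates the object
            print(obj.type_name, hexsha)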
2722def extend_pack(
2723 f: BinaryIO,
2724 object_ids: Set[ObjectID],
2725 get_raw,
2726 *,
2727 compression_level=-1,
2728 progress=None,
2729) -> Tuple[bytes, List]:
2730 """Extend a pack file with more objects.
2732 The caller should make sure that object_ids does not contain any objects
2733 that are already in the pack.
2734 """
2735 # Update the header with the new number of objects.
2736 f.seek(0)
2737 _version, num_objects = read_pack_header(f.read)
2739 if object_ids:
2740 f.seek(0)
2741 write_pack_header(f.write, num_objects + len(object_ids))
2743 # Must flush before reading (http://bugs.python.org/issue3207)
2744 f.flush()
2746 # Rescan the rest of the pack, computing the SHA with the new header.
2747 new_sha = compute_file_sha(f, end_ofs=-20)
2749 # Must reposition before writing (http://bugs.python.org/issue3207)
2750 f.seek(0, os.SEEK_CUR)
2752 extra_entries = []
2754 # Complete the pack.
2755 for i, object_id in enumerate(object_ids):
2756 if progress is not None:
2757 progress(
2758 ("writing extra base objects: %d/%d\r" % (i, len(object_ids))).encode(
2759 "ascii"
2760 )
2761 )
2762 assert len(object_id) == 20
2763 type_num, data = get_raw(object_id)
2764 offset = f.tell()
2765 crc32 = write_pack_object(
2766 f.write,
2767 type_num,
2768 data,
2769 sha=new_sha,
2770 compression_level=compression_level,
2771 )
2772 extra_entries.append((object_id, offset, crc32))
2773 pack_sha = new_sha.digest()
2774 f.write(pack_sha)
2775 return pack_sha, extra_entries
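# Hedged sketch (not part of dulwich): append missing base objects to an
# existing pack opened read/write. `path`, `missing_ids` and `object_store`
# are hypothetical stand-ins; object_store.get_raw must return (type_num, data)
# for an object id, which is what extend_pack() expects of its get_raw argument.
def _example_extend_pack(path, missing_ids, object_store):
    with open(path, "r+b") as f:
        return extend_pack(f, missing_ids, object_store.get_raw)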
2778try:
2779 from dulwich._pack import ( # type: ignore
2780 apply_delta, # type: ignore
2781 bisect_find_sha, # type: ignore
2782 )
2783except ImportError:
2784 pass