1# object_store.py -- Object store for git objects
2# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
3# and others
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
22
23
24"""Git object store interfaces and implementation."""
25
26import binascii
27import os
28import stat
29import sys
30import time
31import warnings
32from collections.abc import Iterable, Iterator, Sequence
33from contextlib import suppress
34from io import BytesIO
35from typing import (
36 Callable,
37 Optional,
38 Protocol,
39 Union,
40 cast,
41)
42
43from .errors import NotTreeError
44from .file import GitFile
45from .objects import (
46 S_ISGITLINK,
47 ZERO_SHA,
48 Blob,
49 Commit,
50 ObjectID,
51 ShaFile,
52 Tag,
53 Tree,
54 TreeEntry,
55 hex_to_filename,
56 hex_to_sha,
57 object_class,
58 sha_to_hex,
59 valid_hexsha,
60)
61from .pack import (
62 PACK_SPOOL_FILE_MAX_SIZE,
63 ObjectContainer,
64 Pack,
65 PackData,
66 PackedObjectContainer,
67 PackFileDisappeared,
68 PackHint,
69 PackIndexer,
70 PackInflater,
71 PackStreamCopier,
72 UnpackedObject,
73 extend_pack,
74 full_unpacked_object,
75 generate_unpacked_objects,
76 iter_sha1,
77 load_pack_index_file,
78 pack_objects_to_data,
79 write_pack_data,
80 write_pack_index,
81)
82from .protocol import DEPTH_INFINITE
83from .refs import PEELED_TAG_SUFFIX, Ref
84
85INFODIR = "info"
86PACKDIR = "pack"
87
88# use permissions consistent with Git; just readable by everyone
89# TODO: should packs also be non-writable on Windows? if so, that
# would require some rather significant adjustments to the test suite
91PACK_MODE = 0o444 if sys.platform != "win32" else 0o644
92
93# Grace period for cleaning up temporary pack files (in seconds)
94# Matches git's default of 2 weeks
95DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60 # 2 weeks
96
97
98def find_shallow(store, heads, depth):
99 """Find shallow commits according to a given depth.
100
101 Args:
102 store: An ObjectStore for looking up objects.
103 heads: Iterable of head SHAs to start walking from.
104 depth: The depth of ancestors to include. A depth of one includes
105 only the heads themselves.
106 Returns: A tuple of (shallow, not_shallow), sets of SHAs that should be
107 considered shallow and unshallow according to the arguments. Note that
108 these sets may overlap if a commit is reachable along multiple paths.
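
    Example (illustrative sketch; builds a tiny in-memory history instead of
    using a real repository)::

        from dulwich.object_store import MemoryObjectStore, find_shallow
        from dulwich.objects import Commit, Tree

        store = MemoryObjectStore()
        tree = Tree()
        store.add_object(tree)
        parent = None
        for i in range(3):
            commit = Commit()
            commit.tree = tree.id
            commit.parents = [parent] if parent else []
            commit.author = commit.committer = b"Test <test@example.com>"
            commit.author_time = commit.commit_time = i
            commit.author_timezone = commit.commit_timezone = 0
            commit.message = b"commit"
            store.add_object(commit)
            parent = commit.id
        # With depth=2 the head commit is reported as not shallow and its
        # parent becomes the shallow boundary commit.
        shallow, not_shallow = find_shallow(store, [parent], depth=2)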
109 """
110 parents = {}
111 commit_graph = store.get_commit_graph()
112
113 def get_parents(sha):
114 result = parents.get(sha, None)
115 if not result:
116 # Try to use commit graph first if available
117 if commit_graph:
118 graph_parents = commit_graph.get_parents(sha)
119 if graph_parents is not None:
120 result = graph_parents
121 parents[sha] = result
122 return result
123 # Fall back to loading the object
124 result = store[sha].parents
125 parents[sha] = result
126 return result
127
128 todo = [] # stack of (sha, depth)
129 for head_sha in heads:
130 obj = store[head_sha]
131 # Peel tags if necessary
132 while isinstance(obj, Tag):
133 _, sha = obj.object
134 obj = store[sha]
135 if isinstance(obj, Commit):
136 todo.append((obj.id, 1))
137
138 not_shallow = set()
139 shallow = set()
140 while todo:
141 sha, cur_depth = todo.pop()
142 if cur_depth < depth:
143 not_shallow.add(sha)
144 new_depth = cur_depth + 1
145 todo.extend((p, new_depth) for p in get_parents(sha))
146 else:
147 shallow.add(sha)
148
149 return shallow, not_shallow
150
151
152def get_depth(
153 store,
154 head,
155 get_parents=lambda commit: commit.parents,
156 max_depth=None,
157):
158 """Return the current available depth for the given head.
159 For commits with multiple parents, the largest possible depth will be
160 returned.
161
    Args:
      store: Object store to search in
      head: commit to start from
      get_parents: optional function for getting the parents of a commit
      max_depth: maximum depth to search
    Returns: Depth of the deepest known ancestry chain from head (0 if head
      is not present; at most max_depth if given).
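
    Example (illustrative sketch; ``store`` and ``head`` stand for an object
    store and a commit SHA1 from the caller's context)::

        if get_depth(store, head, max_depth=50) < 50:
            print("fewer than 50 commits of history are available")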
166 """
167 if head not in store:
168 return 0
169 current_depth = 1
170 queue = [(head, current_depth)]
171 commit_graph = store.get_commit_graph()
172
173 while queue and (max_depth is None or current_depth < max_depth):
174 e, depth = queue.pop(0)
175 current_depth = max(current_depth, depth)
176
177 # Try to use commit graph for parent lookup if available
178 parents = None
179 if commit_graph:
180 parents = commit_graph.get_parents(e)
181
182 if parents is None:
183 # Fall back to loading the object
184 cmt = store[e]
185 if isinstance(cmt, Tag):
186 _cls, sha = cmt.object
187 cmt = store[sha]
188 parents = get_parents(cmt)
189
190 queue.extend((parent, depth + 1) for parent in parents if parent in store)
191 return current_depth
192
193
class PackContainer(Protocol):
    """Protocol for object stores that support adding pack files."""

    def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]:
196 """Add a new pack."""
197
198
199class BaseObjectStore:
200 """Object store interface."""
201
202 def determine_wants_all(
203 self, refs: dict[Ref, ObjectID], depth: Optional[int] = None
204 ) -> list[ObjectID]:
205 def _want_deepen(sha):
206 if not depth:
207 return False
208 if depth == DEPTH_INFINITE:
209 return True
210 return depth > self._get_depth(sha)
211
212 return [
213 sha
214 for (ref, sha) in refs.items()
215 if (sha not in self or _want_deepen(sha))
216 and not ref.endswith(PEELED_TAG_SUFFIX)
217 and not sha == ZERO_SHA
218 ]
219
220 def contains_loose(self, sha) -> bool:
221 """Check if a particular object is present by SHA1 and is loose."""
222 raise NotImplementedError(self.contains_loose)
223
224 def __contains__(self, sha1: bytes) -> bool:
225 """Check if a particular object is present by SHA1.
226
227 This method makes no distinction between loose and packed objects.
228 """
229 return self.contains_loose(sha1)
230
231 @property
232 def packs(self):
233 """Iterable of pack objects."""
234 raise NotImplementedError
235
236 def get_raw(self, name) -> tuple[int, bytes]:
237 """Obtain the raw text for an object.
238
239 Args:
240 name: sha for the object.
241 Returns: tuple with numeric type and object contents.
242 """
243 raise NotImplementedError(self.get_raw)
244
245 def __getitem__(self, sha1: ObjectID) -> ShaFile:
246 """Obtain an object by SHA1."""
247 type_num, uncomp = self.get_raw(sha1)
248 return ShaFile.from_raw_string(type_num, uncomp, sha=sha1)
249
250 def __iter__(self):
251 """Iterate over the SHAs that are present in this store."""
252 raise NotImplementedError(self.__iter__)
253
254 def add_object(self, obj) -> None:
255 """Add a single object to this object store."""
256 raise NotImplementedError(self.add_object)
257
258 def add_objects(self, objects, progress=None) -> None:
259 """Add a set of objects to this object store.
260
261 Args:
262 objects: Iterable over a list of (object, path) tuples
263 """
264 raise NotImplementedError(self.add_objects)
265
266 def tree_changes(
267 self,
268 source,
269 target,
270 want_unchanged=False,
271 include_trees=False,
272 change_type_same=False,
273 rename_detector=None,
274 paths=None,
275 ):
276 """Find the differences between the contents of two trees.
277
278 Args:
279 source: SHA1 of the source tree
280 target: SHA1 of the target tree
281 want_unchanged: Whether unchanged files should be reported
282 include_trees: Whether to include trees
283 change_type_same: Whether to report files changing
284 type in the same entry.
285 rename_detector: RenameDetector object for detecting renames.
286 paths: Optional list of paths to filter to (as bytes).
287 Returns: Iterator over tuples with
288 (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
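
        Example (illustrative sketch; ``old_tree`` and ``new_tree`` are
        hypothetical tree SHA1s already present in this store)::

            for (paths, modes, shas) in store.tree_changes(old_tree, new_tree):
                old_path, new_path = paths
                if old_path is None:
                    print("added", new_path)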
289 """
290 from .diff_tree import tree_changes
291
292 for change in tree_changes(
293 self,
294 source,
295 target,
296 want_unchanged=want_unchanged,
297 include_trees=include_trees,
298 change_type_same=change_type_same,
299 rename_detector=rename_detector,
300 paths=paths,
301 ):
302 yield (
303 (change.old.path, change.new.path),
304 (change.old.mode, change.new.mode),
305 (change.old.sha, change.new.sha),
306 )
307
308 def iter_tree_contents(self, tree_id, include_trees=False):
309 """Iterate the contents of a tree and all subtrees.
310
311 Iteration is depth-first pre-order, as in e.g. os.walk.
312
313 Args:
314 tree_id: SHA1 of the tree.
315 include_trees: If True, include tree objects in the iteration.
316 Returns: Iterator over TreeEntry namedtuples for all the objects in a
317 tree.
318 """
319 warnings.warn(
320 "Please use dulwich.object_store.iter_tree_contents",
321 DeprecationWarning,
322 stacklevel=2,
323 )
324 return iter_tree_contents(self, tree_id, include_trees=include_trees)
325
326 def iterobjects_subset(
327 self, shas: Iterable[bytes], *, allow_missing: bool = False
328 ) -> Iterator[ShaFile]:
329 for sha in shas:
330 try:
331 yield self[sha]
332 except KeyError:
333 if not allow_missing:
334 raise
335
336 def find_missing_objects(
337 self,
338 haves,
339 wants,
340 shallow=None,
341 progress=None,
342 get_tagged=None,
343 get_parents=lambda commit: commit.parents,
344 ):
345 """Find the missing objects required for a set of revisions.
346
347 Args:
348 haves: Iterable over SHAs already in common.
349 wants: Iterable over SHAs of objects to fetch.
350 shallow: Set of shallow commit SHA1s to skip
351 progress: Simple progress function that will be called with
352 updated progress strings.
353 get_tagged: Function that returns a dict of pointed-to sha ->
354 tag sha for including tags.
355 get_parents: Optional function for getting the parents of a
356 commit.
357 Returns: Iterator over (sha, path) pairs.
358 """
359 warnings.warn("Please use MissingObjectFinder(store)", DeprecationWarning)
360 finder = MissingObjectFinder(
361 self,
362 haves=haves,
363 wants=wants,
364 shallow=shallow,
365 progress=progress,
366 get_tagged=get_tagged,
367 get_parents=get_parents,
368 )
369 return iter(finder)
370
371 def find_common_revisions(self, graphwalker):
372 """Find which revisions this store has in common using graphwalker.
373
374 Args:
375 graphwalker: A graphwalker object.
376 Returns: List of SHAs that are in common
377 """
378 haves = []
379 sha = next(graphwalker)
380 while sha:
381 if sha in self:
382 haves.append(sha)
383 graphwalker.ack(sha)
384 sha = next(graphwalker)
385 return haves
386
387 def generate_pack_data(
388 self, have, want, shallow=None, progress=None, ofs_delta=True
389 ) -> tuple[int, Iterator[UnpackedObject]]:
390 """Generate pack data objects for a set of wants/haves.
391
392 Args:
393 have: List of SHA1s of objects that should not be sent
394 want: List of SHA1s of objects that should be sent
395 shallow: Set of shallow commit SHA1s to skip
396 ofs_delta: Whether OFS deltas can be included
397 progress: Optional progress reporting method
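
        Example (illustrative sketch; ``have`` and ``want`` are lists of
        commit SHA1s known to this store, and the output path is arbitrary)::

            from dulwich.pack import write_pack_data

            count, records = store.generate_pack_data(have, want)
            with open("objects.pack", "wb") as f:
                write_pack_data(f.write, records, num_records=count)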
398 """
399 # Note that the pack-specific implementation below is more efficient,
400 # as it reuses deltas
401 missing_objects = MissingObjectFinder(
402 self, haves=have, wants=want, shallow=shallow, progress=progress
403 )
404 object_ids = list(missing_objects)
405 return pack_objects_to_data(
406 [(self[oid], path) for oid, path in object_ids],
407 ofs_delta=ofs_delta,
408 progress=progress,
409 )
410
411 def peel_sha(self, sha):
412 """Peel all tags from a SHA.
413
414 Args:
415 sha: The object SHA to peel.
416 Returns: The fully-peeled SHA1 of a tag object, after peeling all
417 intermediate tags; if the original ref does not point to a tag,
418 this will equal the original SHA1.
419 """
420 warnings.warn(
421 "Please use dulwich.object_store.peel_sha()",
422 DeprecationWarning,
423 stacklevel=2,
424 )
425 return peel_sha(self, sha)[1]
426
427 def _get_depth(
428 self,
429 head,
430 get_parents=lambda commit: commit.parents,
431 max_depth=None,
432 ):
433 """Return the current available depth for the given head.
434 For commits with multiple parents, the largest possible depth will be
435 returned.
436
437 Args:
438 head: commit to start from
439 get_parents: optional function for getting the parents of a commit
440 max_depth: maximum depth to search
441 """
442 return get_depth(self, head, get_parents=get_parents, max_depth=max_depth)
443
444 def close(self) -> None:
445 """Close any files opened by this object store."""
446 # Default implementation is a NO-OP
447
448 def prune(self, grace_period: Optional[int] = None) -> None:
449 """Prune/clean up this object store.
450
451 This includes removing orphaned temporary files and other
452 housekeeping tasks. Default implementation is a NO-OP.
453
454 Args:
455 grace_period: Grace period in seconds for removing temporary files.
456 If None, uses the default grace period.
457 """
458 # Default implementation is a NO-OP
459
460 def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
461 """Iterate over all SHA1s that start with a given prefix.
462
463 The default implementation is a naive iteration over all objects.
464 However, subclasses may override this method with more efficient
465 implementations.
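
        Example (illustrative; the abbreviated object name is hypothetical)::

            matches = list(store.iter_prefix(b"ab12"))
            if len(matches) == 1:
                obj = store[matches[0]]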
466 """
467 for sha in self:
468 if sha.startswith(prefix):
469 yield sha
470
471 def get_commit_graph(self):
472 """Get the commit graph for this object store.
473
474 Returns:
475 CommitGraph object if available, None otherwise
476 """
477 return None
478
479 def write_commit_graph(self, refs=None, reachable=True) -> None:
480 """Write a commit graph file for this object store.
481
482 Args:
483 refs: List of refs to include. If None, includes all refs from object store.
484 reachable: If True, includes all commits reachable from refs.
485 If False, only includes the direct ref targets.
486
487 Note:
          The base implementation raises NotImplementedError. Subclasses
          should override this method to provide commit graph writing
          functionality.
490 """
491 raise NotImplementedError(self.write_commit_graph)
492
493 def get_object_mtime(self, sha):
494 """Get the modification time of an object.
495
496 Args:
497 sha: SHA1 of the object
498
499 Returns:
500 Modification time as seconds since epoch
501
502 Raises:
503 KeyError: if the object is not found
504 """
505 # Default implementation raises KeyError
506 # Subclasses should override to provide actual mtime
507 raise KeyError(sha)
508
509
class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
    """Base class for object stores that are backed by pack files."""

    def __init__(self, pack_compression_level=-1, pack_index_version=None) -> None:
512 self._pack_cache: dict[str, Pack] = {}
513 self.pack_compression_level = pack_compression_level
514 self.pack_index_version = pack_index_version
515
516 def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]:
517 """Add a new pack to this object store."""
518 raise NotImplementedError(self.add_pack)
519
    def add_pack_data(
        self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None
    ) -> Optional[Pack]:
        """Add pack data to this object store.

        Args:
          count: Number of records to write
          unpacked_objects: Iterator over UnpackedObject instances to write
          progress: Optional progress reporting function
        Returns: Pack object of the objects written, or None if no objects
          were written.
        """
528 if count == 0:
529 # Don't bother writing an empty pack file
530 return
531 f, commit, abort = self.add_pack()
532 try:
533 write_pack_data(
534 f.write,
535 unpacked_objects,
536 num_records=count,
537 progress=progress,
538 compression_level=self.pack_compression_level,
539 )
540 except BaseException:
541 abort()
542 raise
543 else:
544 return commit()
545
546 @property
547 def alternates(self):
548 return []
549
550 def contains_packed(self, sha) -> bool:
551 """Check if a particular object is present by SHA1 and is packed.
552
553 This does not check alternates.
554 """
555 for pack in self.packs:
556 try:
557 if sha in pack:
558 return True
559 except PackFileDisappeared:
560 pass
561 return False
562
563 def __contains__(self, sha) -> bool:
564 """Check if a particular object is present by SHA1.
565
566 This method makes no distinction between loose and packed objects.
567 """
568 if self.contains_packed(sha) or self.contains_loose(sha):
569 return True
570 for alternate in self.alternates:
571 if sha in alternate:
572 return True
573 return False
574
575 def _add_cached_pack(self, base_name, pack) -> None:
576 """Add a newly appeared pack to the cache by path."""
577 prev_pack = self._pack_cache.get(base_name)
578 if prev_pack is not pack:
579 self._pack_cache[base_name] = pack
580 if prev_pack:
581 prev_pack.close()
582
583 def generate_pack_data(
584 self, have, want, shallow=None, progress=None, ofs_delta=True
585 ) -> tuple[int, Iterator[UnpackedObject]]:
586 """Generate pack data objects for a set of wants/haves.
587
588 Args:
589 have: List of SHA1s of objects that should not be sent
590 want: List of SHA1s of objects that should be sent
591 shallow: Set of shallow commit SHA1s to skip
592 ofs_delta: Whether OFS deltas can be included
593 progress: Optional progress reporting method
594 """
595 missing_objects = MissingObjectFinder(
596 self, haves=have, wants=want, shallow=shallow, progress=progress
597 )
598 remote_has = missing_objects.get_remote_has()
599 object_ids = list(missing_objects)
600 return len(object_ids), generate_unpacked_objects(
601 cast(PackedObjectContainer, self),
602 object_ids,
603 progress=progress,
604 ofs_delta=ofs_delta,
605 other_haves=remote_has,
606 )
607
608 def _clear_cached_packs(self) -> None:
609 pack_cache = self._pack_cache
610 self._pack_cache = {}
611 while pack_cache:
612 (name, pack) = pack_cache.popitem()
613 pack.close()
614
615 def _iter_cached_packs(self):
616 return self._pack_cache.values()
617
618 def _update_pack_cache(self) -> list[Pack]:
619 raise NotImplementedError(self._update_pack_cache)
620
621 def close(self) -> None:
622 self._clear_cached_packs()
623
624 @property
625 def packs(self):
626 """List with pack objects."""
627 return list(self._iter_cached_packs()) + list(self._update_pack_cache())
628
629 def count_pack_files(self) -> int:
630 """Count the number of pack files.
631
632 Returns:
633 Number of pack files (excluding those with .keep files)
634 """
635 count = 0
636 for pack in self.packs:
637 # Check if there's a .keep file for this pack
638 keep_path = pack._basename + ".keep"
639 if not os.path.exists(keep_path):
640 count += 1
641 return count
642
643 def _iter_alternate_objects(self):
644 """Iterate over the SHAs of all the objects in alternate stores."""
645 for alternate in self.alternates:
646 yield from alternate
647
648 def _iter_loose_objects(self):
649 """Iterate over the SHAs of all loose objects."""
650 raise NotImplementedError(self._iter_loose_objects)
651
652 def _get_loose_object(self, sha) -> Optional[ShaFile]:
653 raise NotImplementedError(self._get_loose_object)
654
655 def delete_loose_object(self, sha) -> None:
656 """Delete a loose object.
657
658 This method only handles loose objects. For packed objects,
659 use repack(exclude=...) to exclude them during repacking.
660 """
661 raise NotImplementedError(self.delete_loose_object)
662
663 def _remove_pack(self, name) -> None:
664 raise NotImplementedError(self._remove_pack)
665
666 def pack_loose_objects(self):
667 """Pack loose objects.
668
669 Returns: Number of objects packed
670 """
671 objects = set()
672 for sha in self._iter_loose_objects():
673 objects.add((self._get_loose_object(sha), None))
674 self.add_objects(list(objects))
675 for obj, path in objects:
676 self.delete_loose_object(obj.id)
677 return len(objects)
678
679 def repack(self, exclude=None):
680 """Repack the packs in this repository.
681
682 Note that this implementation is fairly naive and currently keeps all
683 objects in memory while it repacks.
684
685 Args:
686 exclude: Optional set of object SHAs to exclude from repacking
687 """
688 if exclude is None:
689 exclude = set()
690
691 loose_objects = set()
692 excluded_loose_objects = set()
693 for sha in self._iter_loose_objects():
694 if sha not in exclude:
695 loose_objects.add(self._get_loose_object(sha))
696 else:
697 excluded_loose_objects.add(sha)
698
699 objects = {(obj, None) for obj in loose_objects}
700 old_packs = {p.name(): p for p in self.packs}
701 for name, pack in old_packs.items():
702 objects.update(
703 (obj, None) for obj in pack.iterobjects() if obj.id not in exclude
704 )
705
706 # Only create a new pack if there are objects to pack
707 if objects:
708 # The name of the consolidated pack might match the name of a
709 # pre-existing pack. Take care not to remove the newly created
710 # consolidated pack.
711 consolidated = self.add_objects(objects)
712 old_packs.pop(consolidated.name(), None)
713
714 # Delete loose objects that were packed
715 for obj in loose_objects:
716 self.delete_loose_object(obj.id)
717 # Delete excluded loose objects
718 for sha in excluded_loose_objects:
719 self.delete_loose_object(sha)
720 for name, pack in old_packs.items():
721 self._remove_pack(pack)
722 self._update_pack_cache()
723 return len(objects)
724
725 def __iter__(self):
726 """Iterate over the SHAs that are present in this store."""
727 self._update_pack_cache()
728 for pack in self._iter_cached_packs():
729 try:
730 yield from pack
731 except PackFileDisappeared:
732 pass
733 yield from self._iter_loose_objects()
734 yield from self._iter_alternate_objects()
735
736 def contains_loose(self, sha):
737 """Check if a particular object is present by SHA1 and is loose.
738
739 This does not check alternates.
740 """
741 return self._get_loose_object(sha) is not None
742
743 def get_raw(self, name):
744 """Obtain the raw fulltext for an object.
745
746 Args:
747 name: sha for the object.
748 Returns: tuple with numeric type and object contents.
749 """
750 if name == ZERO_SHA:
751 raise KeyError(name)
752 if len(name) == 40:
753 sha = hex_to_sha(name)
754 hexsha = name
755 elif len(name) == 20:
756 sha = name
757 hexsha = None
758 else:
759 raise AssertionError(f"Invalid object name {name!r}")
760 for pack in self._iter_cached_packs():
761 try:
762 return pack.get_raw(sha)
763 except (KeyError, PackFileDisappeared):
764 pass
765 if hexsha is None:
766 hexsha = sha_to_hex(name)
767 ret = self._get_loose_object(hexsha)
768 if ret is not None:
769 return ret.type_num, ret.as_raw_string()
770 # Maybe something else has added a pack with the object
771 # in the mean time?
772 for pack in self._update_pack_cache():
773 try:
774 return pack.get_raw(sha)
775 except KeyError:
776 pass
777 for alternate in self.alternates:
778 try:
779 return alternate.get_raw(hexsha)
780 except KeyError:
781 pass
782 raise KeyError(hexsha)
783
784 def iter_unpacked_subset(
785 self,
786 shas: set[bytes],
787 include_comp: bool = False,
788 allow_missing: bool = False,
789 convert_ofs_delta: bool = True,
790 ) -> Iterator[UnpackedObject]:
791 todo: set[bytes] = set(shas)
792 for p in self._iter_cached_packs():
793 for unpacked in p.iter_unpacked_subset(
794 todo,
795 include_comp=include_comp,
796 allow_missing=True,
797 convert_ofs_delta=convert_ofs_delta,
798 ):
799 yield unpacked
800 hexsha = sha_to_hex(unpacked.sha())
801 todo.remove(hexsha)
802 # Maybe something else has added a pack with the object
803 # in the mean time?
804 for p in self._update_pack_cache():
805 for unpacked in p.iter_unpacked_subset(
806 todo,
807 include_comp=include_comp,
808 allow_missing=True,
809 convert_ofs_delta=convert_ofs_delta,
810 ):
811 yield unpacked
812 hexsha = sha_to_hex(unpacked.sha())
813 todo.remove(hexsha)
814 for alternate in self.alternates:
815 for unpacked in alternate.iter_unpacked_subset(
816 todo,
817 include_comp=include_comp,
818 allow_missing=True,
819 convert_ofs_delta=convert_ofs_delta,
820 ):
821 yield unpacked
822 hexsha = sha_to_hex(unpacked.sha())
823 todo.remove(hexsha)
824
825 def iterobjects_subset(
826 self, shas: Iterable[bytes], *, allow_missing: bool = False
827 ) -> Iterator[ShaFile]:
828 todo: set[bytes] = set(shas)
829 for p in self._iter_cached_packs():
830 for o in p.iterobjects_subset(todo, allow_missing=True):
831 yield o
832 todo.remove(o.id)
833 # Maybe something else has added a pack with the object
834 # in the mean time?
835 for p in self._update_pack_cache():
836 for o in p.iterobjects_subset(todo, allow_missing=True):
837 yield o
838 todo.remove(o.id)
839 for alternate in self.alternates:
840 for o in alternate.iterobjects_subset(todo, allow_missing=True):
841 yield o
842 todo.remove(o.id)
843 for oid in todo:
844 o = self._get_loose_object(oid)
845 if o is not None:
846 yield o
847 elif not allow_missing:
848 raise KeyError(oid)
849
850 def get_unpacked_object(
851 self, sha1: bytes, *, include_comp: bool = False
852 ) -> UnpackedObject:
853 """Obtain the unpacked object.
854
855 Args:
856 sha1: sha for the object.
857 """
858 if sha1 == ZERO_SHA:
859 raise KeyError(sha1)
860 if len(sha1) == 40:
861 sha = hex_to_sha(sha1)
862 hexsha = sha1
863 elif len(sha1) == 20:
864 sha = sha1
865 hexsha = None
866 else:
867 raise AssertionError(f"Invalid object sha1 {sha1!r}")
868 for pack in self._iter_cached_packs():
869 try:
870 return pack.get_unpacked_object(sha, include_comp=include_comp)
871 except (KeyError, PackFileDisappeared):
872 pass
873 if hexsha is None:
874 hexsha = sha_to_hex(sha1)
875 # Maybe something else has added a pack with the object
876 # in the mean time?
877 for pack in self._update_pack_cache():
878 try:
879 return pack.get_unpacked_object(sha, include_comp=include_comp)
880 except KeyError:
881 pass
882 for alternate in self.alternates:
883 try:
884 return alternate.get_unpacked_object(hexsha, include_comp=include_comp)
885 except KeyError:
886 pass
887 raise KeyError(hexsha)
888
889 def add_objects(
890 self,
891 objects: Sequence[tuple[ShaFile, Optional[str]]],
892 progress: Optional[Callable[[str], None]] = None,
    ) -> Optional[Pack]:
894 """Add a set of objects to this object store.
895
896 Args:
897 objects: Iterable over (object, path) tuples, should support
898 __len__.
899 Returns: Pack object of the objects written.
900 """
901 count = len(objects)
902 record_iter = (full_unpacked_object(o) for (o, p) in objects)
903 return self.add_pack_data(count, record_iter, progress=progress)
904
905
906class DiskObjectStore(PackBasedObjectStore):
907 """Git-style object store that exists on disk."""
908
909 path: Union[str, os.PathLike]
910 pack_dir: Union[str, os.PathLike]
911
912 def __init__(
913 self,
914 path: Union[str, os.PathLike],
915 loose_compression_level=-1,
916 pack_compression_level=-1,
917 pack_index_version=None,
918 ) -> None:
919 """Open an object store.
920
921 Args:
922 path: Path of the object store.
923 loose_compression_level: zlib compression level for loose objects
924 pack_compression_level: zlib compression level for pack objects
925 pack_index_version: pack index version to use (1, 2, or 3)
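
        Example (illustrative sketch; the repository path is hypothetical)::

            store = DiskObjectStore("/path/to/repo/.git/objects")
            print(store.count_loose_objects(), "loose objects")
            print(store.count_pack_files(), "pack files")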
926 """
927 super().__init__(
928 pack_compression_level=pack_compression_level,
929 pack_index_version=pack_index_version,
930 )
931 self.path = path
932 self.pack_dir = os.path.join(self.path, PACKDIR)
933 self._alternates = None
934 self.loose_compression_level = loose_compression_level
935 self.pack_compression_level = pack_compression_level
936 self.pack_index_version = pack_index_version
937
938 # Commit graph support - lazy loaded
939 self._commit_graph = None
940 self._use_commit_graph = True # Default to true
941
942 def __repr__(self) -> str:
943 return f"<{self.__class__.__name__}({self.path!r})>"
944
945 @classmethod
946 def from_config(cls, path: Union[str, os.PathLike], config):
947 try:
948 default_compression_level = int(
949 config.get((b"core",), b"compression").decode()
950 )
951 except KeyError:
952 default_compression_level = -1
953 try:
954 loose_compression_level = int(
955 config.get((b"core",), b"looseCompression").decode()
956 )
957 except KeyError:
958 loose_compression_level = default_compression_level
959 try:
960 pack_compression_level = int(
                config.get((b"core",), b"packCompression").decode()
962 )
963 except KeyError:
964 pack_compression_level = default_compression_level
965 try:
966 pack_index_version = int(config.get((b"pack",), b"indexVersion").decode())
967 except KeyError:
968 pack_index_version = None
969
970 # Read core.commitGraph setting
971 use_commit_graph = config.get_boolean((b"core",), b"commitGraph", True)
972
973 instance = cls(
974 path, loose_compression_level, pack_compression_level, pack_index_version
975 )
976 instance._use_commit_graph = use_commit_graph
977 return instance
978
979 @property
980 def alternates(self):
981 if self._alternates is not None:
982 return self._alternates
983 self._alternates = []
984 for path in self._read_alternate_paths():
985 self._alternates.append(DiskObjectStore(path))
986 return self._alternates
987
988 def _read_alternate_paths(self):
989 try:
990 f = GitFile(os.path.join(self.path, INFODIR, "alternates"), "rb")
991 except FileNotFoundError:
992 return
993 with f:
994 for line in f.readlines():
995 line = line.rstrip(b"\n")
996 if line.startswith(b"#"):
997 continue
998 if os.path.isabs(line):
999 yield os.fsdecode(line)
1000 else:
1001 yield os.fsdecode(os.path.join(os.fsencode(self.path), line))
1002
1003 def add_alternate_path(self, path) -> None:
1004 """Add an alternate path to this object store."""
1005 try:
1006 os.mkdir(os.path.join(self.path, INFODIR))
1007 except FileExistsError:
1008 pass
1009 alternates_path = os.path.join(self.path, INFODIR, "alternates")
1010 with GitFile(alternates_path, "wb") as f:
1011 try:
1012 orig_f = open(alternates_path, "rb")
1013 except FileNotFoundError:
1014 pass
1015 else:
1016 with orig_f:
1017 f.write(orig_f.read())
1018 f.write(os.fsencode(path) + b"\n")
1019
1020 if not os.path.isabs(path):
1021 path = os.path.join(self.path, path)
1022 self.alternates.append(DiskObjectStore(path))
1023
1024 def _update_pack_cache(self):
1025 """Read and iterate over new pack files and cache them."""
1026 try:
1027 pack_dir_contents = os.listdir(self.pack_dir)
1028 except FileNotFoundError:
1029 self.close()
1030 return []
1031 pack_files = set()
1032 for name in pack_dir_contents:
1033 if name.startswith("pack-") and name.endswith(".pack"):
1034 # verify that idx exists first (otherwise the pack was not yet
1035 # fully written)
1036 idx_name = os.path.splitext(name)[0] + ".idx"
1037 if idx_name in pack_dir_contents:
1038 pack_name = name[: -len(".pack")]
1039 pack_files.add(pack_name)
1040
1041 # Open newly appeared pack files
1042 new_packs = []
1043 for f in pack_files:
1044 if f not in self._pack_cache:
1045 pack = Pack(os.path.join(self.pack_dir, f))
1046 new_packs.append(pack)
1047 self._pack_cache[f] = pack
1048 # Remove disappeared pack files
1049 for f in set(self._pack_cache) - pack_files:
1050 self._pack_cache.pop(f).close()
1051 return new_packs
1052
1053 def _get_shafile_path(self, sha):
1054 # Check from object dir
1055 return hex_to_filename(self.path, sha)
1056
1057 def _iter_loose_objects(self):
1058 for base in os.listdir(self.path):
1059 if len(base) != 2:
1060 continue
1061 for rest in os.listdir(os.path.join(self.path, base)):
1062 sha = os.fsencode(base + rest)
1063 if not valid_hexsha(sha):
1064 continue
1065 yield sha
1066
1067 def count_loose_objects(self) -> int:
1068 """Count the number of loose objects in the object store.
1069
1070 Returns:
1071 Number of loose objects
1072 """
1073 count = 0
1074 if not os.path.exists(self.path):
1075 return 0
1076
1077 for i in range(256):
1078 subdir = os.path.join(self.path, f"{i:02x}")
1079 try:
1080 count += len(
1081 [
1082 name
1083 for name in os.listdir(subdir)
1084 if len(name) == 38 # 40 - 2 for the prefix
1085 ]
1086 )
1087 except FileNotFoundError:
1088 # Directory may have been removed or is inaccessible
1089 continue
1090
1091 return count
1092
1093 def _get_loose_object(self, sha):
1094 path = self._get_shafile_path(sha)
1095 try:
1096 return ShaFile.from_path(path)
1097 except FileNotFoundError:
1098 return None
1099
1100 def delete_loose_object(self, sha) -> None:
1101 os.remove(self._get_shafile_path(sha))
1102
1103 def get_object_mtime(self, sha):
1104 """Get the modification time of an object.
1105
1106 Args:
1107 sha: SHA1 of the object
1108
1109 Returns:
1110 Modification time as seconds since epoch
1111
1112 Raises:
1113 KeyError: if the object is not found
1114 """
1115 # First check if it's a loose object
1116 if self.contains_loose(sha):
1117 path = self._get_shafile_path(sha)
1118 try:
1119 return os.path.getmtime(path)
1120 except FileNotFoundError:
1121 pass
1122
1123 # Check if it's in a pack file
1124 for pack in self.packs:
1125 try:
1126 if sha in pack:
1127 # Use the pack file's mtime for packed objects
1128 pack_path = pack._data_path
1129 try:
1130 return os.path.getmtime(pack_path)
1131 except (FileNotFoundError, AttributeError):
1132 pass
1133 except PackFileDisappeared:
1134 pass
1135
1136 raise KeyError(sha)
1137
1138 def _remove_pack(self, pack) -> None:
1139 try:
1140 del self._pack_cache[os.path.basename(pack._basename)]
1141 except KeyError:
1142 pass
1143 pack.close()
1144 os.remove(pack.data.path)
1145 os.remove(pack.index.path)
1146
1147 def _get_pack_basepath(self, entries):
1148 suffix = iter_sha1(entry[0] for entry in entries)
1149 # TODO: Handle self.pack_dir being bytes
1150 suffix = suffix.decode("ascii")
1151 return os.path.join(self.pack_dir, "pack-" + suffix)
1152
1153 def _complete_pack(self, f, path, num_objects, indexer, progress=None):
1154 """Move a specific file containing a pack into the pack directory.
1155
1156 Note: The file should be on the same file system as the
1157 packs directory.
1158
        Args:
          f: Open file object for the pack.
          path: Path to the pack file.
          num_objects: Number of objects in the pack, used for progress
            reporting.
          indexer: A PackIndexer for indexing the pack.
          progress: Optional progress reporting function.
1163 """
1164 entries = []
1165 for i, entry in enumerate(indexer):
1166 if progress is not None:
1167 progress(f"generating index: {i}/{num_objects}\r".encode("ascii"))
1168 entries.append(entry)
1169
1170 pack_sha, extra_entries = extend_pack(
1171 f,
1172 indexer.ext_refs(),
1173 get_raw=self.get_raw,
1174 compression_level=self.pack_compression_level,
1175 progress=progress,
1176 )
1177 f.flush()
1178 try:
1179 fileno = f.fileno()
1180 except AttributeError:
1181 pass
1182 else:
1183 os.fsync(fileno)
1184 f.close()
1185
1186 entries.extend(extra_entries)
1187
1188 # Move the pack in.
1189 entries.sort()
1190 pack_base_name = self._get_pack_basepath(entries)
1191
1192 for pack in self.packs:
1193 if pack._basename == pack_base_name:
1194 return pack
1195
1196 target_pack_path = pack_base_name + ".pack"
1197 target_index_path = pack_base_name + ".idx"
1198 if sys.platform == "win32":
1199 # Windows might have the target pack file lingering. Attempt
1200 # removal, silently passing if the target does not exist.
1201 with suppress(FileNotFoundError):
1202 os.remove(target_pack_path)
1203 os.rename(path, target_pack_path)
1204
1205 # Write the index.
1206 with GitFile(target_index_path, "wb", mask=PACK_MODE) as index_file:
1207 write_pack_index(
1208 index_file, entries, pack_sha, version=self.pack_index_version
1209 )
1210
1211 # Add the pack to the store and return it.
1212 final_pack = Pack(pack_base_name)
1213 final_pack.check_length_and_checksum()
1214 self._add_cached_pack(pack_base_name, final_pack)
1215 return final_pack
1216
1217 def add_thin_pack(self, read_all, read_some, progress=None):
1218 """Add a new thin pack to this object store.
1219
1220 Thin packs are packs that contain deltas with parents that exist
1221 outside the pack. They should never be placed in the object store
1222 directly, and always indexed and completed as they are copied.
1223
1224 Args:
1225 read_all: Read function that blocks until the number of
1226 requested bytes are read.
1227 read_some: Read function that returns at least one byte, but may
1228 not return the number of bytes requested.
1229 Returns: A Pack object pointing at the now-completed thin pack in the
1230 objects/pack directory.
1231 """
1232 import tempfile
1233
1234 fd, path = tempfile.mkstemp(dir=self.path, prefix="tmp_pack_")
1235 with os.fdopen(fd, "w+b") as f:
1236 os.chmod(path, PACK_MODE)
1237 indexer = PackIndexer(f, resolve_ext_ref=self.get_raw)
1238 copier = PackStreamCopier(read_all, read_some, f, delta_iter=indexer)
1239 copier.verify(progress=progress)
1240 return self._complete_pack(f, path, len(copier), indexer, progress=progress)
1241
1242 def add_pack(self):
1243 """Add a new pack to this object store.
1244
        Returns: Fileobject to write to, a commit function to call when the
          pack is finished, and an abort function to call to discard it.
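
        Example (illustrative sketch; ``pack_bytes`` is assumed to hold a
        complete pack stream, including header and trailing checksum)::

            f, commit, abort = store.add_pack()
            try:
                f.write(pack_bytes)  # pack_bytes: hypothetical pack contents
            except BaseException:
                abort()
                raise
            else:
                commit()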
1248 """
1249 import tempfile
1250
1251 fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
1252 f = os.fdopen(fd, "w+b")
1253 os.chmod(path, PACK_MODE)
1254
1255 def commit():
1256 if f.tell() > 0:
1257 f.seek(0)
1258 with PackData(path, f) as pd:
1259 indexer = PackIndexer.for_pack_data(
1260 pd, resolve_ext_ref=self.get_raw
1261 )
1262 return self._complete_pack(f, path, len(pd), indexer)
1263 else:
1264 f.close()
1265 os.remove(path)
1266 return None
1267
1268 def abort() -> None:
1269 f.close()
1270 os.remove(path)
1271
1272 return f, commit, abort
1273
1274 def add_object(self, obj) -> None:
1275 """Add a single object to this object store.
1276
1277 Args:
1278 obj: Object to add
1279 """
1280 path = self._get_shafile_path(obj.id)
1281 dir = os.path.dirname(path)
1282 try:
1283 os.mkdir(dir)
1284 except FileExistsError:
1285 pass
1286 if os.path.exists(path):
1287 return # Already there, no need to write again
1288 with GitFile(path, "wb", mask=PACK_MODE) as f:
1289 f.write(
1290 obj.as_legacy_object(compression_level=self.loose_compression_level)
1291 )
1292
1293 @classmethod
1294 def init(cls, path: Union[str, os.PathLike]):
1295 try:
1296 os.mkdir(path)
1297 except FileExistsError:
1298 pass
1299 os.mkdir(os.path.join(path, "info"))
1300 os.mkdir(os.path.join(path, PACKDIR))
1301 return cls(path)
1302
1303 def iter_prefix(self, prefix):
1304 if len(prefix) < 2:
1305 yield from super().iter_prefix(prefix)
1306 return
1307 seen = set()
1308 dir = prefix[:2].decode()
1309 rest = prefix[2:].decode()
1310 try:
1311 for name in os.listdir(os.path.join(self.path, dir)):
1312 if name.startswith(rest):
1313 sha = os.fsencode(dir + name)
1314 if sha not in seen:
1315 seen.add(sha)
1316 yield sha
1317 except FileNotFoundError:
1318 pass
1319
1320 for p in self.packs:
1321 bin_prefix = (
1322 binascii.unhexlify(prefix)
1323 if len(prefix) % 2 == 0
1324 else binascii.unhexlify(prefix[:-1])
1325 )
1326 for sha in p.index.iter_prefix(bin_prefix):
1327 sha = sha_to_hex(sha)
1328 if sha.startswith(prefix) and sha not in seen:
1329 seen.add(sha)
1330 yield sha
1331 for alternate in self.alternates:
1332 for sha in alternate.iter_prefix(prefix):
1333 if sha not in seen:
1334 seen.add(sha)
1335 yield sha
1336
1337 def get_commit_graph(self):
1338 """Get the commit graph for this object store.
1339
1340 Returns:
1341 CommitGraph object if available, None otherwise
1342 """
1343 if not self._use_commit_graph:
1344 return None
1345
1346 if self._commit_graph is None:
1347 from .commit_graph import read_commit_graph
1348
1349 # Look for commit graph in our objects directory
1350 graph_file = os.path.join(self.path, "info", "commit-graph")
1351 if os.path.exists(graph_file):
1352 self._commit_graph = read_commit_graph(graph_file)
1353 return self._commit_graph
1354
1355 def write_commit_graph(self, refs=None, reachable=True) -> None:
1356 """Write a commit graph file for this object store.
1357
1358 Args:
1359 refs: List of refs to include. If None, includes all refs from object store.
1360 reachable: If True, includes all commits reachable from refs.
1361 If False, only includes the direct ref targets.
1362 """
1363 from .commit_graph import get_reachable_commits
1364
1365 if refs is None:
1366 # Get all commit objects from the object store
1367 all_refs = []
1368 # Iterate through all objects to find commits
1369 for sha in self:
1370 try:
1371 obj = self[sha]
1372 if obj.type_name == b"commit":
1373 all_refs.append(sha)
1374 except KeyError:
1375 continue
1376 else:
1377 # Use provided refs
1378 all_refs = refs
1379
1380 if not all_refs:
1381 return # No commits to include
1382
1383 if reachable:
1384 # Get all reachable commits
1385 commit_ids = get_reachable_commits(self, all_refs)
1386 else:
1387 # Just use the direct ref targets - ensure they're hex ObjectIDs
1388 commit_ids = []
1389 for ref in all_refs:
1390 if isinstance(ref, bytes) and len(ref) == 40:
1391 # Already hex ObjectID
1392 commit_ids.append(ref)
1393 elif isinstance(ref, bytes) and len(ref) == 20:
1394 # Binary SHA, convert to hex ObjectID
1397 commit_ids.append(sha_to_hex(ref))
1398 else:
1399 # Assume it's already correct format
1400 commit_ids.append(ref)
1401
1402 if commit_ids:
1403 # Write commit graph directly to our object store path
1404 # Generate the commit graph
1405 from .commit_graph import generate_commit_graph
1406
1407 graph = generate_commit_graph(self, commit_ids)
1408
1409 if graph.entries:
1410 # Ensure the info directory exists
1411 info_dir = os.path.join(self.path, "info")
1412 os.makedirs(info_dir, exist_ok=True)
1413
1414 # Write using GitFile for atomic operation
1415 graph_path = os.path.join(info_dir, "commit-graph")
1416 with GitFile(graph_path, "wb") as f:
1417 graph.write_to_file(f)
1418
1419 # Clear cached commit graph so it gets reloaded
1420 self._commit_graph = None
1421
1422 def prune(self, grace_period: Optional[int] = None) -> None:
1423 """Prune/clean up this object store.
1424
1425 This removes temporary files that were left behind by interrupted
1426 pack operations. These are files that start with ``tmp_pack_`` in the
1427 repository directory or files with .pack extension but no corresponding
1428 .idx file in the pack directory.
1429
1430 Args:
1431 grace_period: Grace period in seconds for removing temporary files.
1432 If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD.
1433 """
1434 import glob
1435
1436 if grace_period is None:
1437 grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
1438
1439 # Clean up tmp_pack_* files in the repository directory
1440 for tmp_file in glob.glob(os.path.join(self.path, "tmp_pack_*")):
1441 # Check if file is old enough (more than grace period)
1442 mtime = os.path.getmtime(tmp_file)
1443 if time.time() - mtime > grace_period:
1444 os.remove(tmp_file)
1445
1446 # Clean up orphaned .pack files without corresponding .idx files
1447 try:
1448 pack_dir_contents = os.listdir(self.pack_dir)
1449 except FileNotFoundError:
1450 return
1451
1452 pack_files = {}
1453 idx_files = set()
1454
1455 for name in pack_dir_contents:
1456 if name.endswith(".pack"):
1457 base_name = name[:-5] # Remove .pack extension
1458 pack_files[base_name] = name
1459 elif name.endswith(".idx"):
1460 base_name = name[:-4] # Remove .idx extension
1461 idx_files.add(base_name)
1462
1463 # Remove .pack files without corresponding .idx files
1464 for base_name, pack_name in pack_files.items():
1465 if base_name not in idx_files:
1466 pack_path = os.path.join(self.pack_dir, pack_name)
1467 # Check if file is old enough (more than grace period)
1468 mtime = os.path.getmtime(pack_path)
1469 if time.time() - mtime > grace_period:
1470 os.remove(pack_path)
1471
1472
1473class MemoryObjectStore(BaseObjectStore):
1474 """Object store that keeps all objects in memory."""
1475
1476 def __init__(self) -> None:
1477 super().__init__()
        self._data: dict[bytes, ShaFile] = {}
1479 self.pack_compression_level = -1
1480
1481 def _to_hexsha(self, sha):
1482 if len(sha) == 40:
1483 return sha
1484 elif len(sha) == 20:
1485 return sha_to_hex(sha)
1486 else:
1487 raise ValueError(f"Invalid sha {sha!r}")
1488
1489 def contains_loose(self, sha):
1490 """Check if a particular object is present by SHA1 and is loose."""
1491 return self._to_hexsha(sha) in self._data
1492
1493 def contains_packed(self, sha) -> bool:
1494 """Check if a particular object is present by SHA1 and is packed."""
1495 return False
1496
1497 def __iter__(self):
1498 """Iterate over the SHAs that are present in this store."""
1499 return iter(self._data.keys())
1500
1501 @property
1502 def packs(self):
1503 """List with pack objects."""
1504 return []
1505
1506 def get_raw(self, name: ObjectID):
1507 """Obtain the raw text for an object.
1508
1509 Args:
1510 name: sha for the object.
1511 Returns: tuple with numeric type and object contents.
1512 """
1513 obj = self[self._to_hexsha(name)]
1514 return obj.type_num, obj.as_raw_string()
1515
1516 def __getitem__(self, name: ObjectID):
1517 return self._data[self._to_hexsha(name)].copy()
1518
1519 def __delitem__(self, name: ObjectID) -> None:
1520 """Delete an object from this store, for testing only."""
1521 del self._data[self._to_hexsha(name)]
1522
1523 def add_object(self, obj) -> None:
1524 """Add a single object to this object store."""
1525 self._data[obj.id] = obj.copy()
1526
1527 def add_objects(self, objects, progress=None) -> None:
1528 """Add a set of objects to this object store.
1529
1530 Args:
1531 objects: Iterable over a list of (object, path) tuples
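
        Example (illustrative)::

            from dulwich.objects import Blob

            store = MemoryObjectStore()
            blob = Blob.from_string(b"hello world")
            store.add_objects([(blob, None)])
            assert blob.id in store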
1532 """
1533 for obj, path in objects:
1534 self.add_object(obj)
1535
1536 def add_pack(self):
1537 """Add a new pack to this object store.
1538
1539 Because this object store doesn't support packs, we extract and add the
1540 individual objects.
1541
        Returns: Fileobject to write to, a commit function to call when the
          pack is finished, and an abort function.
1544 """
1545 from tempfile import SpooledTemporaryFile
1546
1547 f = SpooledTemporaryFile(max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-")
1548
1549 def commit() -> None:
1550 size = f.tell()
1551 if size > 0:
1552 f.seek(0)
1553 p = PackData.from_file(f, size)
1554 for obj in PackInflater.for_pack_data(p, self.get_raw):
1555 self.add_object(obj)
1556 p.close()
1557 f.close()
1558 else:
1559 f.close()
1560
1561 def abort() -> None:
1562 f.close()
1563
1564 return f, commit, abort
1565
1566 def add_pack_data(
1567 self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None
1568 ) -> None:
1569 """Add pack data to this object store.
1570
        Args:
          count: Number of records to write
          unpacked_objects: Iterator over UnpackedObject instances to write
          progress: Optional progress reporting function
1573 """
1574 if count == 0:
1575 return
1576
1577 # Since MemoryObjectStore doesn't support pack files, we need to
1578 # extract individual objects. To handle deltas properly, we write
1579 # to a temporary pack and then use PackInflater to resolve them.
1580 f, commit, abort = self.add_pack()
1581 try:
1582 write_pack_data(
1583 f.write,
1584 unpacked_objects,
1585 num_records=count,
1586 progress=progress,
1587 )
1588 except BaseException:
1589 abort()
1590 raise
1591 else:
1592 commit()
1593
1594 def add_thin_pack(self, read_all, read_some, progress=None) -> None:
1595 """Add a new thin pack to this object store.
1596
1597 Thin packs are packs that contain deltas with parents that exist
1598 outside the pack. Because this object store doesn't support packs, we
1599 extract and add the individual objects.
1600
1601 Args:
1602 read_all: Read function that blocks until the number of
1603 requested bytes are read.
1604 read_some: Read function that returns at least one byte, but may
1605 not return the number of bytes requested.
1606 """
1607 f, commit, abort = self.add_pack()
1608 try:
1609 copier = PackStreamCopier(read_all, read_some, f)
1610 copier.verify()
1611 except BaseException:
1612 abort()
1613 raise
1614 else:
1615 commit()
1616
1617
1618class ObjectIterator(Protocol):
1619 """Interface for iterating over objects."""
1620
1621 def iterobjects(self) -> Iterator[ShaFile]:
1622 raise NotImplementedError(self.iterobjects)
1623
1624
1625def tree_lookup_path(lookup_obj, root_sha, path):
1626 """Look up an object in a Git tree.
1627
1628 Args:
1629 lookup_obj: Callback for retrieving object by SHA1
1630 root_sha: SHA1 of the root tree
1631 path: Path to lookup
1632 Returns: A tuple of (mode, SHA) of the resulting path.
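
    Example (illustrative sketch; ``store`` is an object store that contains
    ``root_sha``, and the path below is hypothetical)::

        mode, sha = tree_lookup_path(
            store.__getitem__, root_sha, b"docs/readme.txt"
        )
        blob = store[sha]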
1633 """
1634 tree = lookup_obj(root_sha)
1635 if not isinstance(tree, Tree):
1636 raise NotTreeError(root_sha)
1637 return tree.lookup_path(lookup_obj, path)
1638
1639
1640def _collect_filetree_revs(
1641 obj_store: ObjectContainer, tree_sha: ObjectID, kset: set[ObjectID]
1642) -> None:
1643 """Collect SHA1s of files and directories for specified tree.
1644
1645 Args:
1646 obj_store: Object store to get objects by SHA from
1647 tree_sha: tree reference to walk
1648 kset: set to fill with references to files and directories
1649 """
1650 filetree = obj_store[tree_sha]
1651 assert isinstance(filetree, Tree)
1652 for name, mode, sha in filetree.iteritems():
1653 if not S_ISGITLINK(mode) and sha not in kset:
1654 kset.add(sha)
1655 if stat.S_ISDIR(mode):
1656 _collect_filetree_revs(obj_store, sha, kset)
1657
1658
1659def _split_commits_and_tags(
1660 obj_store: ObjectContainer, lst, *, ignore_unknown=False
1661) -> tuple[set[bytes], set[bytes], set[bytes]]:
1662 """Split object id list into three lists with commit, tag, and other SHAs.
1663
    Commits referenced by tags are included in the commits list as well.
    Only SHA1s known to this repository get through; unless the
    ignore_unknown argument is True, a KeyError is raised for any SHA1
    missing from the repository.
1668
1669 Args:
1670 obj_store: Object store to get objects by SHA1 from
1671 lst: Collection of commit and tag SHAs
1672 ignore_unknown: True to skip SHA1 missing in the repository
1673 silently.
1674 Returns: A tuple of (commits, tags, others) SHA1s
1675 """
1676 commits: set[bytes] = set()
1677 tags: set[bytes] = set()
1678 others: set[bytes] = set()
1679 for e in lst:
1680 try:
1681 o = obj_store[e]
1682 except KeyError:
1683 if not ignore_unknown:
1684 raise
1685 else:
1686 if isinstance(o, Commit):
1687 commits.add(e)
1688 elif isinstance(o, Tag):
1689 tags.add(e)
1690 tagged = o.object[1]
1691 c, t, os = _split_commits_and_tags(
1692 obj_store, [tagged], ignore_unknown=ignore_unknown
1693 )
1694 commits |= c
1695 tags |= t
1696 others |= os
1697 else:
1698 others.add(e)
1699 return (commits, tags, others)
1700
1701
1702class MissingObjectFinder:
1703 """Find the objects missing from another object store.
1704
1705 Args:
1706 object_store: Object store containing at least all objects to be
1707 sent
1708 haves: SHA1s of commits not to send (already present in target)
      wants: SHA1s of commits to send
      shallow: Optional set of shallow commit SHA1s to skip
1710 progress: Optional function to report progress to.
1711 get_tagged: Function that returns a dict of pointed-to sha -> tag
1712 sha for including tags.
1713 get_parents: Optional function for getting the parents of a commit.
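
    Example (illustrative sketch; ``store`` is the local object store and the
    two head SHA1s come from the caller's context)::

        finder = MissingObjectFinder(store, haves=[remote_head], wants=[local_head])
        for sha, pack_hint in finder:
            print(sha)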
1714 """
1715
1716 def __init__(
1717 self,
1718 object_store,
1719 haves,
1720 wants,
1721 *,
1722 shallow=None,
1723 progress=None,
1724 get_tagged=None,
1725 get_parents=lambda commit: commit.parents,
1726 ) -> None:
1727 self.object_store = object_store
1728 if shallow is None:
1729 shallow = set()
1730 self._get_parents = get_parents
1731 # process Commits and Tags differently
        # Note: while haves may list commits/tags not available locally,
        # and such SHAs are filtered out by _split_commits_and_tags, wants
        # must list only known SHAs; otherwise _split_commits_and_tags
        # raises KeyError.
1736 have_commits, have_tags, have_others = _split_commits_and_tags(
1737 object_store, haves, ignore_unknown=True
1738 )
1739 want_commits, want_tags, want_others = _split_commits_and_tags(
1740 object_store, wants, ignore_unknown=False
1741 )
1742 # all_ancestors is a set of commits that shall not be sent
1743 # (complete repository up to 'haves')
1744 all_ancestors = _collect_ancestors(
1745 object_store, have_commits, shallow=shallow, get_parents=self._get_parents
1746 )[0]
1747 # all_missing - complete set of commits between haves and wants
1748 # common - commits from all_ancestors we hit into while
1749 # traversing parent hierarchy of wants
1750 missing_commits, common_commits = _collect_ancestors(
1751 object_store,
1752 want_commits,
1753 all_ancestors,
1754 shallow=shallow,
1755 get_parents=self._get_parents,
1756 )
1757 self.remote_has: set[bytes] = set()
1758 # Now, fill sha_done with commits and revisions of
1759 # files and directories known to be both locally
1760 # and on target. Thus these commits and files
1761 # won't get selected for fetch
1762 for h in common_commits:
1763 self.remote_has.add(h)
1764 cmt = object_store[h]
1765 _collect_filetree_revs(object_store, cmt.tree, self.remote_has)
1766 # record tags we have as visited, too
1767 for t in have_tags:
1768 self.remote_has.add(t)
1769 self.sha_done = set(self.remote_has)
1770
1771 # in fact, what we 'want' is commits, tags, and others
1772 # we've found missing
1773 self.objects_to_send: set[
1774 tuple[ObjectID, Optional[bytes], Optional[int], bool]
1775 ] = {(w, None, Commit.type_num, False) for w in missing_commits}
1776 missing_tags = want_tags.difference(have_tags)
1777 self.objects_to_send.update(
1778 {(w, None, Tag.type_num, False) for w in missing_tags}
1779 )
1780 missing_others = want_others.difference(have_others)
1781 self.objects_to_send.update({(w, None, None, False) for w in missing_others})
1782
1783 if progress is None:
1784 self.progress = lambda x: None
1785 else:
1786 self.progress = progress
1787 self._tagged = (get_tagged and get_tagged()) or {}
1788
1789 def get_remote_has(self):
1790 return self.remote_has
1791
1792 def add_todo(
1793 self, entries: Iterable[tuple[ObjectID, Optional[bytes], Optional[int], bool]]
1794 ) -> None:
1795 self.objects_to_send.update([e for e in entries if e[0] not in self.sha_done])
1796
1797 def __next__(self) -> tuple[bytes, Optional[PackHint]]:
1798 while True:
1799 if not self.objects_to_send:
1800 self.progress(
1801 f"counting objects: {len(self.sha_done)}, done.\n".encode("ascii")
1802 )
1803 raise StopIteration
1804 (sha, name, type_num, leaf) = self.objects_to_send.pop()
1805 if sha not in self.sha_done:
1806 break
1807 if not leaf:
1808 o = self.object_store[sha]
1809 if isinstance(o, Commit):
1810 self.add_todo([(o.tree, b"", Tree.type_num, False)])
1811 elif isinstance(o, Tree):
1812 self.add_todo(
1813 [
1814 (
1815 s,
1816 n,
1817 (Blob.type_num if stat.S_ISREG(m) else Tree.type_num),
1818 not stat.S_ISDIR(m),
1819 )
1820 for n, m, s in o.iteritems()
1821 if not S_ISGITLINK(m)
1822 ]
1823 )
1824 elif isinstance(o, Tag):
1825 self.add_todo([(o.object[1], None, o.object[0].type_num, False)])
1826 if sha in self._tagged:
1827 self.add_todo([(self._tagged[sha], None, None, True)])
1828 self.sha_done.add(sha)
1829 if len(self.sha_done) % 1000 == 0:
1830 self.progress(f"counting objects: {len(self.sha_done)}\r".encode("ascii"))
1831 if type_num is None:
1832 pack_hint = None
1833 else:
1834 pack_hint = (type_num, name)
1835 return (sha, pack_hint)
1836
1837 def __iter__(self):
1838 return self
1839
1840
1841class ObjectStoreGraphWalker:
1842 """Graph walker that finds what commits are missing from an object store."""
1843
1844 heads: set[ObjectID]
1845 """Revisions without descendants in the local repo."""
1846
    get_parents: Callable[[ObjectID], list[ObjectID]]
1848 """Function to retrieve parents in the local repo."""
1849
1850 shallow: set[ObjectID]
1851
1852 def __init__(
1853 self,
1854 local_heads: Iterable[ObjectID],
1855 get_parents,
1856 shallow: Optional[set[ObjectID]] = None,
1857 update_shallow=None,
1858 ) -> None:
1859 """Create a new instance.
1860
        Args:
          local_heads: Heads to start search with
          get_parents: Function for finding the parents of a SHA1.
          shallow: Optional set of shallow commit SHA1s
          update_shallow: Optional callback for updating the shallow set
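
        Example (illustrative sketch; ``store``, ``head`` and ``remote_has``
        stand for a local object store, a local head SHA1 and a predicate
        supplied by the caller)::

            walker = ObjectStoreGraphWalker([head], lambda sha: store[sha].parents)
            sha = next(walker)
            if sha is not None and remote_has(sha):
                walker.ack(sha)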
1864 """
1865 self.heads = set(local_heads)
1866 self.get_parents = get_parents
1867 self.parents: dict[ObjectID, Optional[list[ObjectID]]] = {}
1868 if shallow is None:
1869 shallow = set()
1870 self.shallow = shallow
1871 self.update_shallow = update_shallow
1872
1873 def nak(self) -> None:
1874 """Nothing in common was found."""
1875
1876 def ack(self, sha: ObjectID) -> None:
1877 """Ack that a revision and its ancestors are present in the source."""
1878 if len(sha) != 40:
1879 raise ValueError(f"unexpected sha {sha!r} received")
1880 ancestors = {sha}
1881
1882 # stop if we run out of heads to remove
1883 while self.heads:
1884 for a in ancestors:
1885 if a in self.heads:
1886 self.heads.remove(a)
1887
1888 # collect all ancestors
1889 new_ancestors = set()
1890 for a in ancestors:
1891 ps = self.parents.get(a)
1892 if ps is not None:
1893 new_ancestors.update(ps)
1894 self.parents[a] = None
1895
1896 # no more ancestors; stop
1897 if not new_ancestors:
1898 break
1899
1900 ancestors = new_ancestors
1901
1902 def next(self):
1903 """Iterate over ancestors of heads in the target."""
1904 if self.heads:
1905 ret = self.heads.pop()
1906 try:
1907 ps = self.get_parents(ret)
1908 except KeyError:
1909 return None
1910 self.parents[ret] = ps
1911 self.heads.update([p for p in ps if p not in self.parents])
1912 return ret
1913 return None
1914
1915 __next__ = next
1916
1917
1918def commit_tree_changes(object_store, tree, changes):
1919 """Commit a specified set of changes to a tree structure.
1920
1921 This will apply a set of changes on top of an existing tree, storing new
1922 objects in object_store.
1923
    changes are a list of tuples with (path, mode, object_sha). Paths can
    be both blobs and trees. Setting the mode and object sha to None
    deletes the path.
1927
1928 This method works especially well if there are only a small
1929 number of changes to a big tree. For a large number of changes
1930 to a large tree, use e.g. commit_tree.
1931
1932 Args:
1933 object_store: Object store to store new objects in
1934 and retrieve old ones from.
1935 tree: Original tree root
1936 changes: changes to apply
1937 Returns: New tree root object
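
    Example (an illustrative sketch; assumes ``blob`` has already been added
    to object_store and that ``obsolete.txt`` exists in the tree):

      new_root = commit_tree_changes(
          object_store,
          object_store[tree_id],
          [
              (b"docs/README", 0o100644, blob.id),  # add or update a blob
              (b"obsolete.txt", None, None),  # delete a path
          ],
      )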
1938 """
1939 # TODO(jelmer): Save up the objects and add them using .add_objects
1940 # rather than with individual calls to .add_object.
1941 nested_changes = {}
1942 for path, new_mode, new_sha in changes:
1943 try:
1944 (dirname, subpath) = path.split(b"/", 1)
1945 except ValueError:
1946 if new_sha is None:
1947 del tree[path]
1948 else:
1949 tree[path] = (new_mode, new_sha)
1950 else:
1951 nested_changes.setdefault(dirname, []).append((subpath, new_mode, new_sha))
1952 for name, subchanges in nested_changes.items():
1953 try:
1954 orig_subtree = object_store[tree[name][1]]
1955 except KeyError:
1956 orig_subtree = Tree()
1957 subtree = commit_tree_changes(object_store, orig_subtree, subchanges)
1958 if len(subtree) == 0:
1959 del tree[name]
1960 else:
1961 tree[name] = (stat.S_IFDIR, subtree.id)
1962 object_store.add_object(tree)
1963 return tree
1964
1965
1966class OverlayObjectStore(BaseObjectStore):
1967 """Object store that can overlay multiple object stores."""
1968
1969 def __init__(self, bases, add_store=None) -> None:
1970 self.bases = bases
1971 self.add_store = add_store
1972
1973 def add_object(self, object):
1974 if self.add_store is None:
1975 raise NotImplementedError(self.add_object)
1976 return self.add_store.add_object(object)
1977
1978 def add_objects(self, objects, progress=None):
1979 if self.add_store is None:
            raise NotImplementedError(self.add_objects)
1981 return self.add_store.add_objects(objects, progress)
1982
1983 @property
1984 def packs(self):
1985 ret = []
1986 for b in self.bases:
1987 ret.extend(b.packs)
1988 return ret
1989
1990 def __iter__(self):
1991 done = set()
1992 for b in self.bases:
1993 for o_id in b:
1994 if o_id not in done:
1995 yield o_id
1996 done.add(o_id)
1997
1998 def iterobjects_subset(
1999 self, shas: Iterable[bytes], *, allow_missing: bool = False
2000 ) -> Iterator[ShaFile]:
2001 todo = set(shas)
2002 found: set[bytes] = set()
2003
2004 for b in self.bases:
2005 # Create a copy of todo for each base to avoid modifying
2006 # the set while iterating through it
2007 current_todo = todo - found
2008 for o in b.iterobjects_subset(current_todo, allow_missing=True):
2009 yield o
2010 found.add(o.id)
2011
2012 # Check for any remaining objects not found
2013 missing = todo - found
2014 if missing and not allow_missing:
2015 raise KeyError(next(iter(missing)))
2016
2017 def iter_unpacked_subset(
2018 self,
2019 shas: Iterable[bytes],
2020 *,
2021 include_comp=False,
2022 allow_missing: bool = False,
2023 convert_ofs_delta=True,
    ) -> Iterator[UnpackedObject]:
2025 todo = set(shas)
2026 for b in self.bases:
2027 for o in b.iter_unpacked_subset(
2028 todo,
2029 include_comp=include_comp,
2030 allow_missing=True,
2031 convert_ofs_delta=convert_ofs_delta,
2032 ):
2033 yield o
2034 todo.remove(o.id)
2035 if todo and not allow_missing:
            raise KeyError(next(iter(todo)))
2037
2038 def get_raw(self, sha_id):
2039 for b in self.bases:
2040 try:
2041 return b.get_raw(sha_id)
2042 except KeyError:
2043 pass
2044 raise KeyError(sha_id)
2045
2046 def contains_packed(self, sha) -> bool:
2047 for b in self.bases:
2048 if b.contains_packed(sha):
2049 return True
2050 return False
2051
2052 def contains_loose(self, sha) -> bool:
2053 for b in self.bases:
2054 if b.contains_loose(sha):
2055 return True
2056 return False
2057
2058
2059def read_packs_file(f):
2060 """Yield the packs listed in a packs file."""
2061 for line in f.read().splitlines():
2062 if not line:
2063 continue
2064 (kind, name) = line.split(b" ", 1)
2065 if kind != b"P":
2066 continue
2067 yield os.fsdecode(name)
2068
2069
2070class BucketBasedObjectStore(PackBasedObjectStore):
2071 """Object store implementation that uses a bucket store like S3 as backend."""
2072
2073 def _iter_loose_objects(self):
2074 """Iterate over the SHAs of all loose objects."""
2075 return iter([])
2076
2077 def _get_loose_object(self, sha) -> None:
2078 return None
2079
2080 def delete_loose_object(self, sha) -> None:
        # Loose objects are never stored here, so there is nothing to delete.
2082 pass
2083
2084 def _remove_pack(self, name) -> None:
2085 raise NotImplementedError(self._remove_pack)
2086
2087 def _iter_pack_names(self) -> Iterator[str]:
2088 raise NotImplementedError(self._iter_pack_names)
2089
2090 def _get_pack(self, name) -> Pack:
2091 raise NotImplementedError(self._get_pack)
2092
2093 def _update_pack_cache(self):
2094 pack_files = set(self._iter_pack_names())
2095
2096 # Open newly appeared pack files
2097 new_packs = []
2098 for f in pack_files:
2099 if f not in self._pack_cache:
2100 pack = self._get_pack(f)
2101 new_packs.append(pack)
2102 self._pack_cache[f] = pack
2103 # Remove disappeared pack files
2104 for f in set(self._pack_cache) - pack_files:
2105 self._pack_cache.pop(f).close()
2106 return new_packs
2107
2108 def _upload_pack(self, basename, pack_file, index_file) -> None:
2109 raise NotImplementedError
2110
2111 def add_pack(self):
2112 """Add a new pack to this object store.
2113
2114 Returns: Fileobject to write to, a commit function to
2115 call when the pack is finished and an abort
2116 function.
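
        Example (an illustrative sketch; ``store`` is an instance of a
        concrete subclass and ``pack_bytes`` is pack data obtained
        elsewhere, e.g. from a fetch):

          f, commit, abort = store.add_pack()
          try:
              f.write(pack_bytes)
          except BaseException:
              abort()
              raise
          else:
              commit()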
2117 """
2118 import tempfile
2119
2120 pf = tempfile.SpooledTemporaryFile(
2121 max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-"
2122 )
2123
2124 def commit():
2125 if pf.tell() == 0:
2126 pf.close()
2127 return None
2128
2129 pf.seek(0)
2130 p = PackData(pf.name, pf)
2131 entries = p.sorted_entries()
2132 basename = iter_sha1(entry[0] for entry in entries).decode("ascii")
2133 idxf = tempfile.SpooledTemporaryFile(
2134 max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-"
2135 )
2136 checksum = p.get_stored_checksum()
2137 write_pack_index(idxf, entries, checksum, version=self.pack_index_version)
2138 idxf.seek(0)
2139 idx = load_pack_index_file(basename + ".idx", idxf)
2140 for pack in self.packs:
2141 if pack.get_stored_checksum() == p.get_stored_checksum():
2142 p.close()
2143 idx.close()
2144 pf.close()
2145 idxf.close()
2146 return pack
2147 pf.seek(0)
2148 idxf.seek(0)
2149 self._upload_pack(basename, pf, idxf)
2150 final_pack = Pack.from_objects(p, idx)
2151 self._add_cached_pack(basename, final_pack)
2152 pf.close()
2153 idxf.close()
2154 return final_pack
2155
2156 return pf, commit, pf.close
2157
2158
2159def _collect_ancestors(
2160 store: ObjectContainer,
2161 heads,
2162 common: frozenset[ObjectID] = frozenset(),
2163 shallow: frozenset[ObjectID] = frozenset(),
2164 get_parents=lambda commit: commit.parents,
2165):
2166 """Collect all ancestors of heads up to (excluding) those in common.
2167
    Args:
      store: Object store to retrieve commits from.
      heads: commits to start from
      common: commits to end at, or an empty set to walk the repository
        completely
      shallow: set of shallow commits; traversal does not continue past
        these
      get_parents: Optional function for getting the parents of a
        commit.
    Returns: A tuple (commits, bases), where commits is the set of all
      commits reachable from heads but not present in common, and bases is
      the set of common (shared) commits that are directly reachable from
      heads.
2177 """
2178 bases = set()
2179 commits = set()
2180 queue = []
2181 queue.extend(heads)
2182
2183 # Try to use commit graph if available
2184 commit_graph = store.get_commit_graph()
2185
2186 while queue:
2187 e = queue.pop(0)
2188 if e in common:
2189 bases.add(e)
2190 elif e not in commits:
2191 commits.add(e)
2192 if e in shallow:
2193 continue
2194
2195 # Try to use commit graph for parent lookup
2196 parents = None
2197 if commit_graph:
2198 parents = commit_graph.get_parents(e)
2199
2200 if parents is None:
2201 # Fall back to loading the object
2202 cmt = store[e]
2203 parents = get_parents(cmt)
2204
2205 queue.extend(parents)
2206 return (commits, bases)
2207
2208
2209def iter_tree_contents(
2210 store: ObjectContainer, tree_id: Optional[ObjectID], *, include_trees: bool = False
2211):
2212 """Iterate the contents of a tree and all subtrees.
2213
2214 Iteration is depth-first pre-order, as in e.g. os.walk.
2215
    Args:
      store: Object store to retrieve tree objects from.
      tree_id: SHA1 of the tree, or None to yield nothing.
      include_trees: If True, include tree objects in the iteration.
2219 Returns: Iterator over TreeEntry namedtuples for all the objects in a
2220 tree.
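
    Example (an illustrative sketch; ``commit`` is a Commit previously
    loaded from the same store):

      for entry in iter_tree_contents(store, commit.tree):
          print(entry.path, oct(entry.mode), entry.sha)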
2221 """
2222 if tree_id is None:
2223 return
2224 # This could be fairly easily generalized to >2 trees if we find a use
2225 # case.
2226 todo = [TreeEntry(b"", stat.S_IFDIR, tree_id)]
2227 while todo:
2228 entry = todo.pop()
2229 if stat.S_ISDIR(entry.mode):
2230 extra = []
2231 tree = store[entry.sha]
2232 assert isinstance(tree, Tree)
2233 for subentry in tree.iteritems(name_order=True):
2234 extra.append(subentry.in_path(entry.path))
2235 todo.extend(reversed(extra))
2236 if not stat.S_ISDIR(entry.mode) or include_trees:
2237 yield entry
2238
2239
2240def peel_sha(store: ObjectContainer, sha: bytes) -> tuple[ShaFile, ShaFile]:
2241 """Peel all tags from a SHA.
2242
    Args:
      store: Object store to retrieve objects from.
      sha: The object SHA to peel.
    Returns: A tuple of (unpeeled, peeled): the object named by sha and the
      object reached after peeling all intermediate tags. If sha does not
      point to a tag, both elements are the same object.
2248 """
2249 unpeeled = obj = store[sha]
2250 obj_class = object_class(obj.type_name)
2251 while obj_class is Tag:
2252 assert isinstance(obj, Tag)
2253 obj_class, sha = obj.object
2254 obj = store[sha]
2255 return unpeeled, obj