1# object_store.py -- Object store for git objects
2# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
3# and others
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
22
23
24"""Git object store interfaces and implementation."""
25
26import binascii
27import os
28import stat
29import sys
30import time
31import warnings
32from collections.abc import Iterable, Iterator, Sequence
33from contextlib import suppress
34from io import BytesIO
35from typing import (
36 Callable,
37 Optional,
38 Protocol,
39 Union,
40 cast,
41)
42
43from .errors import NotTreeError
44from .file import GitFile
45from .objects import (
46 S_ISGITLINK,
47 ZERO_SHA,
48 Blob,
49 Commit,
50 ObjectID,
51 ShaFile,
52 Tag,
53 Tree,
54 TreeEntry,
55 hex_to_filename,
56 hex_to_sha,
57 object_class,
58 sha_to_hex,
59 valid_hexsha,
60)
61from .pack import (
62 PACK_SPOOL_FILE_MAX_SIZE,
63 ObjectContainer,
64 Pack,
65 PackData,
66 PackedObjectContainer,
67 PackFileDisappeared,
68 PackHint,
69 PackIndexer,
70 PackInflater,
71 PackStreamCopier,
72 UnpackedObject,
73 extend_pack,
74 full_unpacked_object,
75 generate_unpacked_objects,
76 iter_sha1,
77 load_pack_index_file,
78 pack_objects_to_data,
79 write_pack_data,
80 write_pack_index,
81)
82from .protocol import DEPTH_INFINITE
83from .refs import PEELED_TAG_SUFFIX, Ref
84
85INFODIR = "info"
86PACKDIR = "pack"
87
88# use permissions consistent with Git; just readable by everyone
89# TODO: should packs also be non-writable on Windows? if so, that
# would require some rather significant adjustments to the test suite
91PACK_MODE = 0o444 if sys.platform != "win32" else 0o644
92
93# Grace period for cleaning up temporary pack files (in seconds)
94# Matches git's default of 2 weeks
95DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60 # 2 weeks
96
97
98def find_shallow(store, heads, depth):
99 """Find shallow commits according to a given depth.
100
101 Args:
102 store: An ObjectStore for looking up objects.
103 heads: Iterable of head SHAs to start walking from.
104 depth: The depth of ancestors to include. A depth of one includes
105 only the heads themselves.
106 Returns: A tuple of (shallow, not_shallow), sets of SHAs that should be
107 considered shallow and unshallow according to the arguments. Note that
108 these sets may overlap if a commit is reachable along multiple paths.
109 """
110 parents = {}
111
112 def get_parents(sha):
113 result = parents.get(sha, None)
114 if not result:
115 result = store[sha].parents
116 parents[sha] = result
117 return result
118
119 todo = [] # stack of (sha, depth)
120 for head_sha in heads:
121 obj = store[head_sha]
122 # Peel tags if necessary
123 while isinstance(obj, Tag):
124 _, sha = obj.object
125 obj = store[sha]
126 if isinstance(obj, Commit):
127 todo.append((obj.id, 1))
128
129 not_shallow = set()
130 shallow = set()
131 while todo:
132 sha, cur_depth = todo.pop()
133 if cur_depth < depth:
134 not_shallow.add(sha)
135 new_depth = cur_depth + 1
136 todo.extend((p, new_depth) for p in get_parents(sha))
137 else:
138 shallow.add(sha)
139
140 return shallow, not_shallow
141
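# Example (illustrative sketch; ``store`` and ``head`` are hypothetical
# placeholders, not defined in this module):
#
#     shallow, not_shallow = find_shallow(store, [head], depth=2)
#     # ``not_shallow`` holds commits whose history is fully included,
#     # ``shallow`` holds the commits that become the shallow boundary.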
142
143def get_depth(
144 store,
145 head,
146 get_parents=lambda commit: commit.parents,
147 max_depth=None,
148):
    """Return the current available depth for the given head.

    For commits with multiple parents, the largest possible depth will be
    returned.

    Args:
      store: Object store to search for commits in.
      head: commit to start from
      get_parents: optional function for getting the parents of a commit
      max_depth: maximum depth to search
    Returns: Current available depth for head (0 if head is not in the store).
    """
158 if head not in store:
159 return 0
160 current_depth = 1
161 queue = [(head, current_depth)]
162 while queue and (max_depth is None or current_depth < max_depth):
163 e, depth = queue.pop(0)
164 current_depth = max(current_depth, depth)
165 cmt = store[e]
166 if isinstance(cmt, Tag):
167 _cls, sha = cmt.object
168 cmt = store[sha]
169 queue.extend(
170 (parent, depth + 1) for parent in get_parents(cmt) if parent in store
171 )
172 return current_depth
173
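# Example (illustrative sketch; ``store`` and ``head`` are hypothetical):
#
#     depth = get_depth(store, head, max_depth=50)
#     # ``depth`` is the length of the longest known parent chain, counting
#     # ``head`` itself, capped at ``max_depth``.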
174
175class PackContainer(Protocol):
176 def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]:
177 """Add a new pack."""
178
179
180class BaseObjectStore:
181 """Object store interface."""
182
183 def determine_wants_all(
184 self, refs: dict[Ref, ObjectID], depth: Optional[int] = None
185 ) -> list[ObjectID]:
186 def _want_deepen(sha):
187 if not depth:
188 return False
189 if depth == DEPTH_INFINITE:
190 return True
191 return depth > self._get_depth(sha)
192
193 return [
194 sha
195 for (ref, sha) in refs.items()
196 if (sha not in self or _want_deepen(sha))
197 and not ref.endswith(PEELED_TAG_SUFFIX)
198 and not sha == ZERO_SHA
199 ]
200
201 def contains_loose(self, sha) -> bool:
202 """Check if a particular object is present by SHA1 and is loose."""
203 raise NotImplementedError(self.contains_loose)
204
205 def __contains__(self, sha1: bytes) -> bool:
206 """Check if a particular object is present by SHA1.
207
208 This method makes no distinction between loose and packed objects.
209 """
210 return self.contains_loose(sha1)
211
212 @property
213 def packs(self):
214 """Iterable of pack objects."""
215 raise NotImplementedError
216
217 def get_raw(self, name) -> tuple[int, bytes]:
218 """Obtain the raw text for an object.
219
220 Args:
221 name: sha for the object.
222 Returns: tuple with numeric type and object contents.
223 """
224 raise NotImplementedError(self.get_raw)
225
226 def __getitem__(self, sha1: ObjectID) -> ShaFile:
227 """Obtain an object by SHA1."""
228 type_num, uncomp = self.get_raw(sha1)
229 return ShaFile.from_raw_string(type_num, uncomp, sha=sha1)
230
231 def __iter__(self):
232 """Iterate over the SHAs that are present in this store."""
233 raise NotImplementedError(self.__iter__)
234
235 def add_object(self, obj) -> None:
236 """Add a single object to this object store."""
237 raise NotImplementedError(self.add_object)
238
239 def add_objects(self, objects, progress=None) -> None:
240 """Add a set of objects to this object store.
241
242 Args:
243 objects: Iterable over a list of (object, path) tuples
244 """
245 raise NotImplementedError(self.add_objects)
246
247 def tree_changes(
248 self,
249 source,
250 target,
251 want_unchanged=False,
252 include_trees=False,
253 change_type_same=False,
254 rename_detector=None,
255 ):
256 """Find the differences between the contents of two trees.
257
258 Args:
259 source: SHA1 of the source tree
260 target: SHA1 of the target tree
          want_unchanged: Whether unchanged files should be reported
          include_trees: Whether to include trees
          change_type_same: Whether to report files changing
            type in the same entry.
          rename_detector: RenameDetector object for detecting renames.
265 Returns: Iterator over tuples with
266 (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
267 """
268 from .diff_tree import tree_changes
269
270 for change in tree_changes(
271 self,
272 source,
273 target,
274 want_unchanged=want_unchanged,
275 include_trees=include_trees,
276 change_type_same=change_type_same,
277 rename_detector=rename_detector,
278 ):
279 yield (
280 (change.old.path, change.new.path),
281 (change.old.mode, change.new.mode),
282 (change.old.sha, change.new.sha),
283 )
284
285 def iter_tree_contents(self, tree_id, include_trees=False):
286 """Iterate the contents of a tree and all subtrees.
287
288 Iteration is depth-first pre-order, as in e.g. os.walk.
289
290 Args:
291 tree_id: SHA1 of the tree.
292 include_trees: If True, include tree objects in the iteration.
293 Returns: Iterator over TreeEntry namedtuples for all the objects in a
294 tree.
295 """
296 warnings.warn(
297 "Please use dulwich.object_store.iter_tree_contents",
298 DeprecationWarning,
299 stacklevel=2,
300 )
301 return iter_tree_contents(self, tree_id, include_trees=include_trees)
302
303 def iterobjects_subset(
304 self, shas: Iterable[bytes], *, allow_missing: bool = False
305 ) -> Iterator[ShaFile]:
306 for sha in shas:
307 try:
308 yield self[sha]
309 except KeyError:
310 if not allow_missing:
311 raise
312
313 def find_missing_objects(
314 self,
315 haves,
316 wants,
317 shallow=None,
318 progress=None,
319 get_tagged=None,
320 get_parents=lambda commit: commit.parents,
321 ):
322 """Find the missing objects required for a set of revisions.
323
324 Args:
325 haves: Iterable over SHAs already in common.
326 wants: Iterable over SHAs of objects to fetch.
327 shallow: Set of shallow commit SHA1s to skip
328 progress: Simple progress function that will be called with
329 updated progress strings.
330 get_tagged: Function that returns a dict of pointed-to sha ->
331 tag sha for including tags.
332 get_parents: Optional function for getting the parents of a
333 commit.
334 Returns: Iterator over (sha, path) pairs.
335 """
336 warnings.warn("Please use MissingObjectFinder(store)", DeprecationWarning)
337 finder = MissingObjectFinder(
338 self,
339 haves=haves,
340 wants=wants,
341 shallow=shallow,
342 progress=progress,
343 get_tagged=get_tagged,
344 get_parents=get_parents,
345 )
346 return iter(finder)
347
348 def find_common_revisions(self, graphwalker):
349 """Find which revisions this store has in common using graphwalker.
350
351 Args:
352 graphwalker: A graphwalker object.
353 Returns: List of SHAs that are in common
354 """
355 haves = []
356 sha = next(graphwalker)
357 while sha:
358 if sha in self:
359 haves.append(sha)
360 graphwalker.ack(sha)
361 sha = next(graphwalker)
362 return haves
363
364 def generate_pack_data(
365 self, have, want, shallow=None, progress=None, ofs_delta=True
366 ) -> tuple[int, Iterator[UnpackedObject]]:
367 """Generate pack data objects for a set of wants/haves.
368
369 Args:
370 have: List of SHA1s of objects that should not be sent
371 want: List of SHA1s of objects that should be sent
372 shallow: Set of shallow commit SHA1s to skip
373 ofs_delta: Whether OFS deltas can be included
374 progress: Optional progress reporting method
375 """
376 # Note that the pack-specific implementation below is more efficient,
377 # as it reuses deltas
378 missing_objects = MissingObjectFinder(
379 self, haves=have, wants=want, shallow=shallow, progress=progress
380 )
381 object_ids = list(missing_objects)
382 return pack_objects_to_data(
383 [(self[oid], path) for oid, path in object_ids],
384 ofs_delta=ofs_delta,
385 progress=progress,
386 )
387
388 def peel_sha(self, sha):
389 """Peel all tags from a SHA.
390
391 Args:
392 sha: The object SHA to peel.
393 Returns: The fully-peeled SHA1 of a tag object, after peeling all
394 intermediate tags; if the original ref does not point to a tag,
395 this will equal the original SHA1.
396 """
397 warnings.warn(
398 "Please use dulwich.object_store.peel_sha()",
399 DeprecationWarning,
400 stacklevel=2,
401 )
402 return peel_sha(self, sha)[1]
403
404 def _get_depth(
405 self,
406 head,
407 get_parents=lambda commit: commit.parents,
408 max_depth=None,
409 ):
        """Return the current available depth for the given head.

        For commits with multiple parents, the largest possible depth will be
        returned.
413
414 Args:
415 head: commit to start from
416 get_parents: optional function for getting the parents of a commit
417 max_depth: maximum depth to search
418 """
419 return get_depth(self, head, get_parents=get_parents, max_depth=max_depth)
420
421 def close(self) -> None:
422 """Close any files opened by this object store."""
423 # Default implementation is a NO-OP
424
425 def prune(self, grace_period: Optional[int] = None) -> None:
426 """Prune/clean up this object store.
427
428 This includes removing orphaned temporary files and other
429 housekeeping tasks. Default implementation is a NO-OP.
430
431 Args:
432 grace_period: Grace period in seconds for removing temporary files.
433 If None, uses the default grace period.
434 """
435 # Default implementation is a NO-OP
436
437 def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
438 """Iterate over all SHA1s that start with a given prefix.
439
440 The default implementation is a naive iteration over all objects.
441 However, subclasses may override this method with more efficient
442 implementations.
443 """
444 for sha in self:
445 if sha.startswith(prefix):
446 yield sha
447
448 def get_commit_graph(self):
449 """Get the commit graph for this object store.
450
451 Returns:
452 CommitGraph object if available, None otherwise
453 """
454 return None
455
456 def write_commit_graph(self, refs=None, reachable=True) -> None:
457 """Write a commit graph file for this object store.
458
459 Args:
460 refs: List of refs to include. If None, includes all refs from object store.
461 reachable: If True, includes all commits reachable from refs.
462 If False, only includes the direct ref targets.
463
        Note:
            The default implementation raises NotImplementedError. Subclasses
            should override this method to provide commit graph writing
            functionality.
467 """
468 raise NotImplementedError(self.write_commit_graph)
469
470 def get_object_mtime(self, sha):
471 """Get the modification time of an object.
472
473 Args:
474 sha: SHA1 of the object
475
476 Returns:
477 Modification time as seconds since epoch
478
479 Raises:
480 KeyError: if the object is not found
481 """
482 # Default implementation raises KeyError
483 # Subclasses should override to provide actual mtime
484 raise KeyError(sha)
485
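# Example of the read interface shared by all stores (illustrative sketch;
# ``store``, ``sha``, ``old_tree`` and ``new_tree`` are hypothetical):
#
#     if sha in store:
#         obj = store[sha]                    # ShaFile instance
#         type_num, raw = store.get_raw(sha)
#     for paths, modes, shas in store.tree_changes(old_tree, new_tree):
#         pass  # ((oldpath, newpath), (oldmode, newmode), (oldsha, newsha))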
486
487class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
488 def __init__(self, pack_compression_level=-1, pack_index_version=None) -> None:
489 self._pack_cache: dict[str, Pack] = {}
490 self.pack_compression_level = pack_compression_level
491 self.pack_index_version = pack_index_version
492
493 def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]:
494 """Add a new pack to this object store."""
495 raise NotImplementedError(self.add_pack)
496
497 def add_pack_data(
498 self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None
499 ) -> None:
500 """Add pack data to this object store.
501
        Args:
          count: Number of objects to add
          unpacked_objects: Iterator over UnpackedObject instances
          progress: Optional progress reporting function
        """
505 if count == 0:
506 # Don't bother writing an empty pack file
507 return
508 f, commit, abort = self.add_pack()
509 try:
510 write_pack_data(
511 f.write,
512 unpacked_objects,
513 num_records=count,
514 progress=progress,
515 compression_level=self.pack_compression_level,
516 )
517 except BaseException:
518 abort()
519 raise
520 else:
521 return commit()
522
523 @property
524 def alternates(self):
525 return []
526
527 def contains_packed(self, sha) -> bool:
528 """Check if a particular object is present by SHA1 and is packed.
529
530 This does not check alternates.
531 """
532 for pack in self.packs:
533 try:
534 if sha in pack:
535 return True
536 except PackFileDisappeared:
537 pass
538 return False
539
540 def __contains__(self, sha) -> bool:
541 """Check if a particular object is present by SHA1.
542
543 This method makes no distinction between loose and packed objects.
544 """
545 if self.contains_packed(sha) or self.contains_loose(sha):
546 return True
547 for alternate in self.alternates:
548 if sha in alternate:
549 return True
550 return False
551
552 def _add_cached_pack(self, base_name, pack) -> None:
553 """Add a newly appeared pack to the cache by path."""
554 prev_pack = self._pack_cache.get(base_name)
555 if prev_pack is not pack:
556 self._pack_cache[base_name] = pack
557 if prev_pack:
558 prev_pack.close()
559
560 def generate_pack_data(
561 self, have, want, shallow=None, progress=None, ofs_delta=True
562 ) -> tuple[int, Iterator[UnpackedObject]]:
563 """Generate pack data objects for a set of wants/haves.
564
565 Args:
566 have: List of SHA1s of objects that should not be sent
567 want: List of SHA1s of objects that should be sent
568 shallow: Set of shallow commit SHA1s to skip
569 ofs_delta: Whether OFS deltas can be included
570 progress: Optional progress reporting method
571 """
572 missing_objects = MissingObjectFinder(
573 self, haves=have, wants=want, shallow=shallow, progress=progress
574 )
575 remote_has = missing_objects.get_remote_has()
576 object_ids = list(missing_objects)
577 return len(object_ids), generate_unpacked_objects(
578 cast(PackedObjectContainer, self),
579 object_ids,
580 progress=progress,
581 ofs_delta=ofs_delta,
582 other_haves=remote_has,
583 )
584
585 def _clear_cached_packs(self) -> None:
586 pack_cache = self._pack_cache
587 self._pack_cache = {}
588 while pack_cache:
589 (name, pack) = pack_cache.popitem()
590 pack.close()
591
592 def _iter_cached_packs(self):
593 return self._pack_cache.values()
594
595 def _update_pack_cache(self) -> list[Pack]:
596 raise NotImplementedError(self._update_pack_cache)
597
598 def close(self) -> None:
599 self._clear_cached_packs()
600
601 @property
602 def packs(self):
603 """List with pack objects."""
604 return list(self._iter_cached_packs()) + list(self._update_pack_cache())
605
606 def count_pack_files(self) -> int:
607 """Count the number of pack files.
608
609 Returns:
610 Number of pack files (excluding those with .keep files)
611 """
612 count = 0
613 for pack in self.packs:
614 # Check if there's a .keep file for this pack
615 keep_path = pack._basename + ".keep"
616 if not os.path.exists(keep_path):
617 count += 1
618 return count
619
620 def _iter_alternate_objects(self):
621 """Iterate over the SHAs of all the objects in alternate stores."""
622 for alternate in self.alternates:
623 yield from alternate
624
625 def _iter_loose_objects(self):
626 """Iterate over the SHAs of all loose objects."""
627 raise NotImplementedError(self._iter_loose_objects)
628
629 def _get_loose_object(self, sha) -> Optional[ShaFile]:
630 raise NotImplementedError(self._get_loose_object)
631
632 def delete_loose_object(self, sha) -> None:
633 """Delete a loose object.
634
635 This method only handles loose objects. For packed objects,
636 use repack(exclude=...) to exclude them during repacking.
637 """
638 raise NotImplementedError(self.delete_loose_object)
639
640 def _remove_pack(self, name) -> None:
641 raise NotImplementedError(self._remove_pack)
642
643 def pack_loose_objects(self):
644 """Pack loose objects.
645
646 Returns: Number of objects packed
647 """
648 objects = set()
649 for sha in self._iter_loose_objects():
650 objects.add((self._get_loose_object(sha), None))
651 self.add_objects(list(objects))
652 for obj, path in objects:
653 self.delete_loose_object(obj.id)
654 return len(objects)
655
656 def repack(self, exclude=None):
657 """Repack the packs in this repository.
658
659 Note that this implementation is fairly naive and currently keeps all
660 objects in memory while it repacks.
661
662 Args:
663 exclude: Optional set of object SHAs to exclude from repacking
664 """
665 if exclude is None:
666 exclude = set()
667
668 loose_objects = set()
669 excluded_loose_objects = set()
670 for sha in self._iter_loose_objects():
671 if sha not in exclude:
672 loose_objects.add(self._get_loose_object(sha))
673 else:
674 excluded_loose_objects.add(sha)
675
676 objects = {(obj, None) for obj in loose_objects}
677 old_packs = {p.name(): p for p in self.packs}
678 for name, pack in old_packs.items():
679 objects.update(
680 (obj, None) for obj in pack.iterobjects() if obj.id not in exclude
681 )
682
683 # Only create a new pack if there are objects to pack
684 if objects:
685 # The name of the consolidated pack might match the name of a
686 # pre-existing pack. Take care not to remove the newly created
687 # consolidated pack.
688 consolidated = self.add_objects(objects)
689 old_packs.pop(consolidated.name(), None)
690
691 # Delete loose objects that were packed
692 for obj in loose_objects:
693 self.delete_loose_object(obj.id)
694 # Delete excluded loose objects
695 for sha in excluded_loose_objects:
696 self.delete_loose_object(sha)
697 for name, pack in old_packs.items():
698 self._remove_pack(pack)
699 self._update_pack_cache()
700 return len(objects)
701
702 def __iter__(self):
703 """Iterate over the SHAs that are present in this store."""
704 self._update_pack_cache()
705 for pack in self._iter_cached_packs():
706 try:
707 yield from pack
708 except PackFileDisappeared:
709 pass
710 yield from self._iter_loose_objects()
711 yield from self._iter_alternate_objects()
712
713 def contains_loose(self, sha):
714 """Check if a particular object is present by SHA1 and is loose.
715
716 This does not check alternates.
717 """
718 return self._get_loose_object(sha) is not None
719
720 def get_raw(self, name):
721 """Obtain the raw fulltext for an object.
722
723 Args:
724 name: sha for the object.
725 Returns: tuple with numeric type and object contents.
726 """
727 if name == ZERO_SHA:
728 raise KeyError(name)
729 if len(name) == 40:
730 sha = hex_to_sha(name)
731 hexsha = name
732 elif len(name) == 20:
733 sha = name
734 hexsha = None
735 else:
736 raise AssertionError(f"Invalid object name {name!r}")
737 for pack in self._iter_cached_packs():
738 try:
739 return pack.get_raw(sha)
740 except (KeyError, PackFileDisappeared):
741 pass
742 if hexsha is None:
743 hexsha = sha_to_hex(name)
744 ret = self._get_loose_object(hexsha)
745 if ret is not None:
746 return ret.type_num, ret.as_raw_string()
747 # Maybe something else has added a pack with the object
        # in the meantime?
749 for pack in self._update_pack_cache():
750 try:
751 return pack.get_raw(sha)
752 except KeyError:
753 pass
754 for alternate in self.alternates:
755 try:
756 return alternate.get_raw(hexsha)
757 except KeyError:
758 pass
759 raise KeyError(hexsha)
760
761 def iter_unpacked_subset(
762 self,
763 shas: set[bytes],
764 include_comp: bool = False,
765 allow_missing: bool = False,
766 convert_ofs_delta: bool = True,
767 ) -> Iterator[UnpackedObject]:
768 todo: set[bytes] = set(shas)
769 for p in self._iter_cached_packs():
770 for unpacked in p.iter_unpacked_subset(
771 todo,
772 include_comp=include_comp,
773 allow_missing=True,
774 convert_ofs_delta=convert_ofs_delta,
775 ):
776 yield unpacked
777 hexsha = sha_to_hex(unpacked.sha())
778 todo.remove(hexsha)
779 # Maybe something else has added a pack with the object
        # in the meantime?
781 for p in self._update_pack_cache():
782 for unpacked in p.iter_unpacked_subset(
783 todo,
784 include_comp=include_comp,
785 allow_missing=True,
786 convert_ofs_delta=convert_ofs_delta,
787 ):
788 yield unpacked
789 hexsha = sha_to_hex(unpacked.sha())
790 todo.remove(hexsha)
791 for alternate in self.alternates:
792 for unpacked in alternate.iter_unpacked_subset(
793 todo,
794 include_comp=include_comp,
795 allow_missing=True,
796 convert_ofs_delta=convert_ofs_delta,
797 ):
798 yield unpacked
799 hexsha = sha_to_hex(unpacked.sha())
800 todo.remove(hexsha)
801
802 def iterobjects_subset(
803 self, shas: Iterable[bytes], *, allow_missing: bool = False
804 ) -> Iterator[ShaFile]:
805 todo: set[bytes] = set(shas)
806 for p in self._iter_cached_packs():
807 for o in p.iterobjects_subset(todo, allow_missing=True):
808 yield o
809 todo.remove(o.id)
810 # Maybe something else has added a pack with the object
        # in the meantime?
812 for p in self._update_pack_cache():
813 for o in p.iterobjects_subset(todo, allow_missing=True):
814 yield o
815 todo.remove(o.id)
816 for alternate in self.alternates:
817 for o in alternate.iterobjects_subset(todo, allow_missing=True):
818 yield o
819 todo.remove(o.id)
820 for oid in todo:
821 o = self._get_loose_object(oid)
822 if o is not None:
823 yield o
824 elif not allow_missing:
825 raise KeyError(oid)
826
827 def get_unpacked_object(
828 self, sha1: bytes, *, include_comp: bool = False
829 ) -> UnpackedObject:
830 """Obtain the unpacked object.
831
832 Args:
833 sha1: sha for the object.
834 """
835 if sha1 == ZERO_SHA:
836 raise KeyError(sha1)
837 if len(sha1) == 40:
838 sha = hex_to_sha(sha1)
839 hexsha = sha1
840 elif len(sha1) == 20:
841 sha = sha1
842 hexsha = None
843 else:
844 raise AssertionError(f"Invalid object sha1 {sha1!r}")
845 for pack in self._iter_cached_packs():
846 try:
847 return pack.get_unpacked_object(sha, include_comp=include_comp)
848 except (KeyError, PackFileDisappeared):
849 pass
850 if hexsha is None:
851 hexsha = sha_to_hex(sha1)
852 # Maybe something else has added a pack with the object
        # in the meantime?
854 for pack in self._update_pack_cache():
855 try:
856 return pack.get_unpacked_object(sha, include_comp=include_comp)
857 except KeyError:
858 pass
859 for alternate in self.alternates:
860 try:
861 return alternate.get_unpacked_object(hexsha, include_comp=include_comp)
862 except KeyError:
863 pass
864 raise KeyError(hexsha)
865
866 def add_objects(
867 self,
868 objects: Sequence[tuple[ShaFile, Optional[str]]],
869 progress: Optional[Callable[[str], None]] = None,
870 ) -> None:
871 """Add a set of objects to this object store.
872
873 Args:
874 objects: Iterable over (object, path) tuples, should support
875 __len__.
876 Returns: Pack object of the objects written.
877 """
878 count = len(objects)
879 record_iter = (full_unpacked_object(o) for (o, p) in objects)
880 return self.add_pack_data(count, record_iter, progress=progress)
881
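# Example of the low-level pack-adding protocol used above (illustrative
# sketch; ``store`` and ``pack_bytes`` are hypothetical):
#
#     f, commit, abort = store.add_pack()
#     try:
#         f.write(pack_bytes)   # raw pack data, e.g. received from a client
#     except BaseException:
#         abort()
#         raise
#     else:
#         commit()              # indexes the pack and moves it into place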
882
883class DiskObjectStore(PackBasedObjectStore):
884 """Git-style object store that exists on disk."""
885
886 path: Union[str, os.PathLike]
887 pack_dir: Union[str, os.PathLike]
888
889 def __init__(
890 self,
891 path: Union[str, os.PathLike],
892 loose_compression_level=-1,
893 pack_compression_level=-1,
894 pack_index_version=None,
895 ) -> None:
896 """Open an object store.
897
898 Args:
899 path: Path of the object store.
900 loose_compression_level: zlib compression level for loose objects
901 pack_compression_level: zlib compression level for pack objects
902 pack_index_version: pack index version to use (1, 2, or 3)
903 """
904 super().__init__(
905 pack_compression_level=pack_compression_level,
906 pack_index_version=pack_index_version,
907 )
908 self.path = path
909 self.pack_dir = os.path.join(self.path, PACKDIR)
910 self._alternates = None
911 self.loose_compression_level = loose_compression_level
912 self.pack_compression_level = pack_compression_level
913 self.pack_index_version = pack_index_version
914
915 # Commit graph support - lazy loaded
916 self._commit_graph = None
917
918 def __repr__(self) -> str:
919 return f"<{self.__class__.__name__}({self.path!r})>"
920
921 @classmethod
922 def from_config(cls, path: Union[str, os.PathLike], config):
923 try:
924 default_compression_level = int(
925 config.get((b"core",), b"compression").decode()
926 )
927 except KeyError:
928 default_compression_level = -1
929 try:
930 loose_compression_level = int(
931 config.get((b"core",), b"looseCompression").decode()
932 )
933 except KeyError:
934 loose_compression_level = default_compression_level
935 try:
936 pack_compression_level = int(
                config.get((b"core",), b"packCompression").decode()
938 )
939 except KeyError:
940 pack_compression_level = default_compression_level
941 try:
942 pack_index_version = int(config.get((b"pack",), b"indexVersion").decode())
943 except KeyError:
944 pack_index_version = None
945 return cls(
946 path, loose_compression_level, pack_compression_level, pack_index_version
947 )
948
949 @property
950 def alternates(self):
951 if self._alternates is not None:
952 return self._alternates
953 self._alternates = []
954 for path in self._read_alternate_paths():
955 self._alternates.append(DiskObjectStore(path))
956 return self._alternates
957
958 def _read_alternate_paths(self):
959 try:
960 f = GitFile(os.path.join(self.path, INFODIR, "alternates"), "rb")
961 except FileNotFoundError:
962 return
963 with f:
964 for line in f.readlines():
965 line = line.rstrip(b"\n")
966 if line.startswith(b"#"):
967 continue
968 if os.path.isabs(line):
969 yield os.fsdecode(line)
970 else:
971 yield os.fsdecode(os.path.join(os.fsencode(self.path), line))
972
973 def add_alternate_path(self, path) -> None:
974 """Add an alternate path to this object store."""
975 try:
976 os.mkdir(os.path.join(self.path, INFODIR))
977 except FileExistsError:
978 pass
979 alternates_path = os.path.join(self.path, INFODIR, "alternates")
980 with GitFile(alternates_path, "wb") as f:
981 try:
982 orig_f = open(alternates_path, "rb")
983 except FileNotFoundError:
984 pass
985 else:
986 with orig_f:
987 f.write(orig_f.read())
988 f.write(os.fsencode(path) + b"\n")
989
990 if not os.path.isabs(path):
991 path = os.path.join(self.path, path)
992 self.alternates.append(DiskObjectStore(path))
993
994 def _update_pack_cache(self):
995 """Read and iterate over new pack files and cache them."""
996 try:
997 pack_dir_contents = os.listdir(self.pack_dir)
998 except FileNotFoundError:
999 self.close()
1000 return []
1001 pack_files = set()
1002 for name in pack_dir_contents:
1003 if name.startswith("pack-") and name.endswith(".pack"):
1004 # verify that idx exists first (otherwise the pack was not yet
1005 # fully written)
1006 idx_name = os.path.splitext(name)[0] + ".idx"
1007 if idx_name in pack_dir_contents:
1008 pack_name = name[: -len(".pack")]
1009 pack_files.add(pack_name)
1010
1011 # Open newly appeared pack files
1012 new_packs = []
1013 for f in pack_files:
1014 if f not in self._pack_cache:
1015 pack = Pack(os.path.join(self.pack_dir, f))
1016 new_packs.append(pack)
1017 self._pack_cache[f] = pack
1018 # Remove disappeared pack files
1019 for f in set(self._pack_cache) - pack_files:
1020 self._pack_cache.pop(f).close()
1021 return new_packs
1022
1023 def _get_shafile_path(self, sha):
1024 # Check from object dir
1025 return hex_to_filename(self.path, sha)
1026
1027 def _iter_loose_objects(self):
1028 for base in os.listdir(self.path):
1029 if len(base) != 2:
1030 continue
1031 for rest in os.listdir(os.path.join(self.path, base)):
1032 sha = os.fsencode(base + rest)
1033 if not valid_hexsha(sha):
1034 continue
1035 yield sha
1036
1037 def count_loose_objects(self) -> int:
1038 """Count the number of loose objects in the object store.
1039
1040 Returns:
1041 Number of loose objects
1042 """
1043 count = 0
1044 if not os.path.exists(self.path):
1045 return 0
1046
1047 for i in range(256):
1048 subdir = os.path.join(self.path, f"{i:02x}")
1049 try:
1050 count += len(
1051 [
1052 name
1053 for name in os.listdir(subdir)
1054 if len(name) == 38 # 40 - 2 for the prefix
1055 ]
1056 )
1057 except FileNotFoundError:
1058 # Directory may have been removed or is inaccessible
1059 continue
1060
1061 return count
1062
1063 def _get_loose_object(self, sha):
1064 path = self._get_shafile_path(sha)
1065 try:
1066 return ShaFile.from_path(path)
1067 except FileNotFoundError:
1068 return None
1069
1070 def delete_loose_object(self, sha) -> None:
1071 os.remove(self._get_shafile_path(sha))
1072
1073 def get_object_mtime(self, sha):
1074 """Get the modification time of an object.
1075
1076 Args:
1077 sha: SHA1 of the object
1078
1079 Returns:
1080 Modification time as seconds since epoch
1081
1082 Raises:
1083 KeyError: if the object is not found
1084 """
1085 # First check if it's a loose object
1086 if self.contains_loose(sha):
1087 path = self._get_shafile_path(sha)
1088 try:
1089 return os.path.getmtime(path)
1090 except FileNotFoundError:
1091 pass
1092
1093 # Check if it's in a pack file
1094 for pack in self.packs:
1095 try:
1096 if sha in pack:
1097 # Use the pack file's mtime for packed objects
1098 pack_path = pack._data_path
1099 try:
1100 return os.path.getmtime(pack_path)
1101 except (FileNotFoundError, AttributeError):
1102 pass
1103 except PackFileDisappeared:
1104 pass
1105
1106 raise KeyError(sha)
1107
1108 def _remove_pack(self, pack) -> None:
1109 try:
1110 del self._pack_cache[os.path.basename(pack._basename)]
1111 except KeyError:
1112 pass
1113 pack.close()
1114 os.remove(pack.data.path)
1115 os.remove(pack.index.path)
1116
1117 def _get_pack_basepath(self, entries):
1118 suffix = iter_sha1(entry[0] for entry in entries)
1119 # TODO: Handle self.pack_dir being bytes
1120 suffix = suffix.decode("ascii")
1121 return os.path.join(self.pack_dir, "pack-" + suffix)
1122
1123 def _complete_pack(self, f, path, num_objects, indexer, progress=None):
1124 """Move a specific file containing a pack into the pack directory.
1125
1126 Note: The file should be on the same file system as the
1127 packs directory.
1128
        Args:
          f: Open file object for the pack.
          path: Path to the pack file.
          num_objects: Number of objects in the pack.
          indexer: A PackIndexer for indexing the pack.
          progress: Optional progress reporting function.
        """
1134 entries = []
1135 for i, entry in enumerate(indexer):
1136 if progress is not None:
1137 progress(f"generating index: {i}/{num_objects}\r".encode("ascii"))
1138 entries.append(entry)
1139
1140 pack_sha, extra_entries = extend_pack(
1141 f,
1142 indexer.ext_refs(),
1143 get_raw=self.get_raw,
1144 compression_level=self.pack_compression_level,
1145 progress=progress,
1146 )
1147 f.flush()
1148 try:
1149 fileno = f.fileno()
1150 except AttributeError:
1151 pass
1152 else:
1153 os.fsync(fileno)
1154 f.close()
1155
1156 entries.extend(extra_entries)
1157
1158 # Move the pack in.
1159 entries.sort()
1160 pack_base_name = self._get_pack_basepath(entries)
1161
1162 for pack in self.packs:
1163 if pack._basename == pack_base_name:
1164 return pack
1165
1166 target_pack_path = pack_base_name + ".pack"
1167 target_index_path = pack_base_name + ".idx"
1168 if sys.platform == "win32":
1169 # Windows might have the target pack file lingering. Attempt
1170 # removal, silently passing if the target does not exist.
1171 with suppress(FileNotFoundError):
1172 os.remove(target_pack_path)
1173 os.rename(path, target_pack_path)
1174
1175 # Write the index.
1176 with GitFile(target_index_path, "wb", mask=PACK_MODE) as index_file:
1177 write_pack_index(
1178 index_file, entries, pack_sha, version=self.pack_index_version
1179 )
1180
1181 # Add the pack to the store and return it.
1182 final_pack = Pack(pack_base_name)
1183 final_pack.check_length_and_checksum()
1184 self._add_cached_pack(pack_base_name, final_pack)
1185 return final_pack
1186
1187 def add_thin_pack(self, read_all, read_some, progress=None):
1188 """Add a new thin pack to this object store.
1189
1190 Thin packs are packs that contain deltas with parents that exist
1191 outside the pack. They should never be placed in the object store
1192 directly, and always indexed and completed as they are copied.
1193
1194 Args:
1195 read_all: Read function that blocks until the number of
1196 requested bytes are read.
1197 read_some: Read function that returns at least one byte, but may
1198 not return the number of bytes requested.
1199 Returns: A Pack object pointing at the now-completed thin pack in the
1200 objects/pack directory.
1201 """
1202 import tempfile
1203
1204 fd, path = tempfile.mkstemp(dir=self.path, prefix="tmp_pack_")
1205 with os.fdopen(fd, "w+b") as f:
1206 os.chmod(path, PACK_MODE)
1207 indexer = PackIndexer(f, resolve_ext_ref=self.get_raw)
1208 copier = PackStreamCopier(read_all, read_some, f, delta_iter=indexer)
1209 copier.verify(progress=progress)
1210 return self._complete_pack(f, path, len(copier), indexer, progress=progress)
1211
1212 def add_pack(self):
1213 """Add a new pack to this object store.
1214
1215 Returns: Fileobject to write to, a commit function to
1216 call when the pack is finished and an abort
1217 function.
1218 """
1219 import tempfile
1220
1221 fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
1222 f = os.fdopen(fd, "w+b")
1223 os.chmod(path, PACK_MODE)
1224
1225 def commit():
1226 if f.tell() > 0:
1227 f.seek(0)
1228 with PackData(path, f) as pd:
1229 indexer = PackIndexer.for_pack_data(
1230 pd, resolve_ext_ref=self.get_raw
1231 )
1232 return self._complete_pack(f, path, len(pd), indexer)
1233 else:
1234 f.close()
1235 os.remove(path)
1236 return None
1237
1238 def abort() -> None:
1239 f.close()
1240 os.remove(path)
1241
1242 return f, commit, abort
1243
1244 def add_object(self, obj) -> None:
1245 """Add a single object to this object store.
1246
1247 Args:
1248 obj: Object to add
1249 """
1250 path = self._get_shafile_path(obj.id)
1251 dir = os.path.dirname(path)
1252 try:
1253 os.mkdir(dir)
1254 except FileExistsError:
1255 pass
1256 if os.path.exists(path):
1257 return # Already there, no need to write again
1258 with GitFile(path, "wb", mask=PACK_MODE) as f:
1259 f.write(
1260 obj.as_legacy_object(compression_level=self.loose_compression_level)
1261 )
1262
1263 @classmethod
1264 def init(cls, path: Union[str, os.PathLike]):
1265 try:
1266 os.mkdir(path)
1267 except FileExistsError:
1268 pass
1269 os.mkdir(os.path.join(path, "info"))
1270 os.mkdir(os.path.join(path, PACKDIR))
1271 return cls(path)
1272
1273 def iter_prefix(self, prefix):
1274 if len(prefix) < 2:
1275 yield from super().iter_prefix(prefix)
1276 return
1277 seen = set()
1278 dir = prefix[:2].decode()
1279 rest = prefix[2:].decode()
1280 try:
1281 for name in os.listdir(os.path.join(self.path, dir)):
1282 if name.startswith(rest):
1283 sha = os.fsencode(dir + name)
1284 if sha not in seen:
1285 seen.add(sha)
1286 yield sha
1287 except FileNotFoundError:
1288 pass
1289
1290 for p in self.packs:
1291 bin_prefix = (
1292 binascii.unhexlify(prefix)
1293 if len(prefix) % 2 == 0
1294 else binascii.unhexlify(prefix[:-1])
1295 )
1296 for sha in p.index.iter_prefix(bin_prefix):
1297 sha = sha_to_hex(sha)
1298 if sha.startswith(prefix) and sha not in seen:
1299 seen.add(sha)
1300 yield sha
1301 for alternate in self.alternates:
1302 for sha in alternate.iter_prefix(prefix):
1303 if sha not in seen:
1304 seen.add(sha)
1305 yield sha
1306
1307 def get_commit_graph(self):
1308 """Get the commit graph for this object store.
1309
1310 Returns:
1311 CommitGraph object if available, None otherwise
1312 """
1313 if self._commit_graph is None:
1314 from .commit_graph import read_commit_graph
1315
1316 # Look for commit graph in our objects directory
1317 graph_file = os.path.join(self.path, "info", "commit-graph")
1318 if os.path.exists(graph_file):
1319 self._commit_graph = read_commit_graph(graph_file)
1320 return self._commit_graph
1321
1322 def write_commit_graph(self, refs=None, reachable=True) -> None:
1323 """Write a commit graph file for this object store.
1324
1325 Args:
1326 refs: List of refs to include. If None, includes all refs from object store.
1327 reachable: If True, includes all commits reachable from refs.
1328 If False, only includes the direct ref targets.
1329 """
1330 from .commit_graph import get_reachable_commits
1331
1332 if refs is None:
1333 # Get all commit objects from the object store
1334 all_refs = []
1335 # Iterate through all objects to find commits
1336 for sha in self:
1337 try:
1338 obj = self[sha]
1339 if obj.type_name == b"commit":
1340 all_refs.append(sha)
1341 except KeyError:
1342 continue
1343 else:
1344 # Use provided refs
1345 all_refs = refs
1346
1347 if not all_refs:
1348 return # No commits to include
1349
1350 if reachable:
1351 # Get all reachable commits
1352 commit_ids = get_reachable_commits(self, all_refs)
1353 else:
1354 # Just use the direct ref targets - ensure they're hex ObjectIDs
1355 commit_ids = []
1356 for ref in all_refs:
1357 if isinstance(ref, bytes) and len(ref) == 40:
1358 # Already hex ObjectID
1359 commit_ids.append(ref)
1360 elif isinstance(ref, bytes) and len(ref) == 20:
1361 # Binary SHA, convert to hex ObjectID
1362 from .objects import sha_to_hex
1363
1364 commit_ids.append(sha_to_hex(ref))
1365 else:
1366 # Assume it's already correct format
1367 commit_ids.append(ref)
1368
1369 if commit_ids:
1370 # Write commit graph directly to our object store path
1371 # Generate the commit graph
1372 from .commit_graph import generate_commit_graph
1373
1374 graph = generate_commit_graph(self, commit_ids)
1375
1376 if graph.entries:
1377 # Ensure the info directory exists
1378 info_dir = os.path.join(self.path, "info")
1379 os.makedirs(info_dir, exist_ok=True)
1380
1381 # Write using GitFile for atomic operation
1382 graph_path = os.path.join(info_dir, "commit-graph")
1383 with GitFile(graph_path, "wb") as f:
1384 graph.write_to_file(f)
1385
1386 # Clear cached commit graph so it gets reloaded
1387 self._commit_graph = None
1388
1389 def prune(self, grace_period: Optional[int] = None) -> None:
1390 """Prune/clean up this object store.
1391
1392 This removes temporary files that were left behind by interrupted
1393 pack operations. These are files that start with ``tmp_pack_`` in the
1394 repository directory or files with .pack extension but no corresponding
1395 .idx file in the pack directory.
1396
1397 Args:
1398 grace_period: Grace period in seconds for removing temporary files.
1399 If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD.
1400 """
1401 import glob
1402
1403 if grace_period is None:
1404 grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
1405
1406 # Clean up tmp_pack_* files in the repository directory
1407 for tmp_file in glob.glob(os.path.join(self.path, "tmp_pack_*")):
1408 # Check if file is old enough (more than grace period)
1409 mtime = os.path.getmtime(tmp_file)
1410 if time.time() - mtime > grace_period:
1411 os.remove(tmp_file)
1412
1413 # Clean up orphaned .pack files without corresponding .idx files
1414 try:
1415 pack_dir_contents = os.listdir(self.pack_dir)
1416 except FileNotFoundError:
1417 return
1418
1419 pack_files = {}
1420 idx_files = set()
1421
1422 for name in pack_dir_contents:
1423 if name.endswith(".pack"):
1424 base_name = name[:-5] # Remove .pack extension
1425 pack_files[base_name] = name
1426 elif name.endswith(".idx"):
1427 base_name = name[:-4] # Remove .idx extension
1428 idx_files.add(base_name)
1429
1430 # Remove .pack files without corresponding .idx files
1431 for base_name, pack_name in pack_files.items():
1432 if base_name not in idx_files:
1433 pack_path = os.path.join(self.pack_dir, pack_name)
1434 # Check if file is old enough (more than grace period)
1435 mtime = os.path.getmtime(pack_path)
1436 if time.time() - mtime > grace_period:
1437 os.remove(pack_path)
1438
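# Example of a typical DiskObjectStore round trip (illustrative sketch; the
# path and blob contents are hypothetical):
#
#     store = DiskObjectStore.init("/tmp/example/objects")
#     blob = Blob.from_string(b"hello world")
#     store.add_object(blob)          # written as a loose object
#     assert store.contains_loose(blob.id)
#     store.pack_loose_objects()      # consolidate loose objects into a pack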
1439
1440class MemoryObjectStore(BaseObjectStore):
1441 """Object store that keeps all objects in memory."""
1442
1443 def __init__(self) -> None:
1444 super().__init__()
1445 self._data: dict[str, ShaFile] = {}
1446 self.pack_compression_level = -1
1447
1448 def _to_hexsha(self, sha):
1449 if len(sha) == 40:
1450 return sha
1451 elif len(sha) == 20:
1452 return sha_to_hex(sha)
1453 else:
1454 raise ValueError(f"Invalid sha {sha!r}")
1455
1456 def contains_loose(self, sha):
1457 """Check if a particular object is present by SHA1 and is loose."""
1458 return self._to_hexsha(sha) in self._data
1459
1460 def contains_packed(self, sha) -> bool:
1461 """Check if a particular object is present by SHA1 and is packed."""
1462 return False
1463
1464 def __iter__(self):
1465 """Iterate over the SHAs that are present in this store."""
1466 return iter(self._data.keys())
1467
1468 @property
1469 def packs(self):
1470 """List with pack objects."""
1471 return []
1472
1473 def get_raw(self, name: ObjectID):
1474 """Obtain the raw text for an object.
1475
1476 Args:
1477 name: sha for the object.
1478 Returns: tuple with numeric type and object contents.
1479 """
1480 obj = self[self._to_hexsha(name)]
1481 return obj.type_num, obj.as_raw_string()
1482
1483 def __getitem__(self, name: ObjectID):
1484 return self._data[self._to_hexsha(name)].copy()
1485
1486 def __delitem__(self, name: ObjectID) -> None:
1487 """Delete an object from this store, for testing only."""
1488 del self._data[self._to_hexsha(name)]
1489
1490 def add_object(self, obj) -> None:
1491 """Add a single object to this object store."""
1492 self._data[obj.id] = obj.copy()
1493
1494 def add_objects(self, objects, progress=None) -> None:
1495 """Add a set of objects to this object store.
1496
1497 Args:
1498 objects: Iterable over a list of (object, path) tuples
1499 """
1500 for obj, path in objects:
1501 self.add_object(obj)
1502
1503 def add_pack(self):
1504 """Add a new pack to this object store.
1505
1506 Because this object store doesn't support packs, we extract and add the
1507 individual objects.
1508
        Returns: Fileobject to write to, a commit function to call when the
            pack is finished, and an abort function.
        """
1512 from tempfile import SpooledTemporaryFile
1513
1514 f = SpooledTemporaryFile(max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-")
1515
1516 def commit() -> None:
1517 size = f.tell()
1518 if size > 0:
1519 f.seek(0)
1520 p = PackData.from_file(f, size)
1521 for obj in PackInflater.for_pack_data(p, self.get_raw):
1522 self.add_object(obj)
1523 p.close()
1524 f.close()
1525 else:
1526 f.close()
1527
1528 def abort() -> None:
1529 f.close()
1530
1531 return f, commit, abort
1532
1533 def add_pack_data(
1534 self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None
1535 ) -> None:
1536 """Add pack data to this object store.
1537
        Args:
          count: Number of objects to add
          unpacked_objects: Iterator over UnpackedObject instances
          progress: Optional progress reporting function
        """
1541 if count == 0:
1542 return
1543
1544 # Since MemoryObjectStore doesn't support pack files, we need to
1545 # extract individual objects. To handle deltas properly, we write
1546 # to a temporary pack and then use PackInflater to resolve them.
1547 f, commit, abort = self.add_pack()
1548 try:
1549 write_pack_data(
1550 f.write,
1551 unpacked_objects,
1552 num_records=count,
1553 progress=progress,
1554 )
1555 except BaseException:
1556 abort()
1557 raise
1558 else:
1559 commit()
1560
1561 def add_thin_pack(self, read_all, read_some, progress=None) -> None:
1562 """Add a new thin pack to this object store.
1563
1564 Thin packs are packs that contain deltas with parents that exist
1565 outside the pack. Because this object store doesn't support packs, we
1566 extract and add the individual objects.
1567
1568 Args:
1569 read_all: Read function that blocks until the number of
1570 requested bytes are read.
1571 read_some: Read function that returns at least one byte, but may
1572 not return the number of bytes requested.
1573 """
1574 f, commit, abort = self.add_pack()
1575 try:
1576 copier = PackStreamCopier(read_all, read_some, f)
1577 copier.verify()
1578 except BaseException:
1579 abort()
1580 raise
1581 else:
1582 commit()
1583
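# Example (illustrative sketch): MemoryObjectStore is convenient for tests
# because nothing touches the filesystem.
#
#     store = MemoryObjectStore()
#     blob = Blob.from_string(b"spam")
#     store.add_object(blob)
#     assert store[blob.id].as_raw_string() == b"spam"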
1584
1585class ObjectIterator(Protocol):
1586 """Interface for iterating over objects."""
1587
1588 def iterobjects(self) -> Iterator[ShaFile]:
1589 raise NotImplementedError(self.iterobjects)
1590
1591
1592def tree_lookup_path(lookup_obj, root_sha, path):
1593 """Look up an object in a Git tree.
1594
1595 Args:
1596 lookup_obj: Callback for retrieving object by SHA1
1597 root_sha: SHA1 of the root tree
1598 path: Path to lookup
1599 Returns: A tuple of (mode, SHA) of the resulting path.
1600 """
1601 tree = lookup_obj(root_sha)
1602 if not isinstance(tree, Tree):
1603 raise NotTreeError(root_sha)
1604 return tree.lookup_path(lookup_obj, path)
1605
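# Example (illustrative sketch; ``store``, ``tree_id`` and the path are
# hypothetical):
#
#     mode, sha = tree_lookup_path(store.__getitem__, tree_id, b"docs/index.txt")
#     # ``mode`` is the entry's file mode, ``sha`` the blob (or subtree) id.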
1606
1607def _collect_filetree_revs(
1608 obj_store: ObjectContainer, tree_sha: ObjectID, kset: set[ObjectID]
1609) -> None:
1610 """Collect SHA1s of files and directories for specified tree.
1611
1612 Args:
1613 obj_store: Object store to get objects by SHA from
1614 tree_sha: tree reference to walk
1615 kset: set to fill with references to files and directories
1616 """
1617 filetree = obj_store[tree_sha]
1618 assert isinstance(filetree, Tree)
1619 for name, mode, sha in filetree.iteritems():
1620 if not S_ISGITLINK(mode) and sha not in kset:
1621 kset.add(sha)
1622 if stat.S_ISDIR(mode):
1623 _collect_filetree_revs(obj_store, sha, kset)
1624
1625
1626def _split_commits_and_tags(
1627 obj_store: ObjectContainer, lst, *, ignore_unknown=False
1628) -> tuple[set[bytes], set[bytes], set[bytes]]:
1629 """Split object id list into three lists with commit, tag, and other SHAs.
1630
    Commits referenced by tags are included in the commits list as well.
    Only SHA1s known in this repository will get through; unless the
    ignore_unknown argument is True, a KeyError is raised for any SHA1
    missing from the repository.
1635
1636 Args:
1637 obj_store: Object store to get objects by SHA1 from
1638 lst: Collection of commit and tag SHAs
1639 ignore_unknown: True to skip SHA1 missing in the repository
1640 silently.
1641 Returns: A tuple of (commits, tags, others) SHA1s
1642 """
1643 commits: set[bytes] = set()
1644 tags: set[bytes] = set()
1645 others: set[bytes] = set()
1646 for e in lst:
1647 try:
1648 o = obj_store[e]
1649 except KeyError:
1650 if not ignore_unknown:
1651 raise
1652 else:
1653 if isinstance(o, Commit):
1654 commits.add(e)
1655 elif isinstance(o, Tag):
1656 tags.add(e)
1657 tagged = o.object[1]
1658 c, t, os = _split_commits_and_tags(
1659 obj_store, [tagged], ignore_unknown=ignore_unknown
1660 )
1661 commits |= c
1662 tags |= t
1663 others |= os
1664 else:
1665 others.add(e)
1666 return (commits, tags, others)
1667
1668
1669class MissingObjectFinder:
1670 """Find the objects missing from another object store.
1671
1672 Args:
1673 object_store: Object store containing at least all objects to be
1674 sent
      haves: SHA1s of commits not to send (already present in target)
      wants: SHA1s of commits to send
      shallow: Optional set of shallow commit SHA1s to skip
      progress: Optional function to report progress to.
1678 get_tagged: Function that returns a dict of pointed-to sha -> tag
1679 sha for including tags.
1680 get_parents: Optional function for getting the parents of a commit.
1681 """
1682
1683 def __init__(
1684 self,
1685 object_store,
1686 haves,
1687 wants,
1688 *,
1689 shallow=None,
1690 progress=None,
1691 get_tagged=None,
1692 get_parents=lambda commit: commit.parents,
1693 ) -> None:
1694 self.object_store = object_store
1695 if shallow is None:
1696 shallow = set()
1697 self._get_parents = get_parents
1698 # process Commits and Tags differently
        # Note: haves may list commits/tags not available locally; such SHAs
        # are filtered out by _split_commits_and_tags. wants, however, must
        # list only known SHAs, otherwise _split_commits_and_tags raises
        # KeyError.
1703 have_commits, have_tags, have_others = _split_commits_and_tags(
1704 object_store, haves, ignore_unknown=True
1705 )
1706 want_commits, want_tags, want_others = _split_commits_and_tags(
1707 object_store, wants, ignore_unknown=False
1708 )
1709 # all_ancestors is a set of commits that shall not be sent
1710 # (complete repository up to 'haves')
1711 all_ancestors = _collect_ancestors(
1712 object_store, have_commits, shallow=shallow, get_parents=self._get_parents
1713 )[0]
1714 # all_missing - complete set of commits between haves and wants
1715 # common - commits from all_ancestors we hit into while
1716 # traversing parent hierarchy of wants
1717 missing_commits, common_commits = _collect_ancestors(
1718 object_store,
1719 want_commits,
1720 all_ancestors,
1721 shallow=shallow,
1722 get_parents=self._get_parents,
1723 )
1724 self.remote_has: set[bytes] = set()
        # Now, fill sha_done with commits and revisions of
        # files and directories known to be present both locally
        # and on the target. These commits and files therefore
        # won't get selected for fetch.
1729 for h in common_commits:
1730 self.remote_has.add(h)
1731 cmt = object_store[h]
1732 _collect_filetree_revs(object_store, cmt.tree, self.remote_has)
1733 # record tags we have as visited, too
1734 for t in have_tags:
1735 self.remote_has.add(t)
1736 self.sha_done = set(self.remote_has)
1737
1738 # in fact, what we 'want' is commits, tags, and others
1739 # we've found missing
1740 self.objects_to_send: set[
1741 tuple[ObjectID, Optional[bytes], Optional[int], bool]
1742 ] = {(w, None, Commit.type_num, False) for w in missing_commits}
1743 missing_tags = want_tags.difference(have_tags)
1744 self.objects_to_send.update(
1745 {(w, None, Tag.type_num, False) for w in missing_tags}
1746 )
1747 missing_others = want_others.difference(have_others)
1748 self.objects_to_send.update({(w, None, None, False) for w in missing_others})
1749
1750 if progress is None:
1751 self.progress = lambda x: None
1752 else:
1753 self.progress = progress
1754 self._tagged = (get_tagged and get_tagged()) or {}
1755
1756 def get_remote_has(self):
1757 return self.remote_has
1758
1759 def add_todo(
1760 self, entries: Iterable[tuple[ObjectID, Optional[bytes], Optional[int], bool]]
1761 ) -> None:
1762 self.objects_to_send.update([e for e in entries if e[0] not in self.sha_done])
1763
1764 def __next__(self) -> tuple[bytes, Optional[PackHint]]:
1765 while True:
1766 if not self.objects_to_send:
1767 self.progress(
1768 f"counting objects: {len(self.sha_done)}, done.\n".encode("ascii")
1769 )
1770 raise StopIteration
1771 (sha, name, type_num, leaf) = self.objects_to_send.pop()
1772 if sha not in self.sha_done:
1773 break
1774 if not leaf:
1775 o = self.object_store[sha]
1776 if isinstance(o, Commit):
1777 self.add_todo([(o.tree, b"", Tree.type_num, False)])
1778 elif isinstance(o, Tree):
1779 self.add_todo(
1780 [
1781 (
1782 s,
1783 n,
1784 (Blob.type_num if stat.S_ISREG(m) else Tree.type_num),
1785 not stat.S_ISDIR(m),
1786 )
1787 for n, m, s in o.iteritems()
1788 if not S_ISGITLINK(m)
1789 ]
1790 )
1791 elif isinstance(o, Tag):
1792 self.add_todo([(o.object[1], None, o.object[0].type_num, False)])
1793 if sha in self._tagged:
1794 self.add_todo([(self._tagged[sha], None, None, True)])
1795 self.sha_done.add(sha)
1796 if len(self.sha_done) % 1000 == 0:
1797 self.progress(f"counting objects: {len(self.sha_done)}\r".encode("ascii"))
1798 if type_num is None:
1799 pack_hint = None
1800 else:
1801 pack_hint = (type_num, name)
1802 return (sha, pack_hint)
1803
1804 def __iter__(self):
1805 return self
1806
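# Example (illustrative sketch; ``store``, ``their_heads`` and ``our_heads``
# are hypothetical):
#
#     finder = MissingObjectFinder(store, haves=their_heads, wants=our_heads)
#     for sha, pack_hint in finder:
#         pass  # every object the other side is missing, with an optional
#               # (type_num, name) hint for pack generation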
1807
1808class ObjectStoreGraphWalker:
1809 """Graph walker that finds what commits are missing from an object store."""
1810
1811 heads: set[ObjectID]
1812 """Revisions without descendants in the local repo."""
1813
    get_parents: Callable[[ObjectID], list[ObjectID]]
1815 """Function to retrieve parents in the local repo."""
1816
1817 shallow: set[ObjectID]
1818
1819 def __init__(
1820 self,
1821 local_heads: Iterable[ObjectID],
1822 get_parents,
1823 shallow: Optional[set[ObjectID]] = None,
1824 update_shallow=None,
1825 ) -> None:
1826 """Create a new instance.
1827
1828 Args:
1829 local_heads: Heads to start search with
1830 get_parents: Function for finding the parents of a SHA1.
1831 """
1832 self.heads = set(local_heads)
1833 self.get_parents = get_parents
1834 self.parents: dict[ObjectID, Optional[list[ObjectID]]] = {}
1835 if shallow is None:
1836 shallow = set()
1837 self.shallow = shallow
1838 self.update_shallow = update_shallow
1839
1840 def nak(self) -> None:
1841 """Nothing in common was found."""
1842
1843 def ack(self, sha: ObjectID) -> None:
1844 """Ack that a revision and its ancestors are present in the source."""
1845 if len(sha) != 40:
1846 raise ValueError(f"unexpected sha {sha!r} received")
1847 ancestors = {sha}
1848
1849 # stop if we run out of heads to remove
1850 while self.heads:
1851 for a in ancestors:
1852 if a in self.heads:
1853 self.heads.remove(a)
1854
1855 # collect all ancestors
1856 new_ancestors = set()
1857 for a in ancestors:
1858 ps = self.parents.get(a)
1859 if ps is not None:
1860 new_ancestors.update(ps)
1861 self.parents[a] = None
1862
1863 # no more ancestors; stop
1864 if not new_ancestors:
1865 break
1866
1867 ancestors = new_ancestors
1868
1869 def next(self):
1870 """Iterate over ancestors of heads in the target."""
1871 if self.heads:
1872 ret = self.heads.pop()
1873 try:
1874 ps = self.get_parents(ret)
1875 except KeyError:
1876 return None
1877 self.parents[ret] = ps
1878 self.heads.update([p for p in ps if p not in self.parents])
1879 return ret
1880 return None
1881
1882 __next__ = next
1883
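# Example (illustrative sketch; ``store`` and ``refs`` are hypothetical):
#
#     walker = ObjectStoreGraphWalker(
#         refs.values(), lambda sha: store[sha].parents
#     )
#     sha = next(walker)          # propose a local head to the remote
#     walker.ack(sha)             # remote confirmed it has this commit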
1884
1885def commit_tree_changes(object_store, tree, changes):
1886 """Commit a specified set of changes to a tree structure.
1887
1888 This will apply a set of changes on top of an existing tree, storing new
1889 objects in object_store.
1890
    changes are a list of tuples with (path, mode, object_sha).
    Paths can be both blobs and trees. Setting the mode and
    object sha to None deletes the path.
1894
1895 This method works especially well if there are only a small
1896 number of changes to a big tree. For a large number of changes
1897 to a large tree, use e.g. commit_tree.
1898
1899 Args:
1900 object_store: Object store to store new objects in
1901 and retrieve old ones from.
1902 tree: Original tree root
1903 changes: changes to apply
1904 Returns: New tree root object
1905 """
1906 # TODO(jelmer): Save up the objects and add them using .add_objects
1907 # rather than with individual calls to .add_object.
1908 nested_changes = {}
1909 for path, new_mode, new_sha in changes:
1910 try:
1911 (dirname, subpath) = path.split(b"/", 1)
1912 except ValueError:
1913 if new_sha is None:
1914 del tree[path]
1915 else:
1916 tree[path] = (new_mode, new_sha)
1917 else:
1918 nested_changes.setdefault(dirname, []).append((subpath, new_mode, new_sha))
1919 for name, subchanges in nested_changes.items():
1920 try:
1921 orig_subtree = object_store[tree[name][1]]
1922 except KeyError:
1923 orig_subtree = Tree()
1924 subtree = commit_tree_changes(object_store, orig_subtree, subchanges)
1925 if len(subtree) == 0:
1926 del tree[name]
1927 else:
1928 tree[name] = (stat.S_IFDIR, subtree.id)
1929 object_store.add_object(tree)
1930 return tree
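

# Usage sketch (illustrative only): applying a couple of changes on top of an
# existing tree.  `store` and `tree` stand for an object store and a Tree
# loaded from it; the paths are made up, and the deleted path is assumed to
# already exist in `tree`.
#
#   blob = Blob.from_string(b"new contents\n")
#   store.add_object(blob)
#   new_root = commit_tree_changes(
#       store,
#       tree,
#       [
#           (b"docs/README", 0o100644, blob.id),  # add or replace a file
#           (b"old-name.txt", None, None),        # delete an existing path
#       ],
#   )
#   # new_root (== tree, updated in place) has been added to `store`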


class OverlayObjectStore(BaseObjectStore):
    """Object store that can overlay multiple object stores."""

    def __init__(self, bases, add_store=None) -> None:
        self.bases = bases
        self.add_store = add_store

    def add_object(self, object):
        if self.add_store is None:
            raise NotImplementedError(self.add_object)
        return self.add_store.add_object(object)

    def add_objects(self, objects, progress=None):
        if self.add_store is None:
            raise NotImplementedError(self.add_objects)
        return self.add_store.add_objects(objects, progress)

    @property
    def packs(self):
        ret = []
        for b in self.bases:
            ret.extend(b.packs)
        return ret

    def __iter__(self):
        done = set()
        for b in self.bases:
            for o_id in b:
                if o_id not in done:
                    yield o_id
                    done.add(o_id)

    def iterobjects_subset(
        self, shas: Iterable[bytes], *, allow_missing: bool = False
    ) -> Iterator[ShaFile]:
        todo = set(shas)
        found: set[bytes] = set()

        for b in self.bases:
            # Create a copy of todo for each base to avoid modifying
            # the set while iterating through it
            current_todo = todo - found
            for o in b.iterobjects_subset(current_todo, allow_missing=True):
                yield o
                found.add(o.id)

        # Check for any remaining objects not found
        missing = todo - found
        if missing and not allow_missing:
            raise KeyError(next(iter(missing)))

    def iter_unpacked_subset(
        self,
        shas: Iterable[bytes],
        *,
        include_comp=False,
        allow_missing: bool = False,
        convert_ofs_delta=True,
    ) -> Iterator[UnpackedObject]:
        todo = set(shas)
        for b in self.bases:
            for o in b.iter_unpacked_subset(
                todo,
                include_comp=include_comp,
                allow_missing=True,
                convert_ofs_delta=convert_ofs_delta,
            ):
                yield o
                todo.remove(o.id)
        if todo and not allow_missing:
            raise KeyError(next(iter(todo)))

    def get_raw(self, sha_id):
        for b in self.bases:
            try:
                return b.get_raw(sha_id)
            except KeyError:
                pass
        raise KeyError(sha_id)

    def contains_packed(self, sha) -> bool:
        for b in self.bases:
            if b.contains_packed(sha):
                return True
        return False

    def contains_loose(self, sha) -> bool:
        for b in self.bases:
            if b.contains_loose(sha):
                return True
        return False
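

# Usage sketch (illustrative only): overlaying two stores so that lookups fall
# through from the first base to the second, while writes go to `add_store`.
# MemoryObjectStore is the in-memory store defined earlier in this module.
#
#   local = MemoryObjectStore()
#   upstream = MemoryObjectStore()
#   overlay = OverlayObjectStore([local, upstream], add_store=local)
#   blob = Blob.from_string(b"hello\n")
#   upstream.add_object(blob)
#   assert blob.id in overlay                          # found via `upstream`
#   overlay.add_object(Blob.from_string(b"world\n"))   # stored in `local`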


def read_packs_file(f):
    """Yield the packs listed in a packs file."""
    for line in f.read().splitlines():
        if not line:
            continue
        (kind, name) = line.split(b" ", 1)
        if kind != b"P":
            continue
        yield os.fsdecode(name)
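

# Format sketch: an objects/info/packs file contains one "P <pack-name>" entry
# per line, e.g. (pack names made up):
#
#   f = BytesIO(b"P pack-1e68a.pack\nP pack-77c21.pack\n")
#   list(read_packs_file(f))
#   # -> ["pack-1e68a.pack", "pack-77c21.pack"]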


class BucketBasedObjectStore(PackBasedObjectStore):
    """Object store implementation that uses a bucket store like S3 as its backend."""

    def _iter_loose_objects(self):
        """Iterate over the SHAs of all loose objects."""
        return iter([])

    def _get_loose_object(self, sha) -> None:
        return None

    def delete_loose_object(self, sha) -> None:
        # Loose objects do not exist in a bucket-based store, so there is
        # nothing to delete.
        pass

    def _remove_pack(self, name) -> None:
        raise NotImplementedError(self._remove_pack)

    def _iter_pack_names(self) -> Iterator[str]:
        raise NotImplementedError(self._iter_pack_names)

    def _get_pack(self, name) -> Pack:
        raise NotImplementedError(self._get_pack)

    def _update_pack_cache(self):
        pack_files = set(self._iter_pack_names())

        # Open newly appeared pack files
        new_packs = []
        for f in pack_files:
            if f not in self._pack_cache:
                pack = self._get_pack(f)
                new_packs.append(pack)
                self._pack_cache[f] = pack
        # Remove disappeared pack files
        for f in set(self._pack_cache) - pack_files:
            self._pack_cache.pop(f).close()
        return new_packs

    def _upload_pack(self, basename, pack_file, index_file) -> None:
        raise NotImplementedError

    def add_pack(self):
        """Add a new pack to this object store.

        Returns: Fileobject to write to, a commit function to
            call when the pack is finished and an abort
            function.
        """
        import tempfile

        pf = tempfile.SpooledTemporaryFile(
            max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-"
        )

        def commit():
            if pf.tell() == 0:
                pf.close()
                return None

            pf.seek(0)
            p = PackData(pf.name, pf)
            entries = p.sorted_entries()
            basename = iter_sha1(entry[0] for entry in entries).decode("ascii")
            idxf = tempfile.SpooledTemporaryFile(
                max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-"
            )
            checksum = p.get_stored_checksum()
            write_pack_index(idxf, entries, checksum, version=self.pack_index_version)
            idxf.seek(0)
            idx = load_pack_index_file(basename + ".idx", idxf)
            for pack in self.packs:
                if pack.get_stored_checksum() == p.get_stored_checksum():
                    p.close()
                    idx.close()
                    pf.close()
                    idxf.close()
                    return pack
            pf.seek(0)
            idxf.seek(0)
            self._upload_pack(basename, pf, idxf)
            final_pack = Pack.from_objects(p, idx)
            self._add_cached_pack(basename, final_pack)
            pf.close()
            idxf.close()
            return final_pack

        return pf, commit, pf.close
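

# Subclass sketch (illustrative only): a bucket-backed store only needs to
# list, fetch, upload and remove packs; loose objects are never written.  The
# `bucket` client and its `list`/`open`/`put`/`delete` methods are hypothetical
# placeholders for a real blob-store API (e.g. an S3 wrapper).
#
#   class ExampleBucketObjectStore(BucketBasedObjectStore):
#       def __init__(self, bucket) -> None:
#           super().__init__()
#           self.bucket = bucket
#
#       def _iter_pack_names(self) -> Iterator[str]:
#           return iter(self.bucket.list("packs/"))
#
#       def _get_pack(self, name) -> Pack:
#           # Pack.from_lazy_objects defers opening data/index until first use
#           return Pack.from_lazy_objects(
#               lambda: PackData(name, self.bucket.open(f"packs/{name}.pack")),
#               lambda: load_pack_index_file(
#                   name + ".idx", self.bucket.open(f"packs/{name}.idx")
#               ),
#           )
#
#       def _upload_pack(self, basename, pack_file, index_file) -> None:
#           self.bucket.put(f"packs/{basename}.pack", pack_file)
#           self.bucket.put(f"packs/{basename}.idx", index_file)
#
#       def _remove_pack(self, name) -> None:
#           self.bucket.delete(f"packs/{name}.pack")
#           self.bucket.delete(f"packs/{name}.idx")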


def _collect_ancestors(
    store: ObjectContainer,
    heads,
    common: frozenset[ObjectID] = frozenset(),
    shallow: frozenset[ObjectID] = frozenset(),
    get_parents=lambda commit: commit.parents,
):
    """Collect all ancestors of heads up to (excluding) those in common.

    Args:
      store: Object store to retrieve commits from
      heads: commits to start from
      common: commits to end at, or empty set to walk repository
        completely
      shallow: Set of shallow commits; traversal does not continue past
        these
      get_parents: Optional function for getting the parents of a
        commit.
    Returns: a tuple (A, B) where A is the set of all commits reachable
        from heads but not present in common, and B is the subset of
        common that is directly reachable from heads
    """
    bases = set()
    commits = set()
    queue = []
    queue.extend(heads)
    while queue:
        e = queue.pop(0)
        if e in common:
            bases.add(e)
        elif e not in commits:
            commits.add(e)
            if e in shallow:
                continue
            cmt = store[e]
            queue.extend(get_parents(cmt))
    return (commits, bases)
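

# Worked sketch: for a linear history a <- b <- c (c newest) stored in `store`,
# collecting the ancestors of c with b marked as common gives:
#
#   commits, bases = _collect_ancestors(store, [c_sha], common=frozenset({b_sha}))
#   # commits == {c_sha}   -- reachable from the heads, not in common
#   # bases == {b_sha}     -- members of common directly reachable from the heads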


def iter_tree_contents(
    store: ObjectContainer, tree_id: Optional[ObjectID], *, include_trees: bool = False
):
    """Iterate the contents of a tree and all subtrees.

    Iteration is depth-first pre-order, as in e.g. os.walk.

    Args:
      store: Object store to retrieve tree and subtree objects from.
      tree_id: SHA1 of the tree.
      include_trees: If True, include tree objects in the iteration.
    Returns: Iterator over TreeEntry namedtuples for all the objects in a
        tree.
    """
    if tree_id is None:
        return
    # This could be fairly easily generalized to >2 trees if we find a use
    # case.
    todo = [TreeEntry(b"", stat.S_IFDIR, tree_id)]
    while todo:
        entry = todo.pop()
        if stat.S_ISDIR(entry.mode):
            extra = []
            tree = store[entry.sha]
            assert isinstance(tree, Tree)
            for subentry in tree.iteritems(name_order=True):
                extra.append(subentry.in_path(entry.path))
            todo.extend(reversed(extra))
        if not stat.S_ISDIR(entry.mode) or include_trees:
            yield entry
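

# Usage sketch (illustrative only): walking a small in-memory tree; paths and
# contents are made up.
#
#   store = MemoryObjectStore()
#   blob = Blob.from_string(b"spam\n")
#   subtree = Tree()
#   subtree.add(b"b", 0o100644, blob.id)
#   root = Tree()
#   root.add(b"a", 0o100644, blob.id)
#   root.add(b"sub", stat.S_IFDIR, subtree.id)
#   for obj in (blob, subtree, root):
#       store.add_object(obj)
#   [e.path for e in iter_tree_contents(store, root.id)]
#   # -> [b"a", b"sub/b"]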


def peel_sha(store: ObjectContainer, sha: bytes) -> tuple[ShaFile, ShaFile]:
    """Peel all tags from a SHA.

    Args:
      store: Object store to retrieve objects from.
      sha: The object SHA to peel.
    Returns: A tuple of (unpeeled, peeled), where unpeeled is the object
        the SHA refers to and peeled is the object reached after following
        all intermediate tags; if the original SHA does not point to a tag,
        both elements are the same object.
    """
    unpeeled = obj = store[sha]
    obj_class = object_class(obj.type_name)
    while obj_class is Tag:
        assert isinstance(obj, Tag)
        obj_class, sha = obj.object
        obj = store[sha]
    return unpeeled, obj
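

# Usage sketch (illustrative only): `store` holds a commit and an annotated
# tag pointing at it; `tag` and `commit` stand for objects the caller already
# has.
#
#   unpeeled, peeled = peel_sha(store, tag.id)
#   # unpeeled is the Tag itself, peeled is the Commit it ultimately points at
#   unpeeled, peeled = peel_sha(store, commit.id)
#   # for a non-tag object, both elements are the same Commit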