1# object_store.py -- Object store for git objects
2# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
3# and others
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as published by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
22
23
24"""Git object store interfaces and implementation."""
25
26import binascii
27import os
28import stat
29import sys
30import time
31import warnings
32from collections.abc import Iterable, Iterator, Sequence
33from contextlib import suppress
34from io import BytesIO
35from pathlib import Path
36from typing import (
37 TYPE_CHECKING,
38 Callable,
39 Optional,
40 Protocol,
41 Union,
42)
43
44from .errors import NotTreeError
45from .file import GitFile, _GitFile
46from .objects import (
47 S_ISGITLINK,
48 ZERO_SHA,
49 Blob,
50 Commit,
51 ObjectID,
52 ShaFile,
53 Tag,
54 Tree,
55 TreeEntry,
56 hex_to_filename,
57 hex_to_sha,
58 object_class,
59 sha_to_hex,
60 valid_hexsha,
61)
62from .pack import (
63 PACK_SPOOL_FILE_MAX_SIZE,
64 ObjectContainer,
65 Pack,
66 PackData,
67 PackedObjectContainer,
68 PackFileDisappeared,
69 PackHint,
70 PackIndexer,
71 PackInflater,
72 PackStreamCopier,
73 UnpackedObject,
74 extend_pack,
75 full_unpacked_object,
76 generate_unpacked_objects,
77 iter_sha1,
78 load_pack_index_file,
79 pack_objects_to_data,
80 write_pack_data,
81 write_pack_index,
82)
83from .protocol import DEPTH_INFINITE
84from .refs import PEELED_TAG_SUFFIX, Ref
85
86if TYPE_CHECKING:
87 from .commit_graph import CommitGraph
88 from .diff_tree import RenameDetector
89
90
91class GraphWalker(Protocol):
92 """Protocol for graph walker objects."""
93
94 def __next__(self) -> Optional[bytes]:
95 """Return the next object SHA to visit."""
96 ...
97
98 def ack(self, sha: bytes) -> None:
99 """Acknowledge that an object has been received."""
100 ...
101
102
103INFODIR = "info"
104PACKDIR = "pack"
105
# use permissions consistent with Git; just readable by everyone
# TODO: should packs also be non-writable on Windows? if so, that
# would require some rather significant adjustments to the test suite
109PACK_MODE = 0o444 if sys.platform != "win32" else 0o644
110
111# Grace period for cleaning up temporary pack files (in seconds)
112# Matches git's default of 2 weeks
113DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60 # 2 weeks
114
115
116def find_shallow(
117 store: ObjectContainer, heads: Iterable[bytes], depth: int
118) -> tuple[set[bytes], set[bytes]]:
119 """Find shallow commits according to a given depth.
120
121 Args:
122 store: An ObjectStore for looking up objects.
123 heads: Iterable of head SHAs to start walking from.
124 depth: The depth of ancestors to include. A depth of one includes
125 only the heads themselves.
126 Returns: A tuple of (shallow, not_shallow), sets of SHAs that should be
127 considered shallow and unshallow according to the arguments. Note that
128 these sets may overlap if a commit is reachable along multiple paths.
129 """
130 parents: dict[bytes, list[bytes]] = {}
131 commit_graph = store.get_commit_graph()
132
133 def get_parents(sha: bytes) -> list[bytes]:
134 result = parents.get(sha, None)
135 if not result:
136 # Try to use commit graph first if available
137 if commit_graph:
138 graph_parents = commit_graph.get_parents(sha)
139 if graph_parents is not None:
140 result = graph_parents
141 parents[sha] = result
142 return result
143 # Fall back to loading the object
144 commit = store[sha]
145 assert isinstance(commit, Commit)
146 result = commit.parents
147 parents[sha] = result
148 return result
149
150 todo = [] # stack of (sha, depth)
151 for head_sha in heads:
152 obj = store[head_sha]
153 # Peel tags if necessary
154 while isinstance(obj, Tag):
155 _, sha = obj.object
156 obj = store[sha]
157 if isinstance(obj, Commit):
158 todo.append((obj.id, 1))
159
160 not_shallow = set()
161 shallow = set()
162 while todo:
163 sha, cur_depth = todo.pop()
164 if cur_depth < depth:
165 not_shallow.add(sha)
166 new_depth = cur_depth + 1
167 todo.extend((p, new_depth) for p in get_parents(sha))
168 else:
169 shallow.add(sha)
170
171 return shallow, not_shallow
172
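# Illustrative sketch (not executed; the repository path is an assumption):
# find_shallow() can be used to decide which commits a depth-limited fetch
# should mark as shallow.
#
#   from dulwich.repo import Repo
#
#   repo = Repo("/path/to/repo")
#   shallow, not_shallow = find_shallow(
#       repo.object_store, [repo.head()], depth=2
#   )
#   # ``shallow`` holds the cut-off commits; ``not_shallow`` the commits whose
#   # parents (up to the requested depth) are also present.
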
173
174def get_depth(
175 store: ObjectContainer,
176 head: bytes,
177 get_parents: Callable = lambda commit: commit.parents,
178 max_depth: Optional[int] = None,
179) -> int:
180 """Return the current available depth for the given head.
181
182 For commits with multiple parents, the largest possible depth will be
183 returned.
184
185 Args:
186 store: Object store to search in
187 head: commit to start from
188 get_parents: optional function for getting the parents of a commit
189 max_depth: maximum depth to search
190 """
191 if head not in store:
192 return 0
193 current_depth = 1
194 queue = [(head, current_depth)]
195 commit_graph = store.get_commit_graph()
196
197 while queue and (max_depth is None or current_depth < max_depth):
198 e, depth = queue.pop(0)
199 current_depth = max(current_depth, depth)
200
201 # Try to use commit graph for parent lookup if available
202 parents = None
203 if commit_graph:
204 parents = commit_graph.get_parents(e)
205
206 if parents is None:
207 # Fall back to loading the object
208 cmt = store[e]
209 if isinstance(cmt, Tag):
210 _cls, sha = cmt.object
211 cmt = store[sha]
212 parents = get_parents(cmt)
213
214 queue.extend((parent, depth + 1) for parent in parents if parent in store)
215 return current_depth
216
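# Illustrative sketch (not executed; ``store`` and ``head`` are assumptions):
# get_depth() reports how much history is already available locally, which is
# what determine_wants_all() consults when deciding whether to deepen.
#
#   depth = get_depth(store, head, max_depth=50)
#   if depth < 50:
#       ...  # local history is shallower than the requested maximum
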
217
218class PackContainer(Protocol):
219 """Protocol for containers that can accept pack files."""
220
221 def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]:
        """Add a new pack.

        Returns: Tuple of (file object to write pack data to, commit
            function, abort function).
        """
223
224
225class BaseObjectStore:
226 """Object store interface."""
227
228 def determine_wants_all(
229 self, refs: dict[Ref, ObjectID], depth: Optional[int] = None
230 ) -> list[ObjectID]:
231 """Determine which objects are wanted based on refs."""
232
233 def _want_deepen(sha: bytes) -> bool:
234 if not depth:
235 return False
236 if depth == DEPTH_INFINITE:
237 return True
238 return depth > self._get_depth(sha)
239
240 return [
241 sha
242 for (ref, sha) in refs.items()
243 if (sha not in self or _want_deepen(sha))
244 and not ref.endswith(PEELED_TAG_SUFFIX)
245 and not sha == ZERO_SHA
246 ]
247
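    # Illustrative sketch (not executed; ``client``, ``path`` and ``repo`` are
    # assumptions): determine_wants_all() is the usual ``determine_wants``
    # callback when fetching everything a remote advertises.
    #
    #   client.fetch(
    #       path, repo, determine_wants=repo.object_store.determine_wants_all
    #   )
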
248 def contains_loose(self, sha: bytes) -> bool:
249 """Check if a particular object is present by SHA1 and is loose."""
250 raise NotImplementedError(self.contains_loose)
251
252 def __contains__(self, sha1: bytes) -> bool:
253 """Check if a particular object is present by SHA1.
254
255 This method makes no distinction between loose and packed objects.
256 """
257 return self.contains_loose(sha1)
258
259 @property
260 def packs(self) -> list[Pack]:
261 """Iterable of pack objects."""
262 raise NotImplementedError
263
264 def get_raw(self, name: bytes) -> tuple[int, bytes]:
265 """Obtain the raw text for an object.
266
267 Args:
268 name: sha for the object.
269 Returns: tuple with numeric type and object contents.
270 """
271 raise NotImplementedError(self.get_raw)
272
273 def __getitem__(self, sha1: ObjectID) -> ShaFile:
274 """Obtain an object by SHA1."""
275 type_num, uncomp = self.get_raw(sha1)
276 return ShaFile.from_raw_string(type_num, uncomp, sha=sha1)
277
278 def __iter__(self) -> Iterator[bytes]:
279 """Iterate over the SHAs that are present in this store."""
280 raise NotImplementedError(self.__iter__)
281
282 def add_object(self, obj: ShaFile) -> None:
283 """Add a single object to this object store."""
284 raise NotImplementedError(self.add_object)
285
286 def add_objects(
287 self,
288 objects: Sequence[tuple[ShaFile, Optional[str]]],
289 progress: Optional[Callable] = None,
290 ) -> Optional["Pack"]:
291 """Add a set of objects to this object store.
292
293 Args:
294 objects: Iterable over a list of (object, path) tuples
295 progress: Optional progress callback
296 """
297 raise NotImplementedError(self.add_objects)
298
299 def tree_changes(
300 self,
301 source: Optional[bytes],
302 target: Optional[bytes],
303 want_unchanged: bool = False,
304 include_trees: bool = False,
305 change_type_same: bool = False,
306 rename_detector: Optional["RenameDetector"] = None,
307 paths: Optional[list[bytes]] = None,
308 ) -> Iterator[
309 tuple[
310 tuple[Optional[bytes], Optional[bytes]],
311 tuple[Optional[int], Optional[int]],
312 tuple[Optional[bytes], Optional[bytes]],
313 ]
314 ]:
315 """Find the differences between the contents of two trees.
316
317 Args:
318 source: SHA1 of the source tree
319 target: SHA1 of the target tree
320 want_unchanged: Whether unchanged files should be reported
321 include_trees: Whether to include trees
322 change_type_same: Whether to report files changing
323 type in the same entry.
324 rename_detector: RenameDetector object for detecting renames.
325 paths: Optional list of paths to filter to (as bytes).
326 Returns: Iterator over tuples with
327 (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
328 """
329 from .diff_tree import tree_changes
330
331 for change in tree_changes(
332 self,
333 source,
334 target,
335 want_unchanged=want_unchanged,
336 include_trees=include_trees,
337 change_type_same=change_type_same,
338 rename_detector=rename_detector,
339 paths=paths,
340 ):
341 yield (
342 (change.old.path, change.new.path),
343 (change.old.mode, change.new.mode),
344 (change.old.sha, change.new.sha),
345 )
346
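    # Illustrative sketch (not executed; ``store``, ``old_commit`` and
    # ``new_commit`` are assumptions): diffing the trees of two commits.
    #
    #   changes = store.tree_changes(old_commit.tree, new_commit.tree)
    #   for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes:
    #       print(oldpath, "->", newpath)
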
347 def iter_tree_contents(
348 self, tree_id: bytes, include_trees: bool = False
349 ) -> Iterator[tuple[bytes, int, bytes]]:
350 """Iterate the contents of a tree and all subtrees.
351
352 Iteration is depth-first pre-order, as in e.g. os.walk.
353
354 Args:
355 tree_id: SHA1 of the tree.
356 include_trees: If True, include tree objects in the iteration.
357 Returns: Iterator over TreeEntry namedtuples for all the objects in a
358 tree.
359 """
360 warnings.warn(
361 "Please use dulwich.object_store.iter_tree_contents",
362 DeprecationWarning,
363 stacklevel=2,
364 )
365 return iter_tree_contents(self, tree_id, include_trees=include_trees)
366
367 def iterobjects_subset(
368 self, shas: Iterable[bytes], *, allow_missing: bool = False
369 ) -> Iterator[ShaFile]:
370 """Iterate over a subset of objects in the store.
371
372 Args:
373 shas: Iterable of object SHAs to retrieve
374 allow_missing: If True, skip missing objects; if False, raise KeyError
375
376 Returns:
377 Iterator of ShaFile objects
378
379 Raises:
380 KeyError: If an object is missing and allow_missing is False
381 """
382 for sha in shas:
383 try:
384 yield self[sha]
385 except KeyError:
386 if not allow_missing:
387 raise
388
389 def find_missing_objects(
390 self,
391 haves: Iterable[bytes],
392 wants: Iterable[bytes],
393 shallow: Optional[set[bytes]] = None,
394 progress: Optional[Callable] = None,
395 get_tagged: Optional[Callable] = None,
396 get_parents: Callable = lambda commit: commit.parents,
397 ) -> Iterator[tuple[bytes, Optional[bytes]]]:
398 """Find the missing objects required for a set of revisions.
399
400 Args:
401 haves: Iterable over SHAs already in common.
402 wants: Iterable over SHAs of objects to fetch.
403 shallow: Set of shallow commit SHA1s to skip
404 progress: Simple progress function that will be called with
405 updated progress strings.
406 get_tagged: Function that returns a dict of pointed-to sha ->
407 tag sha for including tags.
408 get_parents: Optional function for getting the parents of a
409 commit.
410 Returns: Iterator over (sha, path) pairs.
411 """
        warnings.warn(
            "Please use MissingObjectFinder(store)",
            DeprecationWarning,
            stacklevel=2,
        )
413 finder = MissingObjectFinder(
414 self,
415 haves=haves,
416 wants=wants,
417 shallow=shallow,
418 progress=progress,
419 get_tagged=get_tagged,
420 get_parents=get_parents,
421 )
422 return iter(finder)
423
424 def find_common_revisions(self, graphwalker: GraphWalker) -> list[bytes]:
425 """Find which revisions this store has in common using graphwalker.
426
427 Args:
428 graphwalker: A graphwalker object.
429 Returns: List of SHAs that are in common
430 """
431 haves = []
432 sha = next(graphwalker)
433 while sha:
434 if sha in self:
435 haves.append(sha)
436 graphwalker.ack(sha)
437 sha = next(graphwalker)
438 return haves
439
440 def generate_pack_data(
441 self,
442 have: Iterable[bytes],
443 want: Iterable[bytes],
444 shallow: Optional[set[bytes]] = None,
445 progress: Optional[Callable] = None,
446 ofs_delta: bool = True,
447 ) -> tuple[int, Iterator[UnpackedObject]]:
448 """Generate pack data objects for a set of wants/haves.
449
450 Args:
451 have: List of SHA1s of objects that should not be sent
452 want: List of SHA1s of objects that should be sent
453 shallow: Set of shallow commit SHA1s to skip
454 ofs_delta: Whether OFS deltas can be included
455 progress: Optional progress reporting method
456 """
457 # Note that the pack-specific implementation below is more efficient,
458 # as it reuses deltas
459 missing_objects = MissingObjectFinder(
460 self, haves=have, wants=want, shallow=shallow, progress=progress
461 )
462 object_ids = list(missing_objects)
463 return pack_objects_to_data(
464 [(self[oid], path) for oid, path in object_ids],
465 ofs_delta=ofs_delta,
466 progress=progress,
467 )
468
469 def peel_sha(self, sha: bytes) -> bytes:
470 """Peel all tags from a SHA.
471
472 Args:
473 sha: The object SHA to peel.
474 Returns: The fully-peeled SHA1 of a tag object, after peeling all
475 intermediate tags; if the original ref does not point to a tag,
476 this will equal the original SHA1.
477 """
478 warnings.warn(
479 "Please use dulwich.object_store.peel_sha()",
480 DeprecationWarning,
481 stacklevel=2,
482 )
483 return peel_sha(self, sha)[1].id
484
485 def _get_depth(
486 self,
487 head: bytes,
488 get_parents: Callable = lambda commit: commit.parents,
489 max_depth: Optional[int] = None,
490 ) -> int:
491 """Return the current available depth for the given head.
492
493 For commits with multiple parents, the largest possible depth will be
494 returned.
495
496 Args:
497 head: commit to start from
498 get_parents: optional function for getting the parents of a commit
499 max_depth: maximum depth to search
500 """
501 return get_depth(self, head, get_parents=get_parents, max_depth=max_depth)
502
503 def close(self) -> None:
504 """Close any files opened by this object store."""
505 # Default implementation is a NO-OP
506
507 def prune(self, grace_period: Optional[int] = None) -> None:
508 """Prune/clean up this object store.
509
510 This includes removing orphaned temporary files and other
511 housekeeping tasks. Default implementation is a NO-OP.
512
513 Args:
514 grace_period: Grace period in seconds for removing temporary files.
515 If None, uses the default grace period.
516 """
517 # Default implementation is a NO-OP
518
519 def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]:
520 """Iterate over all SHA1s that start with a given prefix.
521
522 The default implementation is a naive iteration over all objects.
523 However, subclasses may override this method with more efficient
524 implementations.
525 """
526 for sha in self:
527 if sha.startswith(prefix):
528 yield sha
529
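    # Illustrative sketch (not executed; ``store`` is an assumption): resolving
    # an abbreviated hex SHA to a full object ID with iter_prefix().
    #
    #   matches = list(store.iter_prefix(b"deadbeef"))
    #   if len(matches) == 1:
    #       sha = matches[0]
    #   elif len(matches) > 1:
    #       raise KeyError("ambiguous prefix")
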
530 def get_commit_graph(self) -> Optional["CommitGraph"]:
531 """Get the commit graph for this object store.
532
533 Returns:
534 CommitGraph object if available, None otherwise
535 """
536 return None
537
538 def write_commit_graph(
539 self, refs: Optional[list[bytes]] = None, reachable: bool = True
540 ) -> None:
541 """Write a commit graph file for this object store.
542
543 Args:
544 refs: List of refs to include. If None, includes all refs from object store.
545 reachable: If True, includes all commits reachable from refs.
546 If False, only includes the direct ref targets.
547
        Note:
            The base implementation raises NotImplementedError. Subclasses
            should override this method to provide commit graph writing
            functionality.
        """
552 raise NotImplementedError(self.write_commit_graph)
553
554 def get_object_mtime(self, sha: bytes) -> float:
555 """Get the modification time of an object.
556
557 Args:
558 sha: SHA1 of the object
559
560 Returns:
561 Modification time as seconds since epoch
562
563 Raises:
564 KeyError: if the object is not found
565 """
566 # Default implementation raises KeyError
567 # Subclasses should override to provide actual mtime
568 raise KeyError(sha)
569
570
571class PackBasedObjectStore(BaseObjectStore, PackedObjectContainer):
572 """Object store that uses pack files for storage.
573
574 This class provides a base implementation for object stores that use
575 Git pack files as their primary storage mechanism. It handles caching
576 of open pack files and provides configuration for pack file operations.
577 """
578
579 def __init__(
580 self,
581 pack_compression_level: int = -1,
582 pack_index_version: Optional[int] = None,
583 pack_delta_window_size: Optional[int] = None,
584 pack_window_memory: Optional[int] = None,
585 pack_delta_cache_size: Optional[int] = None,
586 pack_depth: Optional[int] = None,
587 pack_threads: Optional[int] = None,
588 pack_big_file_threshold: Optional[int] = None,
589 ) -> None:
590 """Initialize a PackBasedObjectStore.
591
592 Args:
593 pack_compression_level: Compression level for pack files (-1 to 9)
594 pack_index_version: Pack index version to use
595 pack_delta_window_size: Window size for delta compression
596 pack_window_memory: Maximum memory to use for delta window
597 pack_delta_cache_size: Cache size for delta operations
598 pack_depth: Maximum depth for pack deltas
599 pack_threads: Number of threads to use for packing
600 pack_big_file_threshold: Threshold for treating files as "big"
601 """
602 self._pack_cache: dict[str, Pack] = {}
603 self.pack_compression_level = pack_compression_level
604 self.pack_index_version = pack_index_version
605 self.pack_delta_window_size = pack_delta_window_size
606 self.pack_window_memory = pack_window_memory
607 self.pack_delta_cache_size = pack_delta_cache_size
608 self.pack_depth = pack_depth
609 self.pack_threads = pack_threads
610 self.pack_big_file_threshold = pack_big_file_threshold
611
612 def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]:
613 """Add a new pack to this object store."""
614 raise NotImplementedError(self.add_pack)
615
616 def add_pack_data(
617 self,
618 count: int,
619 unpacked_objects: Iterator[UnpackedObject],
620 progress: Optional[Callable] = None,
621 ) -> Optional["Pack"]:
622 """Add pack data to this object store.
623
624 Args:
625 count: Number of items to add
626 unpacked_objects: Iterator of UnpackedObject instances
627 progress: Optional progress callback
628 """
629 if count == 0:
630 # Don't bother writing an empty pack file
631 return None
632 f, commit, abort = self.add_pack()
633 try:
634 write_pack_data(
635 f.write,
636 unpacked_objects,
637 num_records=count,
638 progress=progress,
639 compression_level=self.pack_compression_level,
640 )
641 except BaseException:
642 abort()
643 raise
644 else:
645 return commit()
646
647 @property
648 def alternates(self) -> list:
649 """Return list of alternate object stores."""
650 return []
651
652 def contains_packed(self, sha: bytes) -> bool:
653 """Check if a particular object is present by SHA1 and is packed.
654
655 This does not check alternates.
656 """
657 for pack in self.packs:
658 try:
659 if sha in pack:
660 return True
661 except PackFileDisappeared:
662 pass
663 return False
664
665 def __contains__(self, sha) -> bool:
666 """Check if a particular object is present by SHA1.
667
668 This method makes no distinction between loose and packed objects.
669 """
670 if self.contains_packed(sha) or self.contains_loose(sha):
671 return True
672 for alternate in self.alternates:
673 if sha in alternate:
674 return True
675 return False
676
677 def _add_cached_pack(self, base_name: str, pack: Pack) -> None:
678 """Add a newly appeared pack to the cache by path."""
679 prev_pack = self._pack_cache.get(base_name)
680 if prev_pack is not pack:
681 self._pack_cache[base_name] = pack
682 if prev_pack:
683 prev_pack.close()
684
685 def generate_pack_data(
686 self, have, want, shallow=None, progress=None, ofs_delta=True
687 ) -> tuple[int, Iterator[UnpackedObject]]:
688 """Generate pack data objects for a set of wants/haves.
689
690 Args:
691 have: List of SHA1s of objects that should not be sent
692 want: List of SHA1s of objects that should be sent
693 shallow: Set of shallow commit SHA1s to skip
694 ofs_delta: Whether OFS deltas can be included
695 progress: Optional progress reporting method
696 """
697 missing_objects = MissingObjectFinder(
698 self, haves=have, wants=want, shallow=shallow, progress=progress
699 )
700 remote_has = missing_objects.get_remote_has()
701 object_ids = list(missing_objects)
702 return len(object_ids), generate_unpacked_objects(
703 self,
704 object_ids,
705 progress=progress,
706 ofs_delta=ofs_delta,
707 other_haves=remote_has,
708 )
709
710 def _clear_cached_packs(self) -> None:
711 pack_cache = self._pack_cache
712 self._pack_cache = {}
713 while pack_cache:
714 (name, pack) = pack_cache.popitem()
715 pack.close()
716
717 def _iter_cached_packs(self) -> Iterator[Pack]:
718 return iter(self._pack_cache.values())
719
720 def _update_pack_cache(self) -> list[Pack]:
721 raise NotImplementedError(self._update_pack_cache)
722
723 def close(self) -> None:
724 """Close the object store and release resources.
725
726 This method closes all cached pack files and frees associated resources.
727 """
728 self._clear_cached_packs()
729
730 @property
731 def packs(self) -> list[Pack]:
732 """List with pack objects."""
733 return list(self._iter_cached_packs()) + list(self._update_pack_cache())
734
735 def count_pack_files(self) -> int:
736 """Count the number of pack files.
737
738 Returns:
739 Number of pack files (excluding those with .keep files)
740 """
741 count = 0
742 for pack in self.packs:
743 # Check if there's a .keep file for this pack
744 keep_path = pack._basename + ".keep"
745 if not os.path.exists(keep_path):
746 count += 1
747 return count
748
749 def _iter_alternate_objects(self) -> Iterator[bytes]:
750 """Iterate over the SHAs of all the objects in alternate stores."""
751 for alternate in self.alternates:
752 yield from alternate
753
754 def _iter_loose_objects(self) -> Iterator[bytes]:
755 """Iterate over the SHAs of all loose objects."""
756 raise NotImplementedError(self._iter_loose_objects)
757
758 def _get_loose_object(self, sha: bytes) -> Optional[ShaFile]:
759 raise NotImplementedError(self._get_loose_object)
760
761 def delete_loose_object(self, sha: bytes) -> None:
762 """Delete a loose object.
763
764 This method only handles loose objects. For packed objects,
765 use repack(exclude=...) to exclude them during repacking.
766 """
767 raise NotImplementedError(self.delete_loose_object)
768
769 def _remove_pack(self, pack: "Pack") -> None:
770 raise NotImplementedError(self._remove_pack)
771
772 def pack_loose_objects(self) -> int:
773 """Pack loose objects.
774
775 Returns: Number of objects packed
776 """
777 objects: list[tuple[ShaFile, None]] = []
778 for sha in self._iter_loose_objects():
779 obj = self._get_loose_object(sha)
780 if obj is not None:
781 objects.append((obj, None))
782 self.add_objects(objects)
783 for obj, path in objects:
784 self.delete_loose_object(obj.id)
785 return len(objects)
786
787 def repack(self, exclude: Optional[set] = None) -> int:
788 """Repack the packs in this repository.
789
790 Note that this implementation is fairly naive and currently keeps all
791 objects in memory while it repacks.
792
793 Args:
794 exclude: Optional set of object SHAs to exclude from repacking
795 """
796 if exclude is None:
797 exclude = set()
798
799 loose_objects = set()
800 excluded_loose_objects = set()
801 for sha in self._iter_loose_objects():
802 if sha not in exclude:
803 obj = self._get_loose_object(sha)
804 if obj is not None:
805 loose_objects.add(obj)
806 else:
807 excluded_loose_objects.add(sha)
808
809 objects: set[tuple[ShaFile, None]] = {(obj, None) for obj in loose_objects}
810 old_packs = {p.name(): p for p in self.packs}
811 for name, pack in old_packs.items():
812 objects.update(
813 (obj, None) for obj in pack.iterobjects() if obj.id not in exclude
814 )
815
816 # Only create a new pack if there are objects to pack
817 if objects:
818 # The name of the consolidated pack might match the name of a
819 # pre-existing pack. Take care not to remove the newly created
820 # consolidated pack.
821 consolidated = self.add_objects(list(objects))
822 if consolidated is not None:
823 old_packs.pop(consolidated.name(), None)
824
825 # Delete loose objects that were packed
826 for obj in loose_objects:
827 if obj is not None:
828 self.delete_loose_object(obj.id)
829 # Delete excluded loose objects
830 for sha in excluded_loose_objects:
831 self.delete_loose_object(sha)
832 for name, pack in old_packs.items():
833 self._remove_pack(pack)
834 self._update_pack_cache()
835 return len(objects)
836
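    # Illustrative sketch (not executed; ``store`` and ``unwanted_sha`` are
    # assumptions): packing loose objects and consolidating all packs, while
    # leaving one object out of the new pack.
    #
    #   store.pack_loose_objects()
    #   store.repack(exclude={unwanted_sha})
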
837 def __iter__(self):
838 """Iterate over the SHAs that are present in this store."""
839 self._update_pack_cache()
840 for pack in self._iter_cached_packs():
841 try:
842 yield from pack
843 except PackFileDisappeared:
844 pass
845 yield from self._iter_loose_objects()
846 yield from self._iter_alternate_objects()
847
848 def contains_loose(self, sha):
849 """Check if a particular object is present by SHA1 and is loose.
850
851 This does not check alternates.
852 """
853 return self._get_loose_object(sha) is not None
854
855 def get_raw(self, name):
856 """Obtain the raw fulltext for an object.
857
858 Args:
859 name: sha for the object.
860 Returns: tuple with numeric type and object contents.
861 """
862 if name == ZERO_SHA:
863 raise KeyError(name)
864 if len(name) == 40:
865 sha = hex_to_sha(name)
866 hexsha = name
867 elif len(name) == 20:
868 sha = name
869 hexsha = None
870 else:
871 raise AssertionError(f"Invalid object name {name!r}")
872 for pack in self._iter_cached_packs():
873 try:
874 return pack.get_raw(sha)
875 except (KeyError, PackFileDisappeared):
876 pass
877 if hexsha is None:
878 hexsha = sha_to_hex(name)
879 ret = self._get_loose_object(hexsha)
880 if ret is not None:
881 return ret.type_num, ret.as_raw_string()
        # Maybe something else has added a pack with the object
        # in the meantime?
884 for pack in self._update_pack_cache():
885 try:
886 return pack.get_raw(sha)
887 except KeyError:
888 pass
889 for alternate in self.alternates:
890 try:
891 return alternate.get_raw(hexsha)
892 except KeyError:
893 pass
894 raise KeyError(hexsha)
895
896 def iter_unpacked_subset(
897 self,
898 shas: set[bytes],
899 include_comp: bool = False,
900 allow_missing: bool = False,
901 convert_ofs_delta: bool = True,
902 ) -> Iterator[UnpackedObject]:
903 """Iterate over a subset of objects, yielding UnpackedObject instances.
904
905 Args:
906 shas: Set of object SHAs to retrieve
907 include_comp: Whether to include compressed data
908 allow_missing: If True, skip missing objects; if False, raise KeyError
909 convert_ofs_delta: Whether to convert OFS_DELTA objects
910
911 Returns:
912 Iterator of UnpackedObject instances
913
914 Raises:
915 KeyError: If an object is missing and allow_missing is False
916 """
917 todo: set[bytes] = set(shas)
918 for p in self._iter_cached_packs():
919 for unpacked in p.iter_unpacked_subset(
920 todo,
921 include_comp=include_comp,
922 allow_missing=True,
923 convert_ofs_delta=convert_ofs_delta,
924 ):
925 yield unpacked
926 hexsha = sha_to_hex(unpacked.sha())
927 todo.remove(hexsha)
        # Maybe something else has added a pack with the object
        # in the meantime?
930 for p in self._update_pack_cache():
931 for unpacked in p.iter_unpacked_subset(
932 todo,
933 include_comp=include_comp,
934 allow_missing=True,
935 convert_ofs_delta=convert_ofs_delta,
936 ):
937 yield unpacked
938 hexsha = sha_to_hex(unpacked.sha())
939 todo.remove(hexsha)
940 for alternate in self.alternates:
941 for unpacked in alternate.iter_unpacked_subset(
942 todo,
943 include_comp=include_comp,
944 allow_missing=True,
945 convert_ofs_delta=convert_ofs_delta,
946 ):
947 yield unpacked
948 hexsha = sha_to_hex(unpacked.sha())
949 todo.remove(hexsha)
950
951 def iterobjects_subset(
952 self, shas: Iterable[bytes], *, allow_missing: bool = False
953 ) -> Iterator[ShaFile]:
954 """Iterate over a subset of objects in the store.
955
956 This method searches for objects in pack files, alternates, and loose storage.
957
958 Args:
959 shas: Iterable of object SHAs to retrieve
960 allow_missing: If True, skip missing objects; if False, raise KeyError
961
962 Returns:
963 Iterator of ShaFile objects
964
965 Raises:
966 KeyError: If an object is missing and allow_missing is False
967 """
968 todo: set[bytes] = set(shas)
969 for p in self._iter_cached_packs():
970 for o in p.iterobjects_subset(todo, allow_missing=True):
971 yield o
972 todo.remove(o.id)
        # Maybe something else has added a pack with the object
        # in the meantime?
975 for p in self._update_pack_cache():
976 for o in p.iterobjects_subset(todo, allow_missing=True):
977 yield o
978 todo.remove(o.id)
979 for alternate in self.alternates:
980 for o in alternate.iterobjects_subset(todo, allow_missing=True):
981 yield o
982 todo.remove(o.id)
983 for oid in todo:
984 loose_obj: Optional[ShaFile] = self._get_loose_object(oid)
985 if loose_obj is not None:
986 yield loose_obj
987 elif not allow_missing:
988 raise KeyError(oid)
989
990 def get_unpacked_object(
991 self, sha1: bytes, *, include_comp: bool = False
992 ) -> UnpackedObject:
993 """Obtain the unpacked object.
994
995 Args:
996 sha1: sha for the object.
997 include_comp: Whether to include compression metadata.
998 """
999 if sha1 == ZERO_SHA:
1000 raise KeyError(sha1)
1001 if len(sha1) == 40:
1002 sha = hex_to_sha(sha1)
1003 hexsha = sha1
1004 elif len(sha1) == 20:
1005 sha = sha1
1006 hexsha = None
1007 else:
1008 raise AssertionError(f"Invalid object sha1 {sha1!r}")
1009 for pack in self._iter_cached_packs():
1010 try:
1011 return pack.get_unpacked_object(sha, include_comp=include_comp)
1012 except (KeyError, PackFileDisappeared):
1013 pass
1014 if hexsha is None:
1015 hexsha = sha_to_hex(sha1)
        # Maybe something else has added a pack with the object
        # in the meantime?
1018 for pack in self._update_pack_cache():
1019 try:
1020 return pack.get_unpacked_object(sha, include_comp=include_comp)
1021 except KeyError:
1022 pass
1023 for alternate in self.alternates:
1024 try:
1025 return alternate.get_unpacked_object(hexsha, include_comp=include_comp)
1026 except KeyError:
1027 pass
1028 raise KeyError(hexsha)
1029
1030 def add_objects(
1031 self,
1032 objects: Sequence[tuple[ShaFile, Optional[str]]],
1033 progress: Optional[Callable[[str], None]] = None,
1034 ) -> Optional["Pack"]:
1035 """Add a set of objects to this object store.
1036
1037 Args:
1038 objects: Iterable over (object, path) tuples, should support
1039 __len__.
1040 progress: Optional progress reporting function.
1041 Returns: Pack object of the objects written.
1042 """
1043 count = len(objects)
1044 record_iter = (full_unpacked_object(o) for (o, p) in objects)
1045 return self.add_pack_data(count, record_iter, progress=progress)
1046
1047
1048class DiskObjectStore(PackBasedObjectStore):
1049 """Git-style object store that exists on disk."""
1050
1051 path: Union[str, os.PathLike]
1052 pack_dir: Union[str, os.PathLike]
1053 _alternates: Optional[list["DiskObjectStore"]]
1054 _commit_graph: Optional["CommitGraph"]
1055
1056 def __init__(
1057 self,
1058 path: Union[str, os.PathLike],
1059 loose_compression_level=-1,
1060 pack_compression_level=-1,
1061 pack_index_version=None,
1062 pack_delta_window_size=None,
1063 pack_window_memory=None,
1064 pack_delta_cache_size=None,
1065 pack_depth=None,
1066 pack_threads=None,
1067 pack_big_file_threshold=None,
1068 ) -> None:
1069 """Open an object store.
1070
1071 Args:
1072 path: Path of the object store.
1073 loose_compression_level: zlib compression level for loose objects
1074 pack_compression_level: zlib compression level for pack objects
1075 pack_index_version: pack index version to use (1, 2, or 3)
1076 pack_delta_window_size: sliding window size for delta compression
1077 pack_window_memory: memory limit for delta window operations
1078 pack_delta_cache_size: size of cache for delta operations
1079 pack_depth: maximum delta chain depth
1080 pack_threads: number of threads for pack operations
1081 pack_big_file_threshold: threshold for treating files as big
1082 """
1083 super().__init__(
1084 pack_compression_level=pack_compression_level,
1085 pack_index_version=pack_index_version,
1086 pack_delta_window_size=pack_delta_window_size,
1087 pack_window_memory=pack_window_memory,
1088 pack_delta_cache_size=pack_delta_cache_size,
1089 pack_depth=pack_depth,
1090 pack_threads=pack_threads,
1091 pack_big_file_threshold=pack_big_file_threshold,
1092 )
1093 self.path = path
1094 self.pack_dir = os.path.join(self.path, PACKDIR)
1095 self._alternates = None
1096 self.loose_compression_level = loose_compression_level
1097 self.pack_compression_level = pack_compression_level
1098 self.pack_index_version = pack_index_version
1099
1100 # Commit graph support - lazy loaded
1101 self._commit_graph = None
1102 self._use_commit_graph = True # Default to true
1103
1104 def __repr__(self) -> str:
1105 """Return string representation of DiskObjectStore.
1106
1107 Returns:
1108 String representation including the store path
1109 """
1110 return f"<{self.__class__.__name__}({self.path!r})>"
1111
1112 @classmethod
1113 def from_config(cls, path: Union[str, os.PathLike], config):
1114 """Create a DiskObjectStore from a configuration object.
1115
1116 Args:
1117 path: Path to the object store directory
1118 config: Configuration object to read settings from
1119
1120 Returns:
1121 New DiskObjectStore instance configured according to config
1122 """
1123 try:
1124 default_compression_level = int(
1125 config.get((b"core",), b"compression").decode()
1126 )
1127 except KeyError:
1128 default_compression_level = -1
1129 try:
1130 loose_compression_level = int(
1131 config.get((b"core",), b"looseCompression").decode()
1132 )
1133 except KeyError:
1134 loose_compression_level = default_compression_level
1135 try:
1136 pack_compression_level = int(
                config.get((b"core",), b"packCompression").decode()
1138 )
1139 except KeyError:
1140 pack_compression_level = default_compression_level
1141 try:
1142 pack_index_version = int(config.get((b"pack",), b"indexVersion").decode())
1143 except KeyError:
1144 pack_index_version = None
1145
1146 # Read pack configuration options
1147 try:
1148 pack_delta_window_size = int(
1149 config.get((b"pack",), b"deltaWindowSize").decode()
1150 )
1151 except KeyError:
1152 pack_delta_window_size = None
1153 try:
1154 pack_window_memory = int(config.get((b"pack",), b"windowMemory").decode())
1155 except KeyError:
1156 pack_window_memory = None
1157 try:
1158 pack_delta_cache_size = int(
1159 config.get((b"pack",), b"deltaCacheSize").decode()
1160 )
1161 except KeyError:
1162 pack_delta_cache_size = None
1163 try:
1164 pack_depth = int(config.get((b"pack",), b"depth").decode())
1165 except KeyError:
1166 pack_depth = None
1167 try:
1168 pack_threads = int(config.get((b"pack",), b"threads").decode())
1169 except KeyError:
1170 pack_threads = None
1171 try:
1172 pack_big_file_threshold = int(
1173 config.get((b"pack",), b"bigFileThreshold").decode()
1174 )
1175 except KeyError:
1176 pack_big_file_threshold = None
1177
1178 # Read core.commitGraph setting
1179 use_commit_graph = config.get_boolean((b"core",), b"commitGraph", True)
1180
1181 instance = cls(
1182 path,
1183 loose_compression_level,
1184 pack_compression_level,
1185 pack_index_version,
1186 pack_delta_window_size,
1187 pack_window_memory,
1188 pack_delta_cache_size,
1189 pack_depth,
1190 pack_threads,
1191 pack_big_file_threshold,
1192 )
1193 instance._use_commit_graph = use_commit_graph
1194 return instance
1195
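    # Illustrative sketch (not executed; the paths are assumptions): building a
    # DiskObjectStore whose pack settings come from a repository's config file.
    #
    #   from dulwich.config import ConfigFile
    #
    #   config = ConfigFile.from_path("/path/to/repo/.git/config")
    #   store = DiskObjectStore.from_config(
    #       "/path/to/repo/.git/objects", config
    #   )
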
1196 @property
1197 def alternates(self):
1198 """Get the list of alternate object stores.
1199
1200 Reads from .git/objects/info/alternates if not already cached.
1201
1202 Returns:
1203 List of DiskObjectStore instances for alternate object directories
1204 """
1205 if self._alternates is not None:
1206 return self._alternates
1207 self._alternates = []
1208 for path in self._read_alternate_paths():
1209 self._alternates.append(DiskObjectStore(path))
1210 return self._alternates
1211
1212 def _read_alternate_paths(self):
1213 try:
1214 f = GitFile(os.path.join(self.path, INFODIR, "alternates"), "rb")
1215 except FileNotFoundError:
1216 return
1217 with f:
1218 for line in f.readlines():
1219 line = line.rstrip(b"\n")
1220 if line.startswith(b"#"):
1221 continue
1222 if os.path.isabs(line):
1223 yield os.fsdecode(line)
1224 else:
1225 yield os.fsdecode(os.path.join(os.fsencode(self.path), line))
1226
1227 def add_alternate_path(self, path) -> None:
1228 """Add an alternate path to this object store."""
1229 try:
1230 os.mkdir(os.path.join(self.path, INFODIR))
1231 except FileExistsError:
1232 pass
1233 alternates_path = os.path.join(self.path, INFODIR, "alternates")
1234 with GitFile(alternates_path, "wb") as f:
1235 try:
1236 orig_f = open(alternates_path, "rb")
1237 except FileNotFoundError:
1238 pass
1239 else:
1240 with orig_f:
1241 f.write(orig_f.read())
1242 f.write(os.fsencode(path) + b"\n")
1243
1244 if not os.path.isabs(path):
1245 path = os.path.join(self.path, path)
1246 self.alternates.append(DiskObjectStore(path))
1247
1248 def _update_pack_cache(self):
1249 """Read and iterate over new pack files and cache them."""
1250 try:
1251 pack_dir_contents = os.listdir(self.pack_dir)
1252 except FileNotFoundError:
1253 self.close()
1254 return []
1255 pack_files = set()
1256 for name in pack_dir_contents:
1257 if name.startswith("pack-") and name.endswith(".pack"):
1258 # verify that idx exists first (otherwise the pack was not yet
1259 # fully written)
1260 idx_name = os.path.splitext(name)[0] + ".idx"
1261 if idx_name in pack_dir_contents:
1262 pack_name = name[: -len(".pack")]
1263 pack_files.add(pack_name)
1264
1265 # Open newly appeared pack files
1266 new_packs = []
1267 for f in pack_files:
1268 if f not in self._pack_cache:
1269 pack = Pack(
1270 os.path.join(self.pack_dir, f),
1271 delta_window_size=self.pack_delta_window_size,
1272 window_memory=self.pack_window_memory,
1273 delta_cache_size=self.pack_delta_cache_size,
1274 depth=self.pack_depth,
1275 threads=self.pack_threads,
1276 big_file_threshold=self.pack_big_file_threshold,
1277 )
1278 new_packs.append(pack)
1279 self._pack_cache[f] = pack
1280 # Remove disappeared pack files
1281 for f in set(self._pack_cache) - pack_files:
1282 self._pack_cache.pop(f).close()
1283 return new_packs
1284
1285 def _get_shafile_path(self, sha):
1286 # Check from object dir
1287 return hex_to_filename(os.fspath(self.path), sha)
1288
1289 def _iter_loose_objects(self):
1290 for base in os.listdir(self.path):
1291 if len(base) != 2:
1292 continue
1293 for rest in os.listdir(os.path.join(self.path, base)):
1294 sha = os.fsencode(base + rest)
1295 if not valid_hexsha(sha):
1296 continue
1297 yield sha
1298
1299 def count_loose_objects(self) -> int:
1300 """Count the number of loose objects in the object store.
1301
1302 Returns:
1303 Number of loose objects
1304 """
1305 count = 0
1306 if not os.path.exists(self.path):
1307 return 0
1308
1309 for i in range(256):
1310 subdir = os.path.join(self.path, f"{i:02x}")
1311 try:
1312 count += len(
1313 [
1314 name
1315 for name in os.listdir(subdir)
1316 if len(name) == 38 # 40 - 2 for the prefix
1317 ]
1318 )
1319 except FileNotFoundError:
1320 # Directory may have been removed or is inaccessible
1321 continue
1322
1323 return count
1324
1325 def _get_loose_object(self, sha):
1326 path = self._get_shafile_path(sha)
1327 try:
1328 return ShaFile.from_path(path)
1329 except FileNotFoundError:
1330 return None
1331
1332 def delete_loose_object(self, sha) -> None:
1333 """Delete a loose object from disk.
1334
1335 Args:
1336 sha: SHA1 of the object to delete
1337
1338 Raises:
1339 FileNotFoundError: If the object file doesn't exist
1340 """
1341 os.remove(self._get_shafile_path(sha))
1342
1343 def get_object_mtime(self, sha):
1344 """Get the modification time of an object.
1345
1346 Args:
1347 sha: SHA1 of the object
1348
1349 Returns:
1350 Modification time as seconds since epoch
1351
1352 Raises:
1353 KeyError: if the object is not found
1354 """
1355 # First check if it's a loose object
1356 if self.contains_loose(sha):
1357 path = self._get_shafile_path(sha)
1358 try:
1359 return os.path.getmtime(path)
1360 except FileNotFoundError:
1361 pass
1362
1363 # Check if it's in a pack file
1364 for pack in self.packs:
1365 try:
1366 if sha in pack:
1367 # Use the pack file's mtime for packed objects
1368 pack_path = pack._data_path
1369 try:
1370 return os.path.getmtime(pack_path)
1371 except (FileNotFoundError, AttributeError):
1372 pass
1373 except PackFileDisappeared:
1374 pass
1375
1376 raise KeyError(sha)
1377
1378 def _remove_pack(self, pack) -> None:
1379 try:
1380 del self._pack_cache[os.path.basename(pack._basename)]
1381 except KeyError:
1382 pass
1383 pack.close()
1384 os.remove(pack.data.path)
1385 os.remove(pack.index.path)
1386
1387 def _get_pack_basepath(self, entries):
1388 suffix_bytes = iter_sha1(entry[0] for entry in entries)
1389 # TODO: Handle self.pack_dir being bytes
1390 suffix = suffix_bytes.decode("ascii")
1391 return os.path.join(self.pack_dir, "pack-" + suffix)
1392
1393 def _complete_pack(self, f, path, num_objects, indexer, progress=None):
1394 """Move a specific file containing a pack into the pack directory.
1395
1396 Note: The file should be on the same file system as the
1397 packs directory.
1398
1399 Args:
1400 f: Open file object for the pack.
1401 path: Path to the pack file.
1402 num_objects: Number of objects in the pack.
1403 indexer: A PackIndexer for indexing the pack.
1404 progress: Optional progress reporting function.
1405 """
1406 entries = []
1407 for i, entry in enumerate(indexer):
1408 if progress is not None:
1409 progress(f"generating index: {i}/{num_objects}\r".encode("ascii"))
1410 entries.append(entry)
1411
1412 pack_sha, extra_entries = extend_pack(
1413 f,
1414 indexer.ext_refs(),
1415 get_raw=self.get_raw,
1416 compression_level=self.pack_compression_level,
1417 progress=progress,
1418 )
1419 f.flush()
1420 try:
1421 fileno = f.fileno()
1422 except AttributeError:
1423 pass
1424 else:
1425 os.fsync(fileno)
1426 f.close()
1427
1428 entries.extend(extra_entries)
1429
1430 # Move the pack in.
1431 entries.sort()
1432 pack_base_name = self._get_pack_basepath(entries)
1433
1434 for pack in self.packs:
1435 if pack._basename == pack_base_name:
1436 return pack
1437
1438 target_pack_path = pack_base_name + ".pack"
1439 target_index_path = pack_base_name + ".idx"
1440 if sys.platform == "win32":
1441 # Windows might have the target pack file lingering. Attempt
1442 # removal, silently passing if the target does not exist.
1443 with suppress(FileNotFoundError):
1444 os.remove(target_pack_path)
1445 os.rename(path, target_pack_path)
1446
1447 # Write the index.
1448 with GitFile(target_index_path, "wb", mask=PACK_MODE) as index_file:
1449 write_pack_index(
1450 index_file, entries, pack_sha, version=self.pack_index_version
1451 )
1452
1453 # Add the pack to the store and return it.
1454 final_pack = Pack(
1455 pack_base_name,
1456 delta_window_size=self.pack_delta_window_size,
1457 window_memory=self.pack_window_memory,
1458 delta_cache_size=self.pack_delta_cache_size,
1459 depth=self.pack_depth,
1460 threads=self.pack_threads,
1461 big_file_threshold=self.pack_big_file_threshold,
1462 )
1463 final_pack.check_length_and_checksum()
1464 self._add_cached_pack(pack_base_name, final_pack)
1465 return final_pack
1466
1467 def add_thin_pack(self, read_all, read_some, progress=None):
1468 """Add a new thin pack to this object store.
1469
        Thin packs are packs that contain deltas with parents that exist
        outside the pack. They should never be placed in the object store
        directly, and should always be indexed and completed as they are
        copied.
1473
1474 Args:
1475 read_all: Read function that blocks until the number of
1476 requested bytes are read.
1477 read_some: Read function that returns at least one byte, but may
1478 not return the number of bytes requested.
1479 progress: Optional progress reporting function.
1480 Returns: A Pack object pointing at the now-completed thin pack in the
1481 objects/pack directory.
1482 """
1483 import tempfile
1484
1485 fd, path = tempfile.mkstemp(dir=self.path, prefix="tmp_pack_")
1486 with os.fdopen(fd, "w+b") as f:
1487 os.chmod(path, PACK_MODE)
1488 indexer = PackIndexer(f, resolve_ext_ref=self.get_raw)
1489 copier = PackStreamCopier(read_all, read_some, f, delta_iter=indexer)
1490 copier.verify(progress=progress)
1491 return self._complete_pack(f, path, len(copier), indexer, progress=progress)
1492
1493 def add_pack(self):
1494 """Add a new pack to this object store.
1495
        Returns: Fileobject to write to, a commit function to call when
            the pack is finished, and an abort function.
1499 """
1500 import tempfile
1501
1502 fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
1503 f = os.fdopen(fd, "w+b")
1504 os.chmod(path, PACK_MODE)
1505
1506 def commit():
1507 if f.tell() > 0:
1508 f.seek(0)
1509
1510 with PackData(path, f) as pd:
1511 indexer = PackIndexer.for_pack_data(
1512 pd, resolve_ext_ref=self.get_raw
1513 )
1514 return self._complete_pack(f, path, len(pd), indexer)
1515 else:
1516 f.close()
1517 os.remove(path)
1518 return None
1519
1520 def abort() -> None:
1521 f.close()
1522 os.remove(path)
1523
1524 return f, commit, abort
1525
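    # Illustrative sketch (not executed; ``store`` and ``pack_bytes`` are
    # assumptions): the add_pack() protocol. Callers write a complete pack
    # stream to the returned file object, then either commit or abort.
    #
    #   f, commit, abort = store.add_pack()
    #   try:
    #       f.write(pack_bytes)
    #   except BaseException:
    #       abort()
    #       raise
    #   else:
    #       commit()
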
1526 def add_object(self, obj) -> None:
1527 """Add a single object to this object store.
1528
1529 Args:
1530 obj: Object to add
1531 """
1532 path = self._get_shafile_path(obj.id)
1533 dir = os.path.dirname(path)
1534 try:
1535 os.mkdir(dir)
1536 except FileExistsError:
1537 pass
1538 if os.path.exists(path):
1539 return # Already there, no need to write again
1540 with GitFile(path, "wb", mask=PACK_MODE) as f:
1541 f.write(
1542 obj.as_legacy_object(compression_level=self.loose_compression_level)
1543 )
1544
1545 @classmethod
1546 def init(cls, path: Union[str, os.PathLike]):
1547 """Initialize a new disk object store.
1548
1549 Creates the necessary directory structure for a Git object store.
1550
1551 Args:
1552 path: Path where the object store should be created
1553
1554 Returns:
1555 New DiskObjectStore instance
1556 """
1557 try:
1558 os.mkdir(path)
1559 except FileExistsError:
1560 pass
1561 os.mkdir(os.path.join(path, "info"))
1562 os.mkdir(os.path.join(path, PACKDIR))
1563 return cls(path)
1564
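    # Illustrative sketch (not executed; the target directory is an
    # assumption): initializing a store on disk and round-tripping a loose
    # object through it.
    #
    #   store = DiskObjectStore.init("/tmp/example-objects")
    #   blob = Blob.from_string(b"hello world")
    #   store.add_object(blob)
    #   assert store[blob.id].as_raw_string() == b"hello world"
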
1565 def iter_prefix(self, prefix):
1566 """Iterate over all object SHAs with the given prefix.
1567
1568 Args:
1569 prefix: Hex prefix to search for (as bytes)
1570
1571 Returns:
1572 Iterator of object SHAs (as bytes) matching the prefix
1573 """
1574 if len(prefix) < 2:
1575 yield from super().iter_prefix(prefix)
1576 return
1577 seen = set()
1578 dir = prefix[:2].decode()
1579 rest = prefix[2:].decode()
1580 try:
1581 for name in os.listdir(os.path.join(self.path, dir)):
1582 if name.startswith(rest):
1583 sha = os.fsencode(dir + name)
1584 if sha not in seen:
1585 seen.add(sha)
1586 yield sha
1587 except FileNotFoundError:
1588 pass
1589
1590 for p in self.packs:
1591 bin_prefix = (
1592 binascii.unhexlify(prefix)
1593 if len(prefix) % 2 == 0
1594 else binascii.unhexlify(prefix[:-1])
1595 )
1596 for sha in p.index.iter_prefix(bin_prefix):
1597 sha = sha_to_hex(sha)
1598 if sha.startswith(prefix) and sha not in seen:
1599 seen.add(sha)
1600 yield sha
1601 for alternate in self.alternates:
1602 for sha in alternate.iter_prefix(prefix):
1603 if sha not in seen:
1604 seen.add(sha)
1605 yield sha
1606
1607 def get_commit_graph(self):
1608 """Get the commit graph for this object store.
1609
1610 Returns:
1611 CommitGraph object if available, None otherwise
1612 """
1613 if not self._use_commit_graph:
1614 return None
1615
1616 if self._commit_graph is None:
1617 from .commit_graph import read_commit_graph
1618
1619 # Look for commit graph in our objects directory
1620 graph_file = os.path.join(self.path, "info", "commit-graph")
1621 if os.path.exists(graph_file):
1622 self._commit_graph = read_commit_graph(graph_file)
1623 return self._commit_graph
1624
1625 def write_commit_graph(self, refs=None, reachable=True) -> None:
1626 """Write a commit graph file for this object store.
1627
1628 Args:
1629 refs: List of refs to include. If None, includes all refs from object store.
1630 reachable: If True, includes all commits reachable from refs.
1631 If False, only includes the direct ref targets.
1632 """
1633 from .commit_graph import get_reachable_commits
1634
1635 if refs is None:
1636 # Get all commit objects from the object store
1637 all_refs = []
1638 # Iterate through all objects to find commits
1639 for sha in self:
1640 try:
1641 obj = self[sha]
1642 if obj.type_name == b"commit":
1643 all_refs.append(sha)
1644 except KeyError:
1645 continue
1646 else:
1647 # Use provided refs
1648 all_refs = refs
1649
1650 if not all_refs:
1651 return # No commits to include
1652
1653 if reachable:
1654 # Get all reachable commits
1655 commit_ids = get_reachable_commits(self, all_refs)
1656 else:
1657 # Just use the direct ref targets - ensure they're hex ObjectIDs
1658 commit_ids = []
1659 for ref in all_refs:
1660 if isinstance(ref, bytes) and len(ref) == 40:
1661 # Already hex ObjectID
1662 commit_ids.append(ref)
1663 elif isinstance(ref, bytes) and len(ref) == 20:
1664 # Binary SHA, convert to hex ObjectID
1665 from .objects import sha_to_hex
1666
1667 commit_ids.append(sha_to_hex(ref))
1668 else:
1669 # Assume it's already correct format
1670 commit_ids.append(ref)
1671
1672 if commit_ids:
1673 # Write commit graph directly to our object store path
1674 # Generate the commit graph
1675 from .commit_graph import generate_commit_graph
1676
1677 graph = generate_commit_graph(self, commit_ids)
1678
1679 if graph.entries:
1680 # Ensure the info directory exists
1681 info_dir = os.path.join(self.path, "info")
1682 os.makedirs(info_dir, exist_ok=True)
1683
1684 # Write using GitFile for atomic operation
1685 graph_path = os.path.join(info_dir, "commit-graph")
1686 with GitFile(graph_path, "wb") as f:
1687 assert isinstance(
1688 f, _GitFile
1689 ) # GitFile in write mode always returns _GitFile
1690 graph.write_to_file(f)
1691
1692 # Clear cached commit graph so it gets reloaded
1693 self._commit_graph = None
1694
1695 def prune(self, grace_period: Optional[int] = None) -> None:
1696 """Prune/clean up this object store.
1697
1698 This removes temporary files that were left behind by interrupted
1699 pack operations. These are files that start with ``tmp_pack_`` in the
1700 repository directory or files with .pack extension but no corresponding
1701 .idx file in the pack directory.
1702
1703 Args:
1704 grace_period: Grace period in seconds for removing temporary files.
1705 If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD.
1706 """
1707 import glob
1708
1709 if grace_period is None:
1710 grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD
1711
1712 # Clean up tmp_pack_* files in the repository directory
1713 for tmp_file in glob.glob(os.path.join(self.path, "tmp_pack_*")):
1714 # Check if file is old enough (more than grace period)
1715 mtime = os.path.getmtime(tmp_file)
1716 if time.time() - mtime > grace_period:
1717 os.remove(tmp_file)
1718
1719 # Clean up orphaned .pack files without corresponding .idx files
1720 try:
1721 pack_dir_contents = os.listdir(self.pack_dir)
1722 except FileNotFoundError:
1723 return
1724
1725 pack_files = {}
1726 idx_files = set()
1727
1728 for name in pack_dir_contents:
1729 if name.endswith(".pack"):
1730 base_name = name[:-5] # Remove .pack extension
1731 pack_files[base_name] = name
1732 elif name.endswith(".idx"):
1733 base_name = name[:-4] # Remove .idx extension
1734 idx_files.add(base_name)
1735
1736 # Remove .pack files without corresponding .idx files
1737 for base_name, pack_name in pack_files.items():
1738 if base_name not in idx_files:
1739 pack_path = os.path.join(self.pack_dir, pack_name)
1740 # Check if file is old enough (more than grace period)
1741 mtime = os.path.getmtime(pack_path)
1742 if time.time() - mtime > grace_period:
1743 os.remove(pack_path)
1744
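    # Illustrative sketch (not executed; ``store`` is an assumption): pruning
    # leftover temporary pack files older than one hour instead of the default
    # two-week grace period.
    #
    #   store.prune(grace_period=3600)
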
1745
1746class MemoryObjectStore(BaseObjectStore):
1747 """Object store that keeps all objects in memory."""
1748
1749 def __init__(self) -> None:
1750 """Initialize a MemoryObjectStore.
1751
1752 Creates an empty in-memory object store.
1753 """
1754 super().__init__()
1755 self._data: dict[str, ShaFile] = {}
1756 self.pack_compression_level = -1
1757
1758 def _to_hexsha(self, sha):
1759 if len(sha) == 40:
1760 return sha
1761 elif len(sha) == 20:
1762 return sha_to_hex(sha)
1763 else:
1764 raise ValueError(f"Invalid sha {sha!r}")
1765
1766 def contains_loose(self, sha):
1767 """Check if a particular object is present by SHA1 and is loose."""
1768 return self._to_hexsha(sha) in self._data
1769
1770 def contains_packed(self, sha) -> bool:
1771 """Check if a particular object is present by SHA1 and is packed."""
1772 return False
1773
1774 def __iter__(self):
1775 """Iterate over the SHAs that are present in this store."""
1776 return iter(self._data.keys())
1777
1778 @property
1779 def packs(self):
1780 """List with pack objects."""
1781 return []
1782
1783 def get_raw(self, name: ObjectID):
1784 """Obtain the raw text for an object.
1785
1786 Args:
1787 name: sha for the object.
1788 Returns: tuple with numeric type and object contents.
1789 """
1790 obj = self[self._to_hexsha(name)]
1791 return obj.type_num, obj.as_raw_string()
1792
1793 def __getitem__(self, name: ObjectID):
1794 """Retrieve an object by SHA.
1795
1796 Args:
1797 name: SHA of the object (as hex string or bytes)
1798
1799 Returns:
1800 Copy of the ShaFile object
1801
1802 Raises:
1803 KeyError: If the object is not found
1804 """
1805 return self._data[self._to_hexsha(name)].copy()
1806
1807 def __delitem__(self, name: ObjectID) -> None:
1808 """Delete an object from this store, for testing only."""
1809 del self._data[self._to_hexsha(name)]
1810
1811 def add_object(self, obj) -> None:
1812 """Add a single object to this object store."""
1813 self._data[obj.id] = obj.copy()
1814
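    # Illustrative sketch (not executed): MemoryObjectStore is convenient in
    # tests, since nothing is written to disk.
    #
    #   store = MemoryObjectStore()
    #   blob = Blob.from_string(b"test data")
    #   store.add_object(blob)
    #   assert blob.id in store
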
1815 def add_objects(self, objects, progress=None) -> None:
1816 """Add a set of objects to this object store.
1817
1818 Args:
1819 objects: Iterable over a list of (object, path) tuples
1820 progress: Optional progress reporting function.
1821 """
1822 for obj, path in objects:
1823 self.add_object(obj)
1824
1825 def add_pack(self):
1826 """Add a new pack to this object store.
1827
1828 Because this object store doesn't support packs, we extract and add the
1829 individual objects.
1830
        Returns: Fileobject to write to, a commit function to call when
            the pack is finished, and an abort function.
1833 """
1834 from tempfile import SpooledTemporaryFile
1835
1836 f = SpooledTemporaryFile(max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-")
1837
1838 def commit() -> None:
1839 size = f.tell()
1840 if size > 0:
1841 f.seek(0)
1842
1843 p = PackData.from_file(f, size)
1844 for obj in PackInflater.for_pack_data(p, self.get_raw):
1845 self.add_object(obj)
1846 p.close()
1847 f.close()
1848 else:
1849 f.close()
1850
1851 def abort() -> None:
1852 f.close()
1853
1854 return f, commit, abort
1855
1856 def add_pack_data(
1857 self, count: int, unpacked_objects: Iterator[UnpackedObject], progress=None
1858 ) -> None:
1859 """Add pack data to this object store.
1860
1861 Args:
1862 count: Number of items to add
1863 unpacked_objects: Iterator of UnpackedObject instances
1864 progress: Optional progress reporting function.
1865 """
1866 if count == 0:
1867 return
1868
1869 # Since MemoryObjectStore doesn't support pack files, we need to
1870 # extract individual objects. To handle deltas properly, we write
1871 # to a temporary pack and then use PackInflater to resolve them.
1872 f, commit, abort = self.add_pack()
1873 try:
1874 write_pack_data(
1875 f.write,
1876 unpacked_objects,
1877 num_records=count,
1878 progress=progress,
1879 )
1880 except BaseException:
1881 abort()
1882 raise
1883 else:
1884 commit()
1885
1886 def add_thin_pack(self, read_all, read_some, progress=None) -> None:
1887 """Add a new thin pack to this object store.
1888
1889 Thin packs are packs that contain deltas with parents that exist
1890 outside the pack. Because this object store doesn't support packs, we
1891 extract and add the individual objects.
1892
1893 Args:
1894 read_all: Read function that blocks until the number of
1895 requested bytes are read.
1896 read_some: Read function that returns at least one byte, but may
1897 not return the number of bytes requested.
1898 progress: Optional progress reporting function.
1899 """
1900 f, commit, abort = self.add_pack()
1901 try:
1902 copier = PackStreamCopier(read_all, read_some, f)
1903 copier.verify()
1904 except BaseException:
1905 abort()
1906 raise
1907 else:
1908 commit()
1909
1910
1911class ObjectIterator(Protocol):
1912 """Interface for iterating over objects."""
1913
1914 def iterobjects(self) -> Iterator[ShaFile]:
1915 """Iterate over all objects.
1916
1917 Returns:
1918 Iterator of ShaFile objects
1919 """
1920 raise NotImplementedError(self.iterobjects)
1921
1922
1923def tree_lookup_path(lookup_obj, root_sha, path):
1924 """Look up an object in a Git tree.
1925
1926 Args:
1927 lookup_obj: Callback for retrieving object by SHA1
1928 root_sha: SHA1 of the root tree
1929 path: Path to lookup
1930 Returns: A tuple of (mode, SHA) of the resulting path.
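
    Example (illustrative sketch; assumes ``store`` contains a tree with SHA
    ``root_sha`` that has an entry at ``docs/README``):

        mode, sha = tree_lookup_path(store.__getitem__, root_sha, b"docs/README")
        blob = store[sha]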
1931 """
1932 tree = lookup_obj(root_sha)
1933 if not isinstance(tree, Tree):
1934 raise NotTreeError(root_sha)
1935 return tree.lookup_path(lookup_obj, path)
1936
1937
1938def _collect_filetree_revs(
1939 obj_store: ObjectContainer, tree_sha: ObjectID, kset: set[ObjectID]
1940) -> None:
1941 """Collect SHA1s of files and directories for specified tree.
1942
1943 Args:
1944 obj_store: Object store to get objects by SHA from
1945 tree_sha: tree reference to walk
1946 kset: set to fill with references to files and directories
1947 """
1948 filetree = obj_store[tree_sha]
1949 assert isinstance(filetree, Tree)
1950 for name, mode, sha in filetree.iteritems():
1951 if not S_ISGITLINK(mode) and sha not in kset:
1952 kset.add(sha)
1953 if stat.S_ISDIR(mode):
1954 _collect_filetree_revs(obj_store, sha, kset)
1955
1956
1957def _split_commits_and_tags(
1958 obj_store: ObjectContainer, lst, *, ignore_unknown=False
1959) -> tuple[set[bytes], set[bytes], set[bytes]]:
    """Split object id list into three sets with commit, tag, and other SHAs.

    Commits referenced by tags are included in the commits
    set as well. Only SHA1s known in this repository are returned;
    unless the ignore_unknown argument is True, a KeyError is raised
    for any SHA1 missing from the repository.
1966
1967 Args:
1968 obj_store: Object store to get objects by SHA1 from
1969 lst: Collection of commit and tag SHAs
1970 ignore_unknown: True to skip SHA1 missing in the repository
1971 silently.
1972 Returns: A tuple of (commits, tags, others) SHA1s
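
    Example (illustrative sketch; ``c1`` is the SHA of a commit in ``store``
    and ``t1`` the SHA of an annotated tag pointing at it):

        commits, tags, others = _split_commits_and_tags(store, [c1, t1])
        # commits == {c1}, tags == {t1}, others == set()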
1973 """
1974 commits: set[bytes] = set()
1975 tags: set[bytes] = set()
1976 others: set[bytes] = set()
1977 for e in lst:
1978 try:
1979 o = obj_store[e]
1980 except KeyError:
1981 if not ignore_unknown:
1982 raise
1983 else:
1984 if isinstance(o, Commit):
1985 commits.add(e)
1986 elif isinstance(o, Tag):
1987 tags.add(e)
1988 tagged = o.object[1]
1989 c, t, os = _split_commits_and_tags(
1990 obj_store, [tagged], ignore_unknown=ignore_unknown
1991 )
1992 commits |= c
1993 tags |= t
1994 others |= os
1995 else:
1996 others.add(e)
1997 return (commits, tags, others)
1998
1999
2000class MissingObjectFinder:
2001 """Find the objects missing from another object store.
2002
2003 Args:
2004 object_store: Object store containing at least all objects to be
2005 sent
2006 haves: SHA1s of commits not to send (already present in target)
2007 wants: SHA1s of commits to send
2008 progress: Optional function to report progress to.
2009 get_tagged: Function that returns a dict of pointed-to sha -> tag
2010 sha for including tags.
2011 get_parents: Optional function for getting the parents of a commit.
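
    Example (illustrative sketch; assumes ``store`` is an object store and
    ``haves``/``wants`` are collections of commit SHAs known to ``store``):

        finder = MissingObjectFinder(store, haves=haves, wants=wants)
        for sha, pack_hint in finder:
            ...  # each sha identifies an object the other side is missing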
2012 """
2013
2014 def __init__(
2015 self,
2016 object_store,
2017 haves,
2018 wants,
2019 *,
2020 shallow=None,
2021 progress=None,
2022 get_tagged=None,
2023 get_parents=lambda commit: commit.parents,
2024 ) -> None:
2025 """Initialize a MissingObjectFinder.
2026
2027 Args:
2028 object_store: Object store containing objects
2029 haves: SHA1s of objects already present in target
2030 wants: SHA1s of objects to send
2031 shallow: Set of shallow commit SHA1s
2032 progress: Optional progress reporting callback
2033 get_tagged: Function returning dict of pointed-to sha -> tag sha
2034 get_parents: Function for getting commit parents
2035 """
2036 self.object_store = object_store
2037 if shallow is None:
2038 shallow = set()
2039 self._get_parents = get_parents
2040 # process Commits and Tags differently
        # Note: while haves may list commits/tags not available locally
        # (such SHAs are filtered out by _split_commits_and_tags),
        # wants must list only known SHAs; otherwise
        # _split_commits_and_tags raises KeyError.
2045 have_commits, have_tags, have_others = _split_commits_and_tags(
2046 object_store, haves, ignore_unknown=True
2047 )
2048 want_commits, want_tags, want_others = _split_commits_and_tags(
2049 object_store, wants, ignore_unknown=False
2050 )
2051 # all_ancestors is a set of commits that shall not be sent
2052 # (complete repository up to 'haves')
2053 all_ancestors = _collect_ancestors(
2054 object_store, have_commits, shallow=shallow, get_parents=self._get_parents
2055 )[0]
        # missing_commits - complete set of commits between haves and wants
        # common_commits - commits from all_ancestors that we hit while
        # traversing the parent hierarchy of wants
2059 missing_commits, common_commits = _collect_ancestors(
2060 object_store,
2061 want_commits,
2062 all_ancestors,
2063 shallow=shallow,
2064 get_parents=self._get_parents,
2065 )
2066 self.remote_has: set[bytes] = set()
        # Now, fill sha_done with commits and revisions of
        # files and directories known to be present both locally
        # and on the target. These commits and files
        # won't get selected for fetch.
2071 for h in common_commits:
2072 self.remote_has.add(h)
2073 cmt = object_store[h]
2074 _collect_filetree_revs(object_store, cmt.tree, self.remote_has)
2075 # record tags we have as visited, too
2076 for t in have_tags:
2077 self.remote_has.add(t)
2078 self.sha_done = set(self.remote_has)
2079
2080 # in fact, what we 'want' is commits, tags, and others
2081 # we've found missing
2082 self.objects_to_send: set[
2083 tuple[ObjectID, Optional[bytes], Optional[int], bool]
2084 ] = {(w, None, Commit.type_num, False) for w in missing_commits}
2085 missing_tags = want_tags.difference(have_tags)
2086 self.objects_to_send.update(
2087 {(w, None, Tag.type_num, False) for w in missing_tags}
2088 )
2089 missing_others = want_others.difference(have_others)
2090 self.objects_to_send.update({(w, None, None, False) for w in missing_others})
2091
2092 if progress is None:
2093 self.progress = lambda x: None
2094 else:
2095 self.progress = progress
2096 self._tagged = (get_tagged and get_tagged()) or {}
2097
2098 def get_remote_has(self):
2099 """Get the set of SHAs the remote has.
2100
2101 Returns:
2102 Set of SHA1s that the remote side already has
2103 """
2104 return self.remote_has
2105
2106 def add_todo(
2107 self, entries: Iterable[tuple[ObjectID, Optional[bytes], Optional[int], bool]]
2108 ) -> None:
2109 """Add objects to the todo list.
2110
2111 Args:
2112 entries: Iterable of tuples (sha, name, type_num, is_leaf)
2113 """
2114 self.objects_to_send.update([e for e in entries if e[0] not in self.sha_done])
2115
2116 def __next__(self) -> tuple[bytes, Optional[PackHint]]:
2117 """Get the next object to send.
2118
2119 Returns:
2120 Tuple of (sha, pack_hint)
2121
2122 Raises:
2123 StopIteration: When no more objects to send
2124 """
2125 while True:
2126 if not self.objects_to_send:
2127 self.progress(
2128 f"counting objects: {len(self.sha_done)}, done.\n".encode("ascii")
2129 )
2130 raise StopIteration
2131 (sha, name, type_num, leaf) = self.objects_to_send.pop()
2132 if sha not in self.sha_done:
2133 break
2134 if not leaf:
2135 o = self.object_store[sha]
2136 if isinstance(o, Commit):
2137 self.add_todo([(o.tree, b"", Tree.type_num, False)])
2138 elif isinstance(o, Tree):
2139 self.add_todo(
2140 [
2141 (
2142 s,
2143 n,
2144 (Blob.type_num if stat.S_ISREG(m) else Tree.type_num),
2145 not stat.S_ISDIR(m),
2146 )
2147 for n, m, s in o.iteritems()
2148 if not S_ISGITLINK(m)
2149 ]
2150 )
2151 elif isinstance(o, Tag):
2152 self.add_todo([(o.object[1], None, o.object[0].type_num, False)])
2153 if sha in self._tagged:
2154 self.add_todo([(self._tagged[sha], None, None, True)])
2155 self.sha_done.add(sha)
2156 if len(self.sha_done) % 1000 == 0:
2157 self.progress(f"counting objects: {len(self.sha_done)}\r".encode("ascii"))
2158 if type_num is None:
2159 pack_hint = None
2160 else:
2161 pack_hint = (type_num, name)
2162 return (sha, pack_hint)
2163
2164 def __iter__(self):
2165 """Return iterator over objects to send.
2166
2167 Returns:
2168 Self (this class implements the iterator protocol)
2169 """
2170 return self
2171
2172
2173class ObjectStoreGraphWalker:
2174 """Graph walker that finds what commits are missing from an object store."""
2175
2176 heads: set[ObjectID]
2177 """Revisions without descendants in the local repo."""
2178
2179 get_parents: Callable[[ObjectID], list[ObjectID]]
2180 """Function to retrieve parents in the local repo."""
2181
2182 shallow: set[ObjectID]
2183
2184 def __init__(
2185 self,
2186 local_heads: Iterable[ObjectID],
2187 get_parents,
2188 shallow: Optional[set[ObjectID]] = None,
2189 update_shallow=None,
2190 ) -> None:
2191 """Create a new instance.
2192
2193 Args:
2194 local_heads: Heads to start search with
2195 get_parents: Function for finding the parents of a SHA1.
2196 shallow: Set of shallow commits.
2197 update_shallow: Function to update shallow commits.
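
        Example (illustrative sketch; assumes ``repo`` is a dulwich ``Repo``
        with at least one commit):

            walker = ObjectStoreGraphWalker([repo.head()], repo.get_parents)
            sha = next(walker)   # next local commit to offer as a "have"
            walker.ack(sha)      # the remote side confirmed it has this commit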
2198 """
2199 self.heads = set(local_heads)
2200 self.get_parents = get_parents
2201 self.parents: dict[ObjectID, Optional[list[ObjectID]]] = {}
2202 if shallow is None:
2203 shallow = set()
2204 self.shallow = shallow
2205 self.update_shallow = update_shallow
2206
2207 def nak(self) -> None:
2208 """Nothing in common was found."""
2209
2210 def ack(self, sha: ObjectID) -> None:
2211 """Ack that a revision and its ancestors are present in the source."""
2212 if len(sha) != 40:
2213 raise ValueError(f"unexpected sha {sha!r} received")
2214 ancestors = {sha}
2215
2216 # stop if we run out of heads to remove
2217 while self.heads:
2218 for a in ancestors:
2219 if a in self.heads:
2220 self.heads.remove(a)
2221
2222 # collect all ancestors
2223 new_ancestors = set()
2224 for a in ancestors:
2225 ps = self.parents.get(a)
2226 if ps is not None:
2227 new_ancestors.update(ps)
2228 self.parents[a] = None
2229
2230 # no more ancestors; stop
2231 if not new_ancestors:
2232 break
2233
2234 ancestors = new_ancestors
2235
2236 def next(self):
2237 """Iterate over ancestors of heads in the target."""
2238 if self.heads:
2239 ret = self.heads.pop()
2240 try:
2241 ps = self.get_parents(ret)
2242 except KeyError:
2243 return None
2244 self.parents[ret] = ps
2245 self.heads.update([p for p in ps if p not in self.parents])
2246 return ret
2247 return None
2248
2249 __next__ = next
2250
2251
2252def commit_tree_changes(object_store, tree, changes):
2253 """Commit a specified set of changes to a tree structure.
2254
2255 This will apply a set of changes on top of an existing tree, storing new
2256 objects in object_store.
2257
    changes are a list of tuples with (path, mode, object_sha).
    Paths can be both blobs and trees. Setting the mode and
    object sha to None deletes the path.
2261
2262 This method works especially well if there are only a small
2263 number of changes to a big tree. For a large number of changes
2264 to a large tree, use e.g. commit_tree.
2265
2266 Args:
2267 object_store: Object store to store new objects in
2268 and retrieve old ones from.
2269 tree: Original tree root
2270 changes: changes to apply
2271 Returns: New tree root object
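
    Example (illustrative sketch; assumes ``new_blob`` was already added to
    ``store`` and ``old_tree`` contains an ``obsolete.txt`` entry):

        new_tree = commit_tree_changes(
            store,
            old_tree,
            [
                (b"docs/README", 0o100644, new_blob.id),  # add or update a file
                (b"obsolete.txt", None, None),            # delete a path
            ],
        )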
2272 """
2273 # TODO(jelmer): Save up the objects and add them using .add_objects
2274 # rather than with individual calls to .add_object.
2275 nested_changes: dict[bytes, list[tuple[bytes, Optional[int], Optional[bytes]]]] = {}
2276 for path, new_mode, new_sha in changes:
2277 try:
2278 (dirname, subpath) = path.split(b"/", 1)
2279 except ValueError:
2280 if new_sha is None:
2281 del tree[path]
2282 else:
2283 tree[path] = (new_mode, new_sha)
2284 else:
2285 nested_changes.setdefault(dirname, []).append((subpath, new_mode, new_sha))
2286 for name, subchanges in nested_changes.items():
2287 try:
2288 orig_subtree = object_store[tree[name][1]]
2289 except KeyError:
2290 orig_subtree = Tree()
2291 subtree = commit_tree_changes(object_store, orig_subtree, subchanges)
2292 if len(subtree) == 0:
2293 del tree[name]
2294 else:
2295 tree[name] = (stat.S_IFDIR, subtree.id)
2296 object_store.add_object(tree)
2297 return tree
2298
2299
2300class OverlayObjectStore(BaseObjectStore):
2301 """Object store that can overlay multiple object stores."""
2302
2303 def __init__(self, bases, add_store=None) -> None:
2304 """Initialize an OverlayObjectStore.
2305
2306 Args:
2307 bases: List of base object stores to overlay
2308 add_store: Optional store to write new objects to
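
        Example (illustrative sketch; ``local_store`` and ``upstream_store``
        are assumed to be existing object stores):

            overlay = OverlayObjectStore(
                [local_store, upstream_store], add_store=local_store
            )
            obj = overlay[some_sha]        # looked up in each base in turn
            overlay.add_object(new_blob)   # written to local_store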
2309 """
2310 self.bases = bases
2311 self.add_store = add_store
2312
2313 def add_object(self, object):
2314 """Add a single object to the store.
2315
2316 Args:
2317 object: Object to add
2318
2319 Raises:
2320 NotImplementedError: If no add_store was provided
2321 """
2322 if self.add_store is None:
2323 raise NotImplementedError(self.add_object)
2324 return self.add_store.add_object(object)
2325
2326 def add_objects(self, objects, progress=None):
2327 """Add multiple objects to the store.
2328
2329 Args:
2330 objects: Iterator of objects to add
2331 progress: Optional progress reporting callback
2332
2333 Raises:
2334 NotImplementedError: If no add_store was provided
2335 """
2336 if self.add_store is None:
            raise NotImplementedError(self.add_objects)
2338 return self.add_store.add_objects(objects, progress)
2339
2340 @property
2341 def packs(self):
2342 """Get the list of packs from all overlaid stores.
2343
2344 Returns:
2345 Combined list of packs from all base stores
2346 """
2347 ret = []
2348 for b in self.bases:
2349 ret.extend(b.packs)
2350 return ret
2351
2352 def __iter__(self):
2353 """Iterate over all object SHAs in the overlaid stores.
2354
2355 Returns:
2356 Iterator of object SHAs (deduped across stores)
2357 """
2358 done = set()
2359 for b in self.bases:
2360 for o_id in b:
2361 if o_id not in done:
2362 yield o_id
2363 done.add(o_id)
2364
2365 def iterobjects_subset(
2366 self, shas: Iterable[bytes], *, allow_missing: bool = False
2367 ) -> Iterator[ShaFile]:
2368 """Iterate over a subset of objects from the overlaid stores.
2369
2370 Args:
2371 shas: Iterable of object SHAs to retrieve
2372 allow_missing: If True, skip missing objects; if False, raise KeyError
2373
2374 Returns:
2375 Iterator of ShaFile objects
2376
2377 Raises:
2378 KeyError: If an object is missing and allow_missing is False
2379 """
2380 todo = set(shas)
2381 found: set[bytes] = set()
2382
2383 for b in self.bases:
2384 # Create a copy of todo for each base to avoid modifying
2385 # the set while iterating through it
2386 current_todo = todo - found
2387 for o in b.iterobjects_subset(current_todo, allow_missing=True):
2388 yield o
2389 found.add(o.id)
2390
2391 # Check for any remaining objects not found
2392 missing = todo - found
2393 if missing and not allow_missing:
2394 raise KeyError(next(iter(missing)))
2395
2396 def iter_unpacked_subset(
2397 self,
2398 shas: Iterable[bytes],
2399 *,
2400 include_comp=False,
2401 allow_missing: bool = False,
2402 convert_ofs_delta=True,
2403 ) -> Iterator[ShaFile]:
2404 """Iterate over unpacked objects from the overlaid stores.
2405
2406 Args:
2407 shas: Iterable of object SHAs to retrieve
2408 include_comp: Whether to include compressed data
2409 allow_missing: If True, skip missing objects; if False, raise KeyError
2410 convert_ofs_delta: Whether to convert OFS_DELTA objects
2411
2412 Returns:
2413 Iterator of unpacked objects
2414
2415 Raises:
2416 KeyError: If an object is missing and allow_missing is False
2417 """
2418 todo = set(shas)
2419 for b in self.bases:
2420 for o in b.iter_unpacked_subset(
2421 todo,
2422 include_comp=include_comp,
2423 allow_missing=True,
2424 convert_ofs_delta=convert_ofs_delta,
2425 ):
2426 yield o
2427 todo.remove(o.id)
        if todo and not allow_missing:
            raise KeyError(next(iter(todo)))
2430
2431 def get_raw(self, sha_id):
2432 """Get the raw object data from the overlaid stores.
2433
2434 Args:
2435 sha_id: SHA of the object
2436
2437 Returns:
2438 Tuple of (type_num, raw_data)
2439
2440 Raises:
2441 KeyError: If object not found in any base store
2442 """
2443 for b in self.bases:
2444 try:
2445 return b.get_raw(sha_id)
2446 except KeyError:
2447 pass
2448 raise KeyError(sha_id)
2449
2450 def contains_packed(self, sha) -> bool:
2451 """Check if an object is packed in any base store.
2452
2453 Args:
2454 sha: SHA of the object
2455
2456 Returns:
2457 True if object is packed in any base store
2458 """
2459 for b in self.bases:
2460 if b.contains_packed(sha):
2461 return True
2462 return False
2463
2464 def contains_loose(self, sha) -> bool:
2465 """Check if an object is loose in any base store.
2466
2467 Args:
2468 sha: SHA of the object
2469
2470 Returns:
2471 True if object is loose in any base store
2472 """
2473 for b in self.bases:
2474 if b.contains_loose(sha):
2475 return True
2476 return False
2477
2478
2479def read_packs_file(f):
2480 """Yield the packs listed in a packs file."""
2481 for line in f.read().splitlines():
2482 if not line:
2483 continue
2484 (kind, name) = line.split(b" ", 1)
2485 if kind != b"P":
2486 continue
2487 yield os.fsdecode(name)
2488
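# Example for read_packs_file (illustrative sketch):
#
#     from io import BytesIO
#
#     f = BytesIO(b"P pack-1e8f7c5d.pack\nP pack-0a1b2c3d.pack\n")
#     assert list(read_packs_file(f)) == [
#         "pack-1e8f7c5d.pack",
#         "pack-0a1b2c3d.pack",
#     ]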
2489
2490class BucketBasedObjectStore(PackBasedObjectStore):
2491 """Object store implementation that uses a bucket store like S3 as backend."""
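    # Subclassing sketch (illustrative only; a real implementation would talk
    # to an actual bucket service such as S3, which is not shown here):
    #
    #     class InMemoryBucketObjectStore(BucketBasedObjectStore):
    #         def __init__(self):
    #             super().__init__()
    #             self._buckets = {}  # pack basename -> (pack bytes, index bytes)
    #
    #         def _iter_pack_names(self):
    #             return iter(self._buckets)
    #
    #         def _upload_pack(self, basename, pack_file, index_file):
    #             self._buckets[basename] = (pack_file.read(), index_file.read())
    #
    #         def _get_pack(self, name):
    #             ...  # build a Pack from the stored bytes
    #
    #         def _remove_pack_by_name(self, name):
    #             del self._buckets[name]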
2492
2493 def _iter_loose_objects(self):
2494 """Iterate over the SHAs of all loose objects."""
2495 return iter([])
2496
2497 def _get_loose_object(self, sha) -> None:
2498 return None
2499
2500 def delete_loose_object(self, sha) -> None:
2501 """Delete a loose object (no-op for bucket stores).
2502
2503 Bucket-based stores don't have loose objects, so this is a no-op.
2504
2505 Args:
2506 sha: SHA of the object to delete
2507 """
        # Nothing to do: bucket-based stores have no loose objects.
2509
2510 def pack_loose_objects(self) -> int:
2511 """Pack loose objects. Returns number of objects packed.
2512
2513 BucketBasedObjectStore doesn't support loose objects, so this is a no-op.
2514 """
2515 return 0
2516
2517 def _remove_pack_by_name(self, name: str) -> None:
2518 """Remove a pack by name. Subclasses should implement this."""
2519 raise NotImplementedError(self._remove_pack_by_name)
2520
2521 def _iter_pack_names(self) -> Iterator[str]:
2522 raise NotImplementedError(self._iter_pack_names)
2523
2524 def _get_pack(self, name) -> Pack:
2525 raise NotImplementedError(self._get_pack)
2526
2527 def _update_pack_cache(self):
2528 pack_files = set(self._iter_pack_names())
2529
2530 # Open newly appeared pack files
2531 new_packs = []
2532 for f in pack_files:
2533 if f not in self._pack_cache:
2534 pack = self._get_pack(f)
2535 new_packs.append(pack)
2536 self._pack_cache[f] = pack
2537 # Remove disappeared pack files
2538 for f in set(self._pack_cache) - pack_files:
2539 self._pack_cache.pop(f).close()
2540 return new_packs
2541
2542 def _upload_pack(self, basename, pack_file, index_file) -> None:
2543 raise NotImplementedError
2544
2545 def add_pack(self):
2546 """Add a new pack to this object store.
2547
2548 Returns: Fileobject to write to, a commit function to
2549 call when the pack is finished and an abort
2550 function.
2551 """
2552 import tempfile
2553
2554 pf = tempfile.SpooledTemporaryFile(
2555 max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-"
2556 )
2557
2558 def commit():
2559 if pf.tell() == 0:
2560 pf.close()
2561 return None
2562
2563 pf.seek(0)
2564
2565 p = PackData(pf.name, pf)
2566 entries = p.sorted_entries()
2567 basename = iter_sha1(entry[0] for entry in entries).decode("ascii")
2568 idxf = tempfile.SpooledTemporaryFile(
2569 max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-"
2570 )
2571 checksum = p.get_stored_checksum()
2572 write_pack_index(idxf, entries, checksum, version=self.pack_index_version)
2573 idxf.seek(0)
2574 idx = load_pack_index_file(basename + ".idx", idxf)
2575 for pack in self.packs:
2576 if pack.get_stored_checksum() == p.get_stored_checksum():
2577 p.close()
2578 idx.close()
2579 pf.close()
2580 idxf.close()
2581 return pack
2582 pf.seek(0)
2583 idxf.seek(0)
2584 self._upload_pack(basename, pf, idxf)
2585 final_pack = Pack.from_objects(p, idx)
2586 self._add_cached_pack(basename, final_pack)
2587 pf.close()
2588 idxf.close()
2589 return final_pack
2590
2591 return pf, commit, pf.close
2592
2593
2594def _collect_ancestors(
2595 store: ObjectContainer,
2596 heads,
2597 common: frozenset[ObjectID] = frozenset(),
2598 shallow: frozenset[ObjectID] = frozenset(),
2599 get_parents=lambda commit: commit.parents,
2600):
2601 """Collect all ancestors of heads up to (excluding) those in common.
2602
2603 Args:
2604 store: Object store to get commits from
2605 heads: commits to start from
2606 common: commits to end at, or empty set to walk repository
2607 completely
2608 shallow: Set of shallow commits
2609 get_parents: Optional function for getting the parents of a
2610 commit.
    Returns: a tuple (A, B) where A is the set of all commits reachable
        from heads but not present in common, and B is the subset of
        common (shared) commits that are directly reachable from heads.
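
    Example (illustrative sketch; assumes a linear history ``c1 <- c2 <- c3``
    stored in ``store``, where each name is a commit SHA):

        commits, bases = _collect_ancestors(store, [c3], frozenset([c1]))
        # commits == {c3, c2}, bases == {c1}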
2614 """
2615 bases = set()
2616 commits = set()
2617 queue = []
2618 queue.extend(heads)
2619
2620 # Try to use commit graph if available
2621 commit_graph = store.get_commit_graph()
2622
2623 while queue:
2624 e = queue.pop(0)
2625 if e in common:
2626 bases.add(e)
2627 elif e not in commits:
2628 commits.add(e)
2629 if e in shallow:
2630 continue
2631
2632 # Try to use commit graph for parent lookup
2633 parents = None
2634 if commit_graph:
2635 parents = commit_graph.get_parents(e)
2636
2637 if parents is None:
2638 # Fall back to loading the object
2639 cmt = store[e]
2640 parents = get_parents(cmt)
2641
2642 queue.extend(parents)
2643 return (commits, bases)
2644
2645
2646def iter_tree_contents(
2647 store: ObjectContainer, tree_id: Optional[ObjectID], *, include_trees: bool = False
2648):
2649 """Iterate the contents of a tree and all subtrees.
2650
2651 Iteration is depth-first pre-order, as in e.g. os.walk.
2652
2653 Args:
2654 store: Object store to get trees from
2655 tree_id: SHA1 of the tree.
2656 include_trees: If True, include tree objects in the iteration.
2657
2658 Yields: TreeEntry namedtuples for all the objects in a tree.
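
    Example (illustrative sketch; assumes ``store`` contains the tree
    ``tree_id`` and all of its subtrees):

        for entry in iter_tree_contents(store, tree_id):
            print(entry.path, oct(entry.mode), entry.sha)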
2659 """
2660 if tree_id is None:
2661 return
2662 # This could be fairly easily generalized to >2 trees if we find a use
2663 # case.
2664 todo = [TreeEntry(b"", stat.S_IFDIR, tree_id)]
2665 while todo:
2666 entry = todo.pop()
2667 if stat.S_ISDIR(entry.mode):
2668 extra = []
2669 tree = store[entry.sha]
2670 assert isinstance(tree, Tree)
2671 for subentry in tree.iteritems(name_order=True):
2672 extra.append(subentry.in_path(entry.path))
2673 todo.extend(reversed(extra))
2674 if not stat.S_ISDIR(entry.mode) or include_trees:
2675 yield entry
2676
2677
2678def iter_commit_contents(
2679 store: ObjectContainer,
2680 commit: Union[Commit, bytes],
2681 *,
2682 include: Optional[Sequence[Union[str, bytes, Path]]] = None,
2683):
2684 """Iterate the contents of the repository at the specified commit.
2685
2686 This is a wrapper around iter_tree_contents() and
2687 tree_lookup_path() to simplify the common task of getting the
    contents of a repo at a particular commit. See also
2689 dulwich.index.build_file_from_blob() for writing individual files
2690 to disk.
2691
2692 Args:
2693 store: Object store to get trees from
2694 commit: Commit object, or SHA1 of a commit
2695 include: if provided, only the entries whose paths are in the
2696 list, or whose parent tree is in the list, will be
2697 included. Note that duplicate or overlapping paths
2698 (e.g. ["foo", "foo/bar"]) may result in duplicate entries
2699
2700 Yields: TreeEntry namedtuples for all matching files in a commit.
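
    Example (illustrative sketch; assumes ``repo`` is a dulwich ``Repo`` whose
    head commit contains a ``docs/`` directory):

        store = repo.object_store
        head = repo[repo.head()]
        for entry in iter_commit_contents(store, head, include=[b"docs"]):
            print(entry.path, entry.sha)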
2701 """
2702 sha = commit.id if isinstance(commit, Commit) else commit
2703 if not isinstance(obj := store[sha], Commit):
2704 raise TypeError(
2705 f"{sha.decode('ascii')} should be ID of a Commit, but is {type(obj)}"
2706 )
2707 commit = obj
2708 encoding = commit.encoding or "utf-8"
2709 include = (
2710 [
2711 path if isinstance(path, bytes) else str(path).encode(encoding)
2712 for path in include
2713 ]
2714 if include is not None
2715 else [b""]
2716 )
2717
2718 for path in include:
2719 mode, obj_id = tree_lookup_path(store.__getitem__, commit.tree, path)
2720 # Iterate all contained files if path points to a dir, otherwise just get that
2721 # single file
2722 if isinstance(store[obj_id], Tree):
2723 for entry in iter_tree_contents(store, obj_id):
2724 yield entry.in_path(path)
2725 else:
2726 yield TreeEntry(path, mode, obj_id)
2727
2728
2729def peel_sha(store: ObjectContainer, sha: bytes) -> tuple[ShaFile, ShaFile]:
2730 """Peel all tags from a SHA.
2731
2732 Args:
2733 store: Object store to get objects from
2734 sha: The object SHA to peel.
    Returns: A tuple of (unpeeled, peeled) ShaFile objects, where peeled is
        the object reached after following all intermediate tags; if the
        given SHA does not point to a tag, both elements are the same object.
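
    Example (illustrative sketch; assumes ``tag_sha`` is the SHA of an
    annotated tag in ``store`` that points at a commit):

        unpeeled, peeled = peel_sha(store, tag_sha)
        # unpeeled is the Tag object; peeled is the Commit it points to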
2738 """
2739 unpeeled = obj = store[sha]
2740 obj_class = object_class(obj.type_name)
2741 while obj_class is Tag:
2742 assert isinstance(obj, Tag)
2743 obj_class, sha = obj.object
2744 obj = store[sha]
2745 return unpeeled, obj