Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%
1"""Git garbage collection implementation."""
3import collections
4import logging
5import os
6import time
7from dataclasses import dataclass, field
8from typing import TYPE_CHECKING, Callable, Optional
10from dulwich.object_store import (
11 BaseObjectStore,
12 DiskObjectStore,
13)
14from dulwich.objects import Commit, ObjectID, Tag, Tree
15from dulwich.refs import RefsContainer
17if TYPE_CHECKING:
18 from .config import Config
19 from .repo import BaseRepo, Repo
22DEFAULT_GC_AUTO = 6700
23DEFAULT_GC_AUTO_PACK_LIMIT = 50
26@dataclass
27class GCStats:
28 """Statistics from garbage collection."""
30 pruned_objects: set[bytes] = field(default_factory=set)
31 bytes_freed: int = 0
32 packs_before: int = 0
33 packs_after: int = 0
34 loose_objects_before: int = 0
35 loose_objects_after: int = 0
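
# Example (illustrative only, not from the original module): GCStats is a
# plain dataclass, so a report can be formatted directly from its fields.
#
#     stats = GCStats(bytes_freed=4096, packs_before=3, packs_after=1)
#     print(f"freed {stats.bytes_freed} bytes; "
#           f"packs {stats.packs_before} -> {stats.packs_after}")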


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                assert entry.sha is not None
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
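
# Example (a hedged usage sketch, not part of the original module; the
# repository path is illustrative): collect everything reachable from a
# local repository's refs.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/tmp/example-repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs)
#     print(f"{len(reachable)} reachable objects")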


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
        object_store: Object store to search
        refs_container: Reference container
        include_reflogs: Whether to include reflog entries
        progress: Optional progress callback

    Returns:
        Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable
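
# Equivalence note (restates the loop above; no new API assumed): iterating an
# object store yields every stored SHA, so the result is simply the set
# complement of the reachable set within the store:
#
#     unreachable = set(object_store) - find_reachable_objects(object_store, refs)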


def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
        object_store: Object store to prune
        refs_container: Reference container
        grace_period: Grace period in seconds (objects newer than this are kept)
        dry_run: If True, only report what would be deleted
        progress: Optional progress callback

    Returns:
        Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
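
# Example (hedged sketch; the path and two-week grace period are illustrative):
# a dry run that reports what would be deleted without modifying the store.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/tmp/example-repo")
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=1209600, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects ({freed} bytes)")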


def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
        repo: Repository to garbage collect
        auto: Whether this is an automatic gc
        aggressive: Whether to use aggressive settings
        prune: Whether to prune unreachable objects
        grace_period: Grace period for pruning in seconds
        dry_run: If True, only report what would be done
        progress: Optional progress callback

    Returns:
        GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
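
# Example (hedged sketch; path is illustrative): a full foreground GC pass with
# the default two-week grace period, followed by a summary of the statistics.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/tmp/example-repo")
#     stats = garbage_collect(repo, prune=True)
#     print(f"packs {stats.packs_before} -> {stats.packs_after}, "
#           f"loose {stats.loose_objects_before} -> {stats.loose_objects_after}, "
#           f"freed {stats.bytes_freed} bytes")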


def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
        repo: Repository to check
        config: Configuration to use (defaults to repo config)

    Returns:
        True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
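
# Decision summary (restates the checks above; no new API assumed): GC triggers
# once loose objects reach gc.auto (default 6700) or pack files reach
# gc.autoPackLimit (default 50), unless disabled via GIT_AUTO_GC=0, gc.auto=0,
# or a repository's _autogc_disabled attribute.
#
#     import os
#
#     os.environ["GIT_AUTO_GC"] = "0"  # hard-disables auto GC for this process
#     assert should_run_gc(repo) is False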


def maybe_auto_gc(
    repo: "Repo",
    config: Optional["Config"] = None,
    progress: Optional[Callable[[str], None]] = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
        repo: Repository to potentially GC
        config: Configuration to use (defaults to repo config)
        progress: Optional progress reporting callback

    Returns:
        True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
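
# Example (hedged sketch; path is illustrative): the opportunistic call a
# porcelain command might make after writing objects. Progress lines go to
# stdout here via the built-in print.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/tmp/example-repo")
#     ran = maybe_auto_gc(repo, progress=print)
#     print("auto GC ran" if ran else "auto GC skipped")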