Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%
1"""Git garbage collection implementation."""
3import collections
4import os
5import time
6from dataclasses import dataclass, field
7from typing import TYPE_CHECKING, Callable, Optional
9from dulwich.object_store import (
10 BaseObjectStore,
11 DiskObjectStore,
12)
13from dulwich.objects import Commit, ObjectID, Tag, Tree
14from dulwich.refs import RefsContainer
16if TYPE_CHECKING:
17 from .config import Config
18 from .repo import BaseRepo, Repo
21DEFAULT_GC_AUTO = 6700
22DEFAULT_GC_AUTO_PACK_LIMIT = 50
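# Note: these defaults are intended to mirror C git's documented behaviour,
# where gc.auto defaults to 6700 loose objects and gc.autoPackLimit to 50
# pack files (see git-config(1)).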

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    pruned_objects: set[bytes] = field(default_factory=set)
    bytes_freed: int = 0
    packs_before: int = 0
    packs_after: int = 0
    loose_objects_before: int = 0
    loose_objects_after: int = 0

def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
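
# A minimal usage sketch (the path is hypothetical; assumes an existing
# on-disk repository):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs)
#     print(f"{len(reachable)} objects reachable from refs")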

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable
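
# Since this is the complement of find_reachable_objects() over everything
# in the store, the two result sets are disjoint. Illustration (repo as in
# the sketch above):
#
#     unreachable = find_unreachable_objects(repo.object_store, repo.refs)
#     reachable = find_reachable_objects(repo.object_store, repo.refs)
#     assert not (unreachable & reachable)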

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
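
# A dry run reports what would be deleted without modifying the store.
# Sketch (repo is a hypothetical dulwich.repo.Repo; one-day grace period):
#
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=86400, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects, freeing {freed} bytes")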

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
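
# Typical invocation (sketch; the path is hypothetical). Note the stats are
# gathered before deletion, so with dry_run=True the returned GCStats still
# reports what would have been pruned and freed:
#
#     from dulwich.repo import Repo
#
#     stats = garbage_collect(Repo("/path/to/repo"), prune=True)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}, "
#           f"freed {stats.bytes_freed} bytes")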

def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
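
# Auto GC can be disabled per repository by setting gc.auto to 0. Sketch
# using dulwich's config API (repo is a hypothetical on-disk Repo):
#
#     config = repo.get_config()
#     config.set((b"gc",), b"auto", b"0")
#     config.write_to_path()
#     assert not should_run_gc(repo)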

def maybe_auto_gc(repo: "Repo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
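
# Sketch of the intended call pattern: operations that create many loose
# objects can invoke this opportunistically afterwards (repo is a
# hypothetical dulwich.repo.Repo):
#
#     ...  # e.g. a large import that wrote many loose objects
#     if maybe_auto_gc(repo):
#         print("automatic gc ran")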