1"""Git garbage collection implementation."""
3import logging
4import os
5import time
6from collections import deque
7from collections.abc import Callable
8from dataclasses import dataclass, field
9from typing import TYPE_CHECKING
11from dulwich.object_store import (
12 BaseObjectStore,
13 DiskObjectStore,
14)
15from dulwich.objects import Commit, ObjectID, Tag, Tree
16from dulwich.refs import RefsContainer
18if TYPE_CHECKING:
19 from .config import Config
20 from .repo import BaseRepo, Repo
23DEFAULT_GC_AUTO = 6700
24DEFAULT_GC_AUTO_PACK_LIMIT = 50
27@dataclass
28class GCStats:
29 """Statistics from garbage collection."""
31 pruned_objects: set[bytes] = field(default_factory=set)
32 bytes_freed: int = 0
33 packs_before: int = 0
34 packs_after: int = 0
35 loose_objects_before: int = 0
36 loose_objects_after: int = 0
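

# Illustrative sketch (not part of the original module): how GCStats is
# typically consumed after a run. Assumes `repo` is an already-opened
# dulwich Repo.
#
#     stats = garbage_collect(repo, dry_run=True)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}")
#     print(f"would free {stats.bytes_freed} bytes "
#           f"across {len(stats.pruned_objects)} objects")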


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable = set()
    pending: deque[ObjectID] = deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                assert entry.sha is not None
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
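

# Illustrative usage sketch: walk every ref of an on-disk repository and
# report the size of the reachable set. The path is hypothetical.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")  # hypothetical path
#     reachable = find_reachable_objects(
#         repo.object_store, repo.refs, progress=print
#     )
#     print(f"{len(reachable)} objects reachable from refs")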


def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable
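

# Illustrative sketch: the unreachable set is the complement of the
# reachable set within the store, so a repository where every object is
# referenced from a ref should yield an empty set.
#
#     unreachable = find_unreachable_objects(
#         repo.object_store, repo.refs, progress=print
#     )
#     print(f"{len(unreachable)} dangling objects")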


def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
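

# Illustrative sketch: a dry run first, then a real prune with a one-hour
# grace period. Assumes `repo` is an on-disk dulwich Repo; the store must be
# a DiskObjectStore for the mtime check and loose-object deletion to work.
#
#     would_prune, would_free = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=3600, dry_run=True
#     )
#     print(f"would delete {len(would_prune)} objects ({would_free} bytes)")
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=3600
#     )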


def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
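

# Illustrative sketch of a full collection, roughly analogous to `git gc`
# with its default two-week prune window (grace_period already defaults to
# 1209600 seconds).
#
#     stats = garbage_collect(repo, prune=True, progress=print)
#     print(
#         f"loose objects: {stats.loose_objects_before} -> "
#         f"{stats.loose_objects_after}, freed {stats.bytes_freed} bytes"
#     )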


def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
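

# Illustrative sketch: the loose-object threshold is the same `gc.auto` key
# that should_run_gc reads; setting it to 0 disables auto GC. The lowered
# value here is an arbitrary example.
#
#     config = repo.get_config()
#     config.set((b"gc",), b"auto", b"250")  # lower threshold for testing
#     config.write_to_path()
#     if should_run_gc(repo, config):
#         garbage_collect(repo, auto=True)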


def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
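

# Illustrative sketch: call sites would typically invoke maybe_auto_gc after
# an operation that creates many loose objects, mirroring git's auto-gc hook
# points. Failures are swallowed and recorded in gc.log, so the call is safe
# to make opportunistically.
#
#     ran = maybe_auto_gc(repo, progress=print)
#     print("auto gc ran" if ran else "auto gc skipped")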