1"""Git garbage collection implementation."""
3__all__ = [
4 "DEFAULT_GC_AUTO",
5 "DEFAULT_GC_AUTO_PACK_LIMIT",
6 "GCStats",
7 "find_reachable_objects",
8 "find_unreachable_objects",
9 "garbage_collect",
10 "maybe_auto_gc",
11 "prune_unreachable_objects",
12 "should_run_gc",
13]
15import logging
16import os
17import time
18from collections import deque
19from collections.abc import Callable
20from dataclasses import dataclass, field
21from typing import TYPE_CHECKING
23from dulwich.object_store import (
24 BaseObjectStore,
25 DiskObjectStore,
26)
27from dulwich.objects import Commit, ObjectID, Tag, Tree
28from dulwich.refs import RefsContainer
30if TYPE_CHECKING:
31 from .config import Config
32 from .repo import BaseRepo, Repo
35DEFAULT_GC_AUTO = 6700
36DEFAULT_GC_AUTO_PACK_LIMIT = 50
@dataclass
class GCStats:
    """Statistics from garbage collection."""

    pruned_objects: set[ObjectID] = field(default_factory=set)
    bytes_freed: int = 0
    packs_before: int = 0
    packs_after: int = 0
    loose_objects_before: int = 0
    loose_objects_after: int = 0


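# A fresh GCStats starts from zero; garbage_collect() below fills it in.
# Minimal illustration:
#
#     stats = GCStats()
#     assert stats.bytes_freed == 0 and not stats.pruned_objects

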
def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable: set[ObjectID] = set()
    pending: deque[ObjectID] = deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                assert entry.sha is not None
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable


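# Usage sketch for find_reachable_objects: count everything reachable from
# the refs of an on-disk repository. The path is a placeholder, not part of
# this module.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs)
#     print(f"{len(reachable)} objects reachable from refs")

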
def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable: set[ObjectID] = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable


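# Usage sketch: list unreachable objects without deleting anything (repo is
# an open dulwich Repo, as in the sketch above):
#
#     for sha in sorted(find_unreachable_objects(repo.object_store, repo.refs)):
#         print("unreachable:", sha.decode("ascii"))

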
def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[ObjectID], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned: set[ObjectID] = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return pruned, bytes_freed


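# Usage sketch: dry-run first to see what would go, then prune for real with
# a one-hour grace period. Requires an on-disk store (DiskObjectStore); the
# numbers are illustrative only.
#
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, grace_period=3600, dry_run=True
#     )
#     print(f"would prune {len(pruned)} objects ({freed} bytes)")

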
def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats


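# Usage sketch: a full collection with the default two-week grace period,
# reporting the resulting statistics (repo path is hypothetical):
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/path/to/repo")
#     stats = garbage_collect(repo, prune=True, progress=print)
#     print(
#         f"pruned {len(stats.pruned_objects)} objects, "
#         f"freed {stats.bytes_freed} bytes, "
#         f"packs {stats.packs_before} -> {stats.packs_after}"
#     )

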
def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False


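# Usage sketch: the thresholds mirror git's gc.auto and gc.autoPackLimit
# settings, so lowering gc.auto in the repository config makes auto-GC
# trigger sooner (the value 100 is illustrative):
#
#     config = repo.get_config()
#     config.set((b"gc",), b"auto", b"100")
#     config.write_to_path()
#     if should_run_gc(repo):
#         garbage_collect(repo, auto=True)

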
def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False


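# Usage sketch: call maybe_auto_gc after a batch of writes, the way git runs
# "gc --auto" after commands that may create many loose objects:
#
#     repo = Repo("/path/to/repo")
#     # ... create commits / import objects ...
#     if maybe_auto_gc(repo):
#         print("auto GC ran")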