1"""Git garbage collection implementation."""
3import collections
4import os
5import time
6from dataclasses import dataclass, field
7from typing import TYPE_CHECKING, Optional
9from dulwich.object_store import (
10 BaseObjectStore,
11 DiskObjectStore,
12 PackBasedObjectStore,
13)
14from dulwich.objects import Commit, ObjectID, Tag, Tree
15from dulwich.refs import RefsContainer
17if TYPE_CHECKING:
18 from .config import Config
19 from .repo import BaseRepo
22DEFAULT_GC_AUTO = 6700
23DEFAULT_GC_AUTO_PACK_LIMIT = 50
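
# Both values mirror C git's documented defaults: gc.auto triggers an
# automatic gc once roughly 6700 loose objects accumulate, and
# gc.autoPackLimit once 50 pack files exist.
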
@dataclass
class GCStats:
    """Statistics from garbage collection."""

    pruned_objects: set[bytes] = field(default_factory=set)
    bytes_freed: int = 0
    packs_before: int = 0
    packs_after: int = 0
    loose_objects_before: int = 0
    loose_objects_after: int = 0


def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable
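
# Minimal usage sketch for the traversal above (illustrative only; assumes an
# on-disk repository at "."):
#
#   from dulwich.repo import Repo
#
#   repo = Repo(".")
#   reachable = find_reachable_objects(repo.object_store, repo.refs)
#   print(f"{len(reachable)} reachable objects")
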
def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable
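
# The loop above is an explicit set difference; an equivalent phrasing
# (sketch, relying on object stores being iterable over SHAs as this module
# already assumes) would be:
#
#   unreachable = set(object_store) - reachable
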
def prune_unreachable_objects(
    object_store: PackBasedObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress=None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed
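
# Dry-run usage sketch (illustrative; assumes an on-disk repository whose
# object store supports loose-object deletion, and a one-day grace period):
#
#   from dulwich.repo import Repo
#
#   repo = Repo(".")
#   pruned, freed = prune_unreachable_objects(
#       repo.object_store, repo.refs, grace_period=86400, dry_run=True
#   )
#   print(f"Would prune {len(pruned)} objects, freeing {freed} bytes")
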
def garbage_collect(
    repo,
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress=None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
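
# Dry-run usage sketch (illustrative; relies only on the defaults shown
# above, including the two-week grace period):
#
#   from dulwich.repo import Repo
#
#   repo = Repo(".")
#   stats = garbage_collect(repo, dry_run=True)
#   print(f"packs: {stats.packs_before}, loose: {stats.loose_objects_before}, "
#         f"would free {stats.bytes_freed} bytes")
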
def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False
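
# Usage sketch: gate an explicit gc on the thresholds checked above
# (illustrative; this is essentially what maybe_auto_gc() does below, minus
# the gc.log handling):
#
#   from dulwich.repo import Repo
#
#   repo = Repo(".")
#   if should_run_gc(repo):
#       garbage_collect(repo, auto=True)
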
def maybe_auto_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for a gc.log file - only meaningful for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse the time value (simplified: only day-based values are
            # supported for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and has not expired - skip GC and surface its contents
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write the error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
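
# Typical call-site sketch: invoke after a write-heavy operation so gc runs
# only when the thresholds in should_run_gc() are crossed (illustrative):
#
#   from dulwich.repo import Repo
#
#   repo = Repo(".")
#   ran = maybe_auto_gc(repo)
#   print("auto gc ran" if ran else "auto gc skipped")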