Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 24%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# gc.py -- Git garbage collection implementation
2# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
3#
4# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6# General Public License as published by the Free Software Foundation; version 2.0
7# or (at your option) any later version. You can redistribute it and/or
8# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
22"""Git garbage collection implementation."""
# Names exported as this module's public API (kept in sorted order).
__all__ = [
    "DEFAULT_GC_AUTO",
    "DEFAULT_GC_AUTO_PACK_LIMIT",
    "GCStats",
    "find_reachable_objects",
    "find_unreachable_objects",
    "garbage_collect",
    "maybe_auto_gc",
    "prune_unreachable_objects",
    "should_run_gc",
]
36import logging
37import os
38import time
39from collections import deque
40from collections.abc import Callable
41from dataclasses import dataclass, field
42from typing import TYPE_CHECKING
44from dulwich.object_store import (
45 BaseObjectStore,
46 DiskObjectStore,
47)
48from dulwich.objects import Commit, ObjectID, Tag, Tree
49from dulwich.refs import RefsContainer
51if TYPE_CHECKING:
52 from .config import Config
53 from .repo import BaseRepo, Repo
# Fallback for the gc.auto config value: loose-object count at which
# should_run_gc() decides an automatic gc is warranted.
DEFAULT_GC_AUTO = 6700
# Fallback for the gc.autoPackLimit config value: pack-file count at which
# should_run_gc() decides an automatic gc is warranted.
DEFAULT_GC_AUTO_PACK_LIMIT = 50
@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of the unreachable objects removed (or, on a dry run, that
    # would have been removed) by garbage_collect().
    pruned_objects: set[ObjectID] = field(default_factory=set)
    # Sum of the raw sizes (in bytes) of the pruned objects.
    bytes_freed: int = 0
    # Pack-file count before / after the collection ran.
    packs_before: int = 0
    packs_after: int = 0
    # Loose-object count before / after the collection ran.
    loose_objects_before: int = 0
    loose_objects_after: int = 0
def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all reachable objects in the repository.

    Performs a breadth-first walk of the object graph starting from
    every ref target, following commit trees/parents, tree entries and
    tag targets.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    seen: set[ObjectID] = set()
    queue: deque[ObjectID] = deque()

    def enqueue(sha: ObjectID) -> None:
        # Mark-and-queue helper so every object is visited at most once.
        if sha not in seen:
            seen.add(sha)
            queue.append(sha)

    # Seed the walk with every ref target; indexing the refs container
    # resolves symbolic refs for us.
    for ref in refs_container.allkeys():
        try:
            target = refs_container[ref]
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue
        if target:
            enqueue(target)

    # TODO: Add reflog support when reflog functionality is available

    # Breadth-first traversal of the object graph.
    while queue:
        sha = queue.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Referenced object is missing from the store; skip it.
            continue

        # Queue everything this object references.
        if isinstance(obj, Commit):
            enqueue(obj.tree)
            for parent in obj.parents:
                enqueue(parent)
        elif isinstance(obj, Tree):
            for entry in obj.items():
                assert entry.sha is not None
                enqueue(entry.sha)
        elif isinstance(obj, Tag):
            # obj.object is a (type, sha) pair; follow the tagged sha.
            enqueue(obj.object[1])

    return seen
def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )
    # Everything in the store that the reachability walk never touched.
    return {sha for sha in object_store if sha not in reachable}
def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[ObjectID], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    removed: set[ObjectID] = set()
    freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            # Recently-written objects are spared when a grace period is set.
            if grace_period is not None:
                try:
                    age = time.time() - object_store.get_object_mtime(sha)
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Size must be read before the object disappears from disk.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reached only on successful deletion (or a dry run).
            removed.add(sha)
            freed += size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return removed, freed
def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Orchestrates the full sequence: find unreachable objects, pack refs,
    delete loose unreachable objects, repack (excluding the objects being
    pruned), and prune orphaned temporary files, recording before/after
    statistics along the way.

    NOTE(review): ``auto`` and ``aggressive`` are accepted but not
    consulted anywhere in this body — presumably reserved for future use.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune: set[ObjectID] = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                # Tally sizes up front so a dry run still reports
                # bytes_freed and pruned_objects.
                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

    stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects (best-effort: filesystem errors
    # are ignored; the repack below excludes these objects anyway).
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Environment override takes precedence over everything else.
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Programmatic opt-out flag set on the repository object.
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # gc.auto == 0 disables automatic collection entirely.
    try:
        threshold = int(config.get(b"gc", b"auto"))
    except KeyError:
        threshold = DEFAULT_GC_AUTO

    if threshold == 0:
        return False

    store = repo.object_store
    if not isinstance(store, DiskObjectStore):
        # Can't count loose objects on non-disk stores.
        return False

    # Too many loose objects?
    if store.count_loose_objects() >= threshold:
        return True

    # Too many pack files?
    try:
        pack_limit = int(config.get(b"gc", b"autoPackLimit"))
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    return pack_limit > 0 and store.count_pack_files() >= pack_limit
def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    A gc.log file left behind by a previously failed auto-gc suppresses
    further attempts until it expires (per gc.logExpiry); on success the
    file is removed, and on OSError the failure is written back to it.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now).
            # Any value not of the form "<N>.day"/"<N>.days" falls back
            # to the 1-day default.
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC, but surface the
            # previous failure message via the logging module.
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground
    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                # Another process removed it first; nothing left to do.
                pass

        return True
    except OSError as e:
        # Write error to gc.log so the next auto-gc attempt is suppressed
        # until the log expires.
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False