Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# gc.py -- Git garbage collection implementation
2# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk>
3#
4# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6# General Public License as published by the Free Software Foundation; version 2.0
7# or (at your option) any later version. You can redistribute it and/or
8# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
22"""Git garbage collection implementation."""
# Public API of this module, kept sorted alphabetically.
__all__ = [
    "DEFAULT_GC_AUTO",
    "DEFAULT_GC_AUTO_PACK_LIMIT",
    "DEFAULT_GC_PRUNE_EXPIRE",
    "GCStats",
    "find_reachable_objects",
    "find_unreachable_objects",
    "garbage_collect",
    "get_prune_grace_period",
    "maybe_auto_gc",
    "prune_unreachable_objects",
    "should_run_gc",
]
38import logging
39import os
40import time
41from collections import deque
42from collections.abc import Callable
43from dataclasses import dataclass, field
44from typing import TYPE_CHECKING
46from dulwich.object_store import (
47 BaseObjectStore,
48 DiskObjectStore,
49)
50from dulwich.objects import Commit, ObjectID, Tag, Tree
51from dulwich.refs import RefsContainer
53if TYPE_CHECKING:
54 from .config import Config
55 from .repo import BaseRepo, Repo
# Default for gc.auto: auto GC triggers once this many loose objects exist.
DEFAULT_GC_AUTO = 6700
# Default for gc.autoPackLimit: auto GC triggers once this many packs exist.
DEFAULT_GC_AUTO_PACK_LIMIT = 50
# Default for gc.pruneExpire: grace period before unreachable objects are pruned.
DEFAULT_GC_PRUNE_EXPIRE = 1209600  # 2 weeks in seconds
def get_prune_grace_period(config: "Config") -> int:
    """Read gc.pruneExpire from config and return grace period in seconds.

    If gc.pruneExpire is not set, returns the default of 2 weeks.

    Args:
      config: Repository configuration

    Returns:
      Grace period in seconds

    Raises:
      ValueError: If the configured value cannot be parsed
    """
    from .approxidate import parse_approxidate

    try:
        raw = config.get(b"gc", b"pruneExpire")
    except KeyError:
        # Setting absent: fall back to the 2-week default.
        return DEFAULT_GC_PRUNE_EXPIRE

    text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
    text = text.strip()

    # "now" means no grace period: everything unreachable may be pruned.
    if text == "now":
        return 0

    # Interpret the setting as an approxidate and convert the resulting
    # absolute timestamp into an age relative to the current time.
    cutoff = parse_approxidate(text)
    return max(0, int(time.time() - cutoff))
@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects removed (or, in a dry run, that would be removed).
    pruned_objects: set[ObjectID] = field(default_factory=set)
    # Total size in bytes of the pruned objects.
    bytes_freed: int = 0
    # Number of pack files before collection.
    packs_before: int = 0
    # Number of pack files after collection.
    packs_after: int = 0
    # Number of loose objects before collection.
    loose_objects_before: int = 0
    # Number of loose objects after collection.
    loose_objects_after: int = 0
def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    seen: set[ObjectID] = set()
    queue: deque[ObjectID] = deque()

    def enqueue(sha: ObjectID) -> None:
        # Schedule a SHA for traversal unless it was already visited.
        if sha not in seen:
            seen.add(sha)
            queue.append(sha)

    # Seed the traversal with every ref; the container's __getitem__
    # resolves symbolic refs.
    for ref in refs_container.allkeys():
        try:
            target = refs_container[ref]
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue
        if target:
            enqueue(target)

    # TODO: Add reflog support when reflog functionality is available

    # Breadth-first walk over the object graph.
    while queue:
        sha = queue.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Referenced object is missing from the store; nothing to expand.
            continue

        if isinstance(obj, Commit):
            # Follow the commit's tree and all of its parents.
            enqueue(obj.tree)
            for parent in obj.parents:
                enqueue(parent)
        elif isinstance(obj, Tree):
            # Follow every entry (blobs and subtrees).
            for entry in obj.items():
                assert entry.sha is not None
                enqueue(entry.sha)
        elif isinstance(obj, Tag):
            # Follow the tagged object.
            enqueue(obj.object[1])

    return seen
def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )
    # Everything in the store that the reachability walk never touched.
    return {sha for sha in object_store if sha not in reachable}
def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[ObjectID], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    removed: set[ObjectID] = set()
    freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            # Honor the grace period: recently written objects survive.
            if grace_period is not None:
                try:
                    age = time.time() - object_store.get_object_mtime(sha)
                except KeyError:
                    # No mtime available for this object; leave it alone.
                    continue
                if age < grace_period:
                    if progress:
                        progress(
                            f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                        )
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Measure the size before deletion so it can still be reported.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reaching this point means deletion succeeded (or was a dry run).
            removed.add(sha)
            freed += size
        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return removed, freed
def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = DEFAULT_GC_PRUNE_EXPIRE,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc (accepted for API
        compatibility; currently not consulted by the implementation)
      aggressive: Whether to use aggressive settings (currently not
        consulted by the implementation)
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds; defaults to
        ``DEFAULT_GC_PRUNE_EXPIRE`` (2 weeks)
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune: set[ObjectID] = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                # Size the object *before* recording it as prunable, so an
                # object that disappears mid-scan is neither reported in
                # pruned_objects nor counted in bytes_freed.
                obj = object_store[sha]
                size = len(obj.as_raw_string())
                unreachable_to_prune.add(sha)
                stats.bytes_freed += size
            except KeyError:
                # Object vanished between discovery and sizing.
                pass

    stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    # Best-effort: the exclusion from the repack below still
                    # prevents the object from being retained.
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats
def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # The GIT_AUTO_GC environment variable is an explicit kill switch.
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Repositories may also opt out programmatically.
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # gc.auto == 0 disables auto GC entirely.
    try:
        threshold = int(config.get(b"gc", b"auto"))
    except KeyError:
        threshold = DEFAULT_GC_AUTO
    if threshold == 0:
        return False

    store = repo.object_store
    if not isinstance(store, DiskObjectStore):
        # Loose objects cannot be counted on non-disk stores.
        return False

    # Trigger on too many loose objects.
    if store.count_loose_objects() >= threshold:
        return True

    # Trigger on too many pack files.
    try:
        pack_limit = int(config.get(b"gc", b"autoPackLimit"))
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT
    if pack_limit > 0 and store.count_pack_files() >= pack_limit:
        return True

    return False
def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Repositories without a control directory have no gc.log handling;
    # just run the collection directly.
    if not hasattr(repo, "controldir"):
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # A leftover gc.log from a failed run suppresses auto GC until it
        # expires (gc.logExpiry; default one day).
        if config is None:
            config = repo.get_config()
        expiry_seconds = 86400
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # No setting: keep the 1-day default.
            pass
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                expiry_seconds = int(log_expiry.split(b".")[0]) * 86400

        if time.time() - os.stat(gc_log_path).st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Record the failure so subsequent auto GCs back off.
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False