Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

236 statements  

1# gc.py -- Git garbage collection implementation 

2# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk> 

3# 

4# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 

5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

6# General Public License as published by the Free Software Foundation; version 2.0 

7# or (at your option) any later version. You can redistribute it and/or 

8# modify it under the terms of either of these two licenses. 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15# 

16# You should have received a copy of the licenses; if not, see 

17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

19# License, Version 2.0. 

20# 

21 

22"""Git garbage collection implementation.""" 

23 

# Names exported for ``from dulwich.gc import *``; also serves as the
# module's declared public API.
__all__ = [
    "DEFAULT_GC_AUTO",
    "DEFAULT_GC_AUTO_PACK_LIMIT",
    "DEFAULT_GC_PRUNE_EXPIRE",
    "GCStats",
    "find_reachable_objects",
    "find_unreachable_objects",
    "garbage_collect",
    "get_prune_grace_period",
    "maybe_auto_gc",
    "prune_unreachable_objects",
    "should_run_gc",
]

37 

38import logging 

39import os 

40import time 

41from collections import deque 

42from collections.abc import Callable 

43from dataclasses import dataclass, field 

44from typing import TYPE_CHECKING 

45 

46from dulwich.object_store import ( 

47 BaseObjectStore, 

48 DiskObjectStore, 

49) 

50from dulwich.objects import Commit, ObjectID, Tag, Tree 

51from dulwich.refs import RefsContainer 

52 

53if TYPE_CHECKING: 

54 from .config import Config 

55 from .repo import BaseRepo, Repo 

56 

57 

# Loose-object count at which should_run_gc() decides an automatic gc is due.
DEFAULT_GC_AUTO = 6700
# Pack-file count at which should_run_gc() decides an automatic gc is due.
DEFAULT_GC_AUTO_PACK_LIMIT = 50
# Fallback pruning grace period used when gc.pruneExpire is not configured.
DEFAULT_GC_PRUNE_EXPIRE = 1209600  # 2 weeks in seconds

61 

62 

def get_prune_grace_period(config: "Config") -> int:
    """Determine the pruning grace period in seconds from ``gc.pruneExpire``.

    Falls back to the two-week default when the setting is absent.

    Args:
      config: Repository configuration

    Returns:
      Grace period in seconds

    Raises:
      ValueError: If the configured value cannot be parsed
    """
    from .approxidate import parse_approxidate

    try:
        configured = config.get(b"gc", b"pruneExpire")
    except KeyError:
        return DEFAULT_GC_PRUNE_EXPIRE

    text = configured.decode("utf-8") if isinstance(configured, bytes) else configured
    text = text.strip()
    if text == "now":
        # "now" means prune immediately: no grace period at all.
        return 0

    # parse_approxidate yields an absolute timestamp; the grace period is
    # how far in the past that timestamp lies (clamped at zero).
    cutoff = parse_approxidate(text)
    return max(0, int(time.time() - cutoff))

94 

95 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects pruned (or that would be pruned in a dry run).
    pruned_objects: set[ObjectID] = field(default_factory=set)
    # Total raw size in bytes of the pruned objects.
    bytes_freed: int = 0
    # Pack file count before the collection ran.
    packs_before: int = 0
    # Pack file count after the collection ran.
    packs_after: int = 0
    # Loose object count before the collection ran.
    loose_objects_before: int = 0
    # Loose object count after the collection ran.
    loose_objects_after: int = 0

106 

107 

108def find_reachable_objects( 

109 object_store: BaseObjectStore, 

110 refs_container: RefsContainer, 

111 include_reflogs: bool = True, 

112 progress: Callable[[str], None] | None = None, 

113) -> set[ObjectID]: 

114 """Find all reachable objects in the repository. 

115 

116 Args: 

117 object_store: Object store to search 

118 refs_container: Reference container 

119 include_reflogs: Whether to include reflog entries 

120 progress: Optional progress callback 

121 

122 Returns: 

123 Set of reachable object SHAs 

124 """ 

125 reachable: set[ObjectID] = set() 

126 pending: deque[ObjectID] = deque() 

127 

128 # Start with all refs 

129 for ref in refs_container.allkeys(): 

130 try: 

131 sha = refs_container[ref] # This follows symbolic refs 

132 if sha and sha not in reachable: 

133 pending.append(sha) 

134 reachable.add(sha) 

135 except KeyError: 

136 # Broken ref 

137 if progress: 

138 progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}") 

139 continue 

140 

141 # TODO: Add reflog support when reflog functionality is available 

142 

143 # Walk all reachable objects 

144 while pending: 

145 sha = pending.popleft() 

146 

147 if progress: 

148 progress(f"Checking object {sha.decode('ascii', 'replace')}") 

149 

150 try: 

151 obj = object_store[sha] 

152 except KeyError: 

153 continue 

154 

155 # Add referenced objects 

156 if isinstance(obj, Commit): 

157 # Tree 

158 if obj.tree not in reachable: 

159 pending.append(obj.tree) 

160 reachable.add(obj.tree) 

161 # Parents 

162 for parent in obj.parents: 

163 if parent not in reachable: 

164 pending.append(parent) 

165 reachable.add(parent) 

166 elif isinstance(obj, Tree): 

167 # Tree entries 

168 for entry in obj.items(): 

169 assert entry.sha is not None 

170 if entry.sha not in reachable: 

171 pending.append(entry.sha) 

172 reachable.add(entry.sha) 

173 elif isinstance(obj, Tag): 

174 # Tagged object 

175 if obj.object[1] not in reachable: 

176 pending.append(obj.object[1]) 

177 reachable.add(obj.object[1]) 

178 

179 return reachable 

180 

181 

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Collect the SHAs of every object NOT reachable from the refs.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )
    # Anything in the store that the reachability walk did not visit.
    return {sha for sha in object_store if sha not in reachable}

209 

210 

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[ObjectID], int]:
    """Delete unreachable objects, honouring an optional grace period.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    removed: set[ObjectID] = set()
    freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            # Objects younger than the grace period survive this pass.
            if grace_period is not None:
                try:
                    age = time.time() - object_store.get_object_mtime(sha)
                except KeyError:
                    # No mtime available for this object; skip it.
                    continue
                if age < grace_period:
                    if progress:
                        progress(
                            f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                        )
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Measure before deleting so the size is still available.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reaching here means deletion succeeded (or this is a dry run).
            removed.add(sha)
            freed += size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return removed, freed

277 

278 

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = DEFAULT_GC_PRUNE_EXPIRE,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc (currently informational only;
        no code path in this function inspects it)
      aggressive: Whether to use aggressive settings (currently
        informational only; no code path in this function inspects it)
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds; unreachable
        objects younger than this are kept. Defaults to
        DEFAULT_GC_PRUNE_EXPIRE (2 weeks).
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune: set[ObjectID] = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                # Recorded before the size lookup, so an object that vanishes
                # mid-run is still excluded from the repack below.
                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

    stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    # Best-effort: a failed unlink leaves the object loose but
                    # it is still excluded from the repack below.
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats

383 

384 

def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Decide whether automatic garbage collection ought to run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # The environment variable is the strongest off switch.
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Respect a programmatic opt-out set on the repository object.
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # gc.auto controls the loose-object threshold; 0 disables auto GC.
    try:
        threshold = int(config.get(b"gc", b"auto"))
    except KeyError:
        threshold = DEFAULT_GC_AUTO

    if threshold == 0:
        # Auto GC is disabled
        return False

    store = repo.object_store
    if not isinstance(store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    if store.count_loose_objects() >= threshold:
        return True

    # gc.autoPackLimit controls the pack-count threshold.
    try:
        pack_limit = int(config.get(b"gc", b"autoPackLimit"))
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    return pack_limit > 0 and store.count_pack_files() >= pack_limit

440 

441 

def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Consults :func:`should_run_gc` first, then honours an existing,
    unexpired ``gc.log`` file (evidence of a recent failed auto GC) by
    skipping collection, mirroring git's behaviour.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            # NOTE(review): any other unit (e.g. "2.weeks") silently falls
            # back to the 1-day default rather than raising.
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC, but surface the
            # previous failure's message via the logging framework.
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                # Already removed by a concurrent process; nothing to do.
                pass

        return True
    except OSError as e:
        # Write error to gc.log so the next auto GC attempt can see (and,
        # until it expires, skip because of) this failure.
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False