Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

221 statements  

1# gc.py -- Git garbage collection implementation 

2# Copyright (C) 2025 Jelmer Vernooij <jelmer@jelmer.uk> 

3# 

4# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 

5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

6# General Public License as published by the Free Software Foundation; version 2.0 

7# or (at your option) any later version. You can redistribute it and/or 

8# modify it under the terms of either of these two licenses. 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, 

12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

13# See the License for the specific language governing permissions and 

14# limitations under the License. 

15# 

16# You should have received a copy of the licenses; if not, see 

17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

19# License, Version 2.0. 

20# 

21 

22"""Git garbage collection implementation.""" 

23 

# Public API of this module (kept sorted alphabetically).
__all__ = [
    "DEFAULT_GC_AUTO",
    "DEFAULT_GC_AUTO_PACK_LIMIT",
    "GCStats",
    "find_reachable_objects",
    "find_unreachable_objects",
    "garbage_collect",
    "maybe_auto_gc",
    "prune_unreachable_objects",
    "should_run_gc",
]

35 

36import logging 

37import os 

38import time 

39from collections import deque 

40from collections.abc import Callable 

41from dataclasses import dataclass, field 

42from typing import TYPE_CHECKING 

43 

44from dulwich.object_store import ( 

45 BaseObjectStore, 

46 DiskObjectStore, 

47) 

48from dulwich.objects import Commit, ObjectID, Tag, Tree 

49from dulwich.refs import RefsContainer 

50 

51if TYPE_CHECKING: 

52 from .config import Config 

53 from .repo import BaseRepo, Repo 

54 

55 

# Loose-object count that triggers an automatic gc when gc.auto is unset
# (matches git's default for gc.auto).
DEFAULT_GC_AUTO = 6700
# Pack-file count that triggers an automatic repack when gc.autoPackLimit
# is unset (matches git's default for gc.autoPackLimit).
DEFAULT_GC_AUTO_PACK_LIMIT = 50

58 

59 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects that were pruned (or would be, in a dry run).
    pruned_objects: set[ObjectID] = field(default_factory=set)
    # Total size in bytes of the pruned objects.
    bytes_freed: int = 0
    # Number of pack files before the gc run.
    packs_before: int = 0
    # Number of pack files after the gc run.
    packs_after: int = 0
    # Number of loose objects before the gc run.
    loose_objects_before: int = 0
    # Number of loose objects after the gc run.
    loose_objects_after: int = 0

70 

71 

def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Collect the SHAs of every object reachable from the refs.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    seen: set[ObjectID] = set()
    queue: deque[ObjectID] = deque()

    def enqueue(sha: ObjectID) -> None:
        # Schedule a SHA for traversal unless it was already visited.
        if sha not in seen:
            seen.add(sha)
            queue.append(sha)

    # Seed the walk with every ref tip; the container's __getitem__
    # resolves symbolic refs for us.
    for refname in refs_container.allkeys():
        try:
            tip = refs_container[refname]
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {refname.decode('utf-8', 'replace')}")
            continue
        if tip:
            enqueue(tip)

    # TODO: Add reflog support when reflog functionality is available

    # Breadth-first walk over the object graph.
    while queue:
        sha = queue.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Missing object: treat it as a dead end.
            continue

        if isinstance(obj, Commit):
            # A commit references its tree and its parent commits.
            enqueue(obj.tree)
            for parent in obj.parents:
                enqueue(parent)
        elif isinstance(obj, Tree):
            # A tree references every entry (blobs and subtrees).
            for entry in obj.items():
                assert entry.sha is not None
                enqueue(entry.sha)
        elif isinstance(obj, Tag):
            # A tag references exactly one tagged object.
            enqueue(obj.object[1])

    return seen

144 

145 

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )
    # Anything in the store that the reachability walk never visited
    # is garbage.
    return {sha for sha in object_store if sha not in reachable}

173 

174 

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[ObjectID], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned: set[ObjectID] = set()
    bytes_freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            # Honour the grace period: recently written objects survive
            # this run even though they are unreachable.
            if grace_period is not None:
                try:
                    age = time.time() - object_store.get_object_mtime(sha)
                except KeyError:
                    # No mtime available for the object; skip it.
                    continue
                if age < grace_period:
                    if progress:
                        progress(
                            f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                        )
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Measure size before the object disappears from disk.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reached only when deletion succeeded (or this is a dry run).
            pruned.add(sha)
            bytes_freed += size

        except KeyError:
            # Object vanished already; nothing to do.
            pass
        except OSError as e:
            # File-system errors during deletion: report and move on.
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")

    return pruned, bytes_freed

241 

242 

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    store = repo.object_store
    refs = repo.refs

    # Snapshot the starting state for the statistics report.
    stats.packs_before = len(list(store.packs))
    stats.loose_objects_before = store.count_loose_objects()

    # Determine which unreachable objects are old enough to prune; they
    # are excluded from the repack below.
    to_prune: set[ObjectID] = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        for sha in find_unreachable_objects(store, refs, progress=progress):
            try:
                if grace_period is not None:
                    try:
                        age = time.time() - store.get_object_mtime(sha)
                    except KeyError:
                        # No mtime available for the object; skip it.
                        continue
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue

                to_prune.add(sha)
                stats.bytes_freed += len(store[sha].as_raw_string())
            except KeyError:
                pass

    stats.pruned_objects = to_prune

    # Consolidate loose refs into packed-refs.
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Drop the loose copies of the objects selected for pruning.
    if prune and not dry_run:
        for sha in to_prune:
            if store.contains_loose(sha):
                try:
                    store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects.  This handles
    # both loose-object packing and pack consolidation.
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and to_prune:
            store.repack(exclude=to_prune, progress=progress)
        else:
            store.repack(progress=progress)

    # Prune orphaned temporary files.
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        store.prune(grace_period=grace_period)

    # Snapshot the final state.
    stats.packs_after = len(list(store.packs))
    stats.loose_objects_after = store.count_loose_objects()

    return stats

347 

348 

def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # The environment override wins over everything else.
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Respect the programmatic opt-out flag on the repo object.
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # gc.auto == 0 disables automatic gc entirely.
    try:
        threshold = int(config.get(b"gc", b"auto"))
    except KeyError:
        threshold = DEFAULT_GC_AUTO

    if threshold == 0:
        return False

    store = repo.object_store
    if not isinstance(store, DiskObjectStore):
        # Loose objects can only be counted on disk-backed stores.
        return False

    # Too many loose objects?
    if store.count_loose_objects() >= threshold:
        return True

    # Too many pack files?
    try:
        pack_limit = int(config.get(b"gc", b"autoPackLimit"))
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0 and store.count_pack_files() >= pack_limit:
        return True

    return False

404 

405 

def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Non-disk repos have no control directory, hence no gc.log handling.
    if not hasattr(repo, "controldir"):
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # A gc.log left behind by a failed auto-gc suppresses further
        # runs until gc.logExpiry has elapsed.
        if config is None:
            config = repo.get_config()
        expiry_seconds = 86400  # default: 1 day
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            pass
        else:
            # Parse time value (simplified - just support days for now).
            if log_expiry.endswith((b".days", b".day")):
                expiry_seconds = int(log_expiry.split(b".")[0]) * 86400

        if time.time() - os.stat(gc_log_path).st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC.
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        garbage_collect(repo, auto=True, progress=progress)

        # A successful run invalidates any stale gc.log.
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Record the failure so the next auto-gc honours gc.logExpiry.
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False