Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

220 statements  

1"""Git garbage collection implementation.""" 

2 

3__all__ = [ 

4 "DEFAULT_GC_AUTO", 

5 "DEFAULT_GC_AUTO_PACK_LIMIT", 

6 "GCStats", 

7 "find_reachable_objects", 

8 "find_unreachable_objects", 

9 "garbage_collect", 

10 "maybe_auto_gc", 

11 "prune_unreachable_objects", 

12 "should_run_gc", 

13] 

14 

15import logging 

16import os 

17import time 

18from collections import deque 

19from collections.abc import Callable 

20from dataclasses import dataclass, field 

21from typing import TYPE_CHECKING 

22 

23from dulwich.object_store import ( 

24 BaseObjectStore, 

25 DiskObjectStore, 

26) 

27from dulwich.objects import Commit, ObjectID, Tag, Tree 

28from dulwich.refs import RefsContainer 

29 

30if TYPE_CHECKING: 

31 from .config import Config 

32 from .repo import BaseRepo, Repo 

33 

34 

35DEFAULT_GC_AUTO = 6700 

36DEFAULT_GC_AUTO_PACK_LIMIT = 50 

37 

38 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects that were pruned (or would be pruned on a dry run).
    pruned_objects: set[ObjectID] = field(default_factory=set)
    # Total raw size in bytes of the pruned objects.
    bytes_freed: int = 0
    # Number of pack files before the collection ran.
    packs_before: int = 0
    # Number of pack files after the collection ran.
    packs_after: int = 0
    # Number of loose objects before the collection ran.
    loose_objects_before: int = 0
    # Number of loose objects after the collection ran.
    loose_objects_after: int = 0

49 

50 

def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all reachable objects in the repository.

    Performs a breadth-first walk of the object graph starting from every
    ref tip, following commit trees/parents, tree entries, and tag targets.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    seen: set[ObjectID] = set()
    queue: deque[ObjectID] = deque()

    def enqueue(candidate: ObjectID) -> None:
        # Schedule each SHA for traversal at most once.
        if candidate not in seen:
            queue.append(candidate)
            seen.add(candidate)

    # Seed the walk with every ref; indexing resolves symbolic refs.
    for ref in refs_container.allkeys():
        try:
            tip = refs_container[ref]  # This follows symbolic refs
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue
        if tip:
            enqueue(tip)

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while queue:
        sha = queue.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Dangling reference; nothing to expand.
            continue

        # Expand the references held by this object, by type.
        if isinstance(obj, Commit):
            enqueue(obj.tree)
            for parent in obj.parents:
                enqueue(parent)
        elif isinstance(obj, Tree):
            for entry in obj.items():
                assert entry.sha is not None
                enqueue(entry.sha)
        elif isinstance(obj, Tag):
            enqueue(obj.object[1])
        # Blobs carry no further references.

    return seen

123 

124 

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[ObjectID]:
    """Find all unreachable objects in the repository.

    Computes the complement of the reachable set over every SHA the
    object store can enumerate.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )
    # Everything the store knows about that the walk never touched.
    return {sha for sha in object_store if sha not in reachable}

152 

153 

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[ObjectID], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned: set[ObjectID] = set()
    bytes_freed = 0

    for sha in unreachable:
        # The outer try guards both the fetch below and the deletion;
        # a KeyError from either means the object is already gone.
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                # Inner try: get_object_mtime raising KeyError must skip
                # only this object, not be confused with a missing object
                # at fetch/delete time handled by the outer try.
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return pruned, bytes_freed

220 

221 

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    The phases run in a fixed order: scan for prunable objects, pack
    refs, delete loose unreachable objects, repack (excluding pruned
    objects), then prune temporary files.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        # NOTE(review): this duplicates the grace/size logic of
        # prune_unreachable_objects but defers deletion until after
        # ref packing below.
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    # Best-effort deletion; leftover loose objects are
                    # still excluded from the repack below.
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats

326 

327 

def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Environment override wins over everything else.
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Programmatic disable flag on the repository object.
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # gc.auto: loose-object threshold; 0 disables auto GC entirely.
    try:
        loose_threshold = int(config.get(b"gc", b"auto"))
    except KeyError:
        loose_threshold = DEFAULT_GC_AUTO

    if loose_threshold == 0:
        # Auto GC is disabled
        return False

    store = repo.object_store
    if not isinstance(store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    # Trigger on too many loose objects.
    if store.count_loose_objects() >= loose_threshold:
        return True

    # gc.autoPackLimit: pack-count threshold; <= 0 disables this check.
    try:
        pack_limit = int(config.get(b"gc", b"autoPackLimit"))
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    return pack_limit > 0 and store.count_pack_files() >= pack_limit

383 

384 

def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    A persistent ``gc.log`` file (written by a previous failed auto GC)
    suppresses further auto GC until it expires (``gc.logExpiry``).

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                # Raced with another process removing it; fine either way.
                pass

        return True
    except OSError as e:
        # Write error to gc.log so the next auto GC attempt is suppressed
        # until the log expires.
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False