Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%


1"""Git garbage collection implementation.""" 

2 

3import collections 

4import logging 

5import os 

6import time 

7from dataclasses import dataclass, field 

8from typing import TYPE_CHECKING, Callable, Optional 

9 

10from dulwich.object_store import ( 

11 BaseObjectStore, 

12 DiskObjectStore, 

13) 

14from dulwich.objects import Commit, ObjectID, Tag, Tree 

15from dulwich.refs import RefsContainer 

16 

17if TYPE_CHECKING: 

18 from .config import Config 

19 from .repo import BaseRepo, Repo 

20 

21 

22DEFAULT_GC_AUTO = 6700 

23DEFAULT_GC_AUTO_PACK_LIMIT = 50 

24 

25 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    pruned_objects: set[bytes] = field(default_factory=set)
    bytes_freed: int = 0
    packs_before: int = 0
    packs_after: int = 0
    loose_objects_before: int = 0
    loose_objects_after: int = 0



def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable = set()
    pending: collections.deque[ObjectID] = collections.deque()

    # Start with all refs
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
            if sha and sha not in reachable:
                pending.append(sha)
                reachable.add(sha)
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue

    # TODO: Add reflog support when reflog functionality is available

    # Walk all reachable objects
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            continue

        # Add referenced objects
        if isinstance(obj, Commit):
            # Tree
            if obj.tree not in reachable:
                pending.append(obj.tree)
                reachable.add(obj.tree)
            # Parents
            for parent in obj.parents:
                if parent not in reachable:
                    pending.append(parent)
                    reachable.add(parent)
        elif isinstance(obj, Tree):
            # Tree entries
            for entry in obj.items():
                assert entry.sha is not None
                if entry.sha not in reachable:
                    pending.append(entry.sha)
                    reachable.add(entry.sha)
        elif isinstance(obj, Tag):
            # Tagged object
            if obj.object[1] not in reachable:
                pending.append(obj.object[1])
                reachable.add(obj.object[1])

    return reachable


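# Usage sketch (illustrative, not part of the original module): driving the
# reachability walk directly from an on-disk repository. The path is
# hypothetical.
#
#     from dulwich.repo import Repo
#
#     repo = Repo("/tmp/example-repo")
#     reachable = find_reachable_objects(repo.object_store, repo.refs,
#                                        progress=print)
#     print(f"{len(reachable)} reachable objects")

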

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    unreachable = set()
    for sha in object_store:
        if sha not in reachable:
            unreachable.add(sha)

    return unreachable


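# Continuing the sketch above: the complement of the reachable set over the
# whole store is what pruning would consider for deletion.
#
#     unreachable = find_unreachable_objects(repo.object_store, repo.refs)
#     print(f"{len(unreachable)} objects are candidates for pruning")

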

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    unreachable = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned = set()
    bytes_freed = 0

    for sha in unreachable:
        try:
            obj = object_store[sha]

            # Check grace period
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                    age = time.time() - mtime
                    if age < grace_period:
                        if progress:
                            progress(
                                f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                            )
                        continue
                except KeyError:
                    # Object not found, skip it
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Calculate size before attempting deletion
            obj_size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Only count as pruned if we get here (deletion succeeded or dry run)
            pruned.add(sha)
            bytes_freed += obj_size

        except KeyError:
            # Object already gone
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return pruned, bytes_freed


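# Usage sketch (illustrative): with dry_run=True, this reports what would be
# deleted without touching disk, which is a safe way to preview a prune. With
# the default grace_period=None, no age check is applied. `repo` is opened as
# in the earlier sketch.
#
#     pruned, freed = prune_unreachable_objects(
#         repo.object_store, repo.refs, dry_run=True, progress=print
#     )
#     print(f"would prune {len(pruned)} objects, freeing {freed} bytes")

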

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats


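# Usage sketch (illustrative): a full GC pass with the default two-week grace
# period, reading the before/after counters from the returned GCStats. `repo`
# is opened as in the earlier sketch.
#
#     stats = garbage_collect(repo, prune=True, progress=print)
#     print(f"packs: {stats.packs_before} -> {stats.packs_after}")
#     print(f"loose: {stats.loose_objects_before} -> {stats.loose_objects_after}")
#     print(f"pruned {len(stats.pruned_objects)} objects, "
#           f"freed {stats.bytes_freed} bytes")

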

def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled
    try:
        gc_auto = config.get(b"gc", b"auto")
        gc_auto_value = int(gc_auto)
    except KeyError:
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count
    try:
        gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit")
        pack_limit = int(gc_auto_pack_limit)
    except KeyError:
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False


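# Usage sketch (illustrative): the GIT_AUTO_GC environment variable overrides
# any configuration, so setting it to "0" always disables the check. `repo` is
# opened as in the earlier sketch.
#
#     import os
#
#     os.environ["GIT_AUTO_GC"] = "0"
#     assert should_run_gc(repo) is False
#     del os.environ["GIT_AUTO_GC"]
#     # With the override gone, the gc.auto (default 6700 loose objects) and
#     # gc.autoPackLimit (default 50 packs) thresholds decide the result.

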

def maybe_auto_gc(
    repo: "Repo",
    config: Optional["Config"] = None,
    progress: Optional[Callable[[str], None]] = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False
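

# Usage sketch (illustrative): callers typically invoke this after an
# operation that may have created many loose objects; it is a no-op unless
# should_run_gc reports the thresholds were crossed and no fresh gc.log
# blocks it. `repo` is opened as in the earlier sketch.
#
#     if maybe_auto_gc(repo, progress=print):
#         print("auto GC ran")
#     else:
#         print("auto GC skipped")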