Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

219 statements  

1"""Git garbage collection implementation.""" 

2 

3import logging 

4import os 

5import time 

6from collections import deque 

7from collections.abc import Callable 

8from dataclasses import dataclass, field 

9from typing import TYPE_CHECKING 

10 

11from dulwich.object_store import ( 

12 BaseObjectStore, 

13 DiskObjectStore, 

14) 

15from dulwich.objects import Commit, ObjectID, Tag, Tree 

16from dulwich.refs import RefsContainer 

17 

18if TYPE_CHECKING: 

19 from .config import Config 

20 from .repo import BaseRepo, Repo 

21 

22 

23DEFAULT_GC_AUTO = 6700 

24DEFAULT_GC_AUTO_PACK_LIMIT = 50 

25 

26 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects pruned (or that would be pruned in a dry run).
    pruned_objects: set[bytes] = field(default_factory=set)
    # Total raw (uncompressed) size of the pruned objects, in bytes.
    bytes_freed: int = 0
    # Pack file count before / after the collection run.
    packs_before: int = 0
    packs_after: int = 0
    # Loose object count before / after the collection run.
    loose_objects_before: int = 0
    loose_objects_after: int = 0

37 

38 

def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable: set[bytes] = set()
    pending: deque[ObjectID] = deque()

    def enqueue(sha: ObjectID) -> None:
        # Schedule an object for traversal exactly once.
        if sha not in reachable:
            reachable.add(sha)
            pending.append(sha)

    # Seed the walk with every ref; indexing follows symbolic refs.
    for ref in refs_container.allkeys():
        try:
            sha = refs_container[ref]  # This follows symbolic refs
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue
        if sha:
            enqueue(sha)

    # TODO: Add reflog support when reflog functionality is available

    # Breadth-first walk over the object graph.
    while pending:
        sha = pending.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Missing object: skip, its children are unknowable.
            continue

        # Enqueue everything the object references.
        if isinstance(obj, Commit):
            enqueue(obj.tree)
            for parent in obj.parents:
                enqueue(parent)
        elif isinstance(obj, Tree):
            for entry in obj.items():
                assert entry.sha is not None
                enqueue(entry.sha)
        elif isinstance(obj, Tag):
            enqueue(obj.object[1])

    return reachable

111 

112 

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Callable[[str], None] | None = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    # Everything the store knows about that the reachability walk missed.
    return {sha for sha in object_store if sha not in reachable}

140 

141 

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: int | None = None,
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    removed: set[bytes] = set()
    freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            if grace_period is not None:
                # Anything younger than the grace period is kept.
                try:
                    mtime = object_store.get_object_mtime(sha)
                except KeyError:
                    # No mtime available for the object; leave it alone.
                    continue
                age = time.time() - mtime
                if age < grace_period:
                    if progress:
                        progress(
                            f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                        )
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Measure before deleting so the size is still retrievable.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reaching here means deletion succeeded (or this is a dry run).
            removed.add(sha)
            freed += size
        except KeyError:
            # Object vanished while we were working; nothing to do.
            pass
        except OSError as e:
            # File system errors during deletion are reported, not fatal.
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return removed, freed

208 

209 

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: int | None = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Callable[[str], None] | None = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
        GCStats object with garbage collection statistics

    NOTE(review): ``auto`` and ``aggressive`` are accepted but not consulted
    anywhere in this body — presumably reserved for future behavior tuning.
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune: set[bytes] = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                unreachable_to_prune.add(sha)
                # Size is accumulated even on dry runs so stats reflect what
                # a real run would free.
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                # Object disappeared between discovery and measurement.
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects (best effort: a failed delete is
    # simply left behind for a later run).
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune, progress=progress)
        else:
            # Normal repack
            object_store.repack(progress=progress)

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats

314 

315 

def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool:
    """Check if automatic garbage collection should run.

    Args:
      repo: Repository to check
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC should run, False otherwise
    """
    # Check environment variable first
    if os.environ.get("GIT_AUTO_GC") == "0":
        return False

    # Check programmatic disable flag
    if getattr(repo, "_autogc_disabled", False):
        return False

    if config is None:
        config = repo.get_config()

    # Check if auto GC is disabled. A missing *or unparsable* value falls
    # back to the default instead of aborting the caller's operation
    # (previously a non-numeric gc.auto raised ValueError).
    try:
        gc_auto_value = int(config.get(b"gc", b"auto"))
    except (KeyError, ValueError):
        gc_auto_value = DEFAULT_GC_AUTO

    if gc_auto_value == 0:
        # Auto GC is disabled
        return False

    # Check loose object count
    object_store = repo.object_store
    if not isinstance(object_store, DiskObjectStore):
        # Can't count loose objects on non-disk stores
        return False

    loose_count = object_store.count_loose_objects()
    if loose_count >= gc_auto_value:
        return True

    # Check pack file count; same fallback-on-bad-value policy as above.
    try:
        pack_limit = int(config.get(b"gc", b"autoPackLimit"))
    except (KeyError, ValueError):
        pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT

    if pack_limit > 0:
        pack_count = object_store.count_pack_files()
        if pack_count >= pack_limit:
            return True

    return False

371 

372 

def maybe_auto_gc(
    repo: "Repo",
    config: "Config | None" = None,
    progress: Callable[[str], None] | None = None,
) -> bool:
    """Run automatic garbage collection if needed.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)
      progress: Optional progress reporting callback

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True, progress=progress)
        return True

    # A gc.log left behind by a previous failed auto-GC suppresses further
    # attempts until it expires (written in the OSError handler below).
    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            # e.g. b"2.days" or b"1.day"; anything else uses the default.
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC, but surface the
            # previous failure message via logging.
            with open(gc_log_path, "rb") as f:
                logging.info(
                    "gc.log content: %s", f.read().decode("utf-8", errors="replace")
                )
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True, progress=progress)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                # Another process removed it first; that's fine.
                pass

        return True
    except OSError as e:
        # Write error to gc.log so subsequent auto-GC attempts back off
        # until the entry expires (see the expiry check above).
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False