Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

216 statements  

1"""Git garbage collection implementation.""" 

2 

3import collections 

4import os 

5import time 

6from dataclasses import dataclass, field 

7from typing import TYPE_CHECKING, Callable, Optional 

8 

9from dulwich.object_store import ( 

10 BaseObjectStore, 

11 DiskObjectStore, 

12) 

13from dulwich.objects import Commit, ObjectID, Tag, Tree 

14from dulwich.refs import RefsContainer 

15 

16if TYPE_CHECKING: 

17 from .config import Config 

18 from .repo import BaseRepo, Repo 

19 

20 

21DEFAULT_GC_AUTO = 6700 

22DEFAULT_GC_AUTO_PACK_LIMIT = 50 

23 

24 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects pruned (or, on a dry run, that would be pruned).
    pruned_objects: set[bytes] = field(default_factory=set)
    # Total raw size in bytes of the pruned objects.
    bytes_freed: int = 0
    # Number of pack files before / after collection.
    packs_before: int = 0
    packs_after: int = 0
    # Number of loose objects before / after collection.
    loose_objects_before: int = 0
    loose_objects_after: int = 0

35 

36 

def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Performs a breadth-first walk seeded from every ref, following commit
    trees and parents, tree entries, and tag targets.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    reachable: set[bytes] = set()
    queue: collections.deque[ObjectID] = collections.deque()

    def visit(sha: ObjectID) -> None:
        # Schedule an object for traversal, at most once.
        if sha not in reachable:
            reachable.add(sha)
            queue.append(sha)

    # Seed the walk with the target of every ref.
    for ref in refs_container.allkeys():
        try:
            target = refs_container[ref]  # This follows symbolic refs
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue
        if target:
            visit(target)

    # TODO: Add reflog support when reflog functionality is available

    # Breadth-first traversal of everything the refs point at.
    while queue:
        sha = queue.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Referenced object missing from the store; it stays in the
            # reachable set but cannot be expanded further.
            continue

        if isinstance(obj, Commit):
            visit(obj.tree)
            for parent in obj.parents:
                visit(parent)
        elif isinstance(obj, Tree):
            for entry in obj.items():
                visit(entry.sha)
        elif isinstance(obj, Tag):
            visit(obj.object[1])

    return reachable

108 

109 

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress: Optional[Callable[[str], None]] = None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )

    # Anything in the store that the reachability walk did not touch
    # is garbage.
    return {sha for sha in object_store if sha not in reachable}

137 

138 

def prune_unreachable_objects(
    object_store: DiskObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned: set[bytes] = set()
    bytes_freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            if grace_period is not None:
                # Objects younger than the grace period are retained; when
                # the mtime cannot be determined the object is left alone.
                try:
                    age = time.time() - object_store.get_object_mtime(sha)
                except KeyError:
                    continue
                if age < grace_period:
                    if progress:
                        progress(
                            f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                        )
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Size is measured before deletion so a dry run can report it.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reached only when deletion succeeded (or this is a dry run).
            pruned.add(sha)
            bytes_freed += size

        except KeyError:
            # Object disappeared between discovery and deletion.
            pass
        except OSError as e:
            # File system errors during deletion
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return pruned, bytes_freed

205 

206 

def garbage_collect(
    repo: "Repo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> GCStats:
    """Run garbage collection on a repository.

    Orchestrates the full collection: find prunable unreachable objects,
    pack refs, delete loose unreachable objects, repack, and prune
    temporary files.

    Note: ``auto`` and ``aggressive`` are accepted for API parity but are
    not consulted anywhere in this implementation.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics. On a dry run,
      ``pruned_objects`` and ``bytes_freed`` still reflect what *would*
      have been pruned.
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune: set[bytes] = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                # The sha is scheduled for pruning *before* the object is
                # fetched: if the fetch below raises KeyError, the sha stays
                # in the prune set (only its size goes uncounted).
                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    # Best-effort: a failed unlink leaves the object for a
                    # later gc run.
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats

311 

312 

313def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool: 

314 """Check if automatic garbage collection should run. 

315 

316 Args: 

317 repo: Repository to check 

318 config: Configuration to use (defaults to repo config) 

319 

320 Returns: 

321 True if GC should run, False otherwise 

322 """ 

323 # Check environment variable first 

324 if os.environ.get("GIT_AUTO_GC") == "0": 

325 return False 

326 

327 # Check programmatic disable flag 

328 if getattr(repo, "_autogc_disabled", False): 

329 return False 

330 

331 if config is None: 

332 config = repo.get_config() 

333 

334 # Check if auto GC is disabled 

335 try: 

336 gc_auto = config.get(b"gc", b"auto") 

337 gc_auto_value = int(gc_auto) 

338 except KeyError: 

339 gc_auto_value = DEFAULT_GC_AUTO 

340 

341 if gc_auto_value == 0: 

342 # Auto GC is disabled 

343 return False 

344 

345 # Check loose object count 

346 object_store = repo.object_store 

347 if not isinstance(object_store, DiskObjectStore): 

348 # Can't count loose objects on non-disk stores 

349 return False 

350 

351 loose_count = object_store.count_loose_objects() 

352 if loose_count >= gc_auto_value: 

353 return True 

354 

355 # Check pack file count 

356 try: 

357 gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit") 

358 pack_limit = int(gc_auto_pack_limit) 

359 except KeyError: 

360 pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT 

361 

362 if pack_limit > 0: 

363 pack_count = object_store.count_pack_files() 

364 if pack_count >= pack_limit: 

365 return True 

366 

367 return False 

368 

369 

def maybe_auto_gc(repo: "Repo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    A ``gc.log`` file in the control directory (written by the failure
    handler below) suppresses further auto-gc until it expires per
    ``gc.logExpiry``; while suppressed, its contents are echoed to stdout.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            # NOTE(review): only values shaped like b"2.days" / b"1.day"
            # are parsed; git-style b"1.day.ago" falls through to the
            # default — confirm whether fuller parsing is needed.
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                # Raced with another process removing it; nothing to do.
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False