Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dulwich/gc.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

216 statements  

1"""Git garbage collection implementation.""" 

2 

3import collections 

4import os 

5import time 

6from dataclasses import dataclass, field 

7from typing import TYPE_CHECKING, Optional 

8 

9from dulwich.object_store import ( 

10 BaseObjectStore, 

11 DiskObjectStore, 

12 PackBasedObjectStore, 

13) 

14from dulwich.objects import Commit, ObjectID, Tag, Tree 

15from dulwich.refs import RefsContainer 

16 

17if TYPE_CHECKING: 

18 from .config import Config 

19 from .repo import BaseRepo 

20 

21 

22DEFAULT_GC_AUTO = 6700 

23DEFAULT_GC_AUTO_PACK_LIMIT = 50 

24 

25 

@dataclass
class GCStats:
    """Statistics from garbage collection."""

    # SHAs of objects that were pruned (or would be pruned in a dry run).
    pruned_objects: set[bytes] = field(default_factory=set)
    # Total size of pruned objects, as measured by len(obj.as_raw_string()).
    bytes_freed: int = 0
    # Pack-file count before and after collection.
    packs_before: int = 0
    packs_after: int = 0
    # Loose-object count before and after collection.
    loose_objects_before: int = 0
    loose_objects_after: int = 0

36 

37 

def find_reachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all reachable objects in the repository.

    Performs a breadth-first walk starting from every ref tip, following
    commit parents/trees, tree entries, and tag targets.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of reachable object SHAs
    """
    visited: set[bytes] = set()
    queue: collections.deque[ObjectID] = collections.deque()

    def _enqueue(sha: ObjectID) -> None:
        # Schedule a SHA for traversal exactly once.
        if sha not in visited:
            visited.add(sha)
            queue.append(sha)

    # Seed the walk with every ref tip (symbolic refs are followed by
    # the container's __getitem__).
    for ref in refs_container.allkeys():
        try:
            tip = refs_container[ref]  # This follows symbolic refs
        except KeyError:
            # Broken ref
            if progress:
                progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}")
            continue
        if tip:
            _enqueue(tip)

    # TODO: Add reflog support when reflog functionality is available

    # Breadth-first traversal of the object graph.
    while queue:
        sha = queue.popleft()

        if progress:
            progress(f"Checking object {sha.decode('ascii', 'replace')}")

        try:
            obj = object_store[sha]
        except KeyError:
            # Missing object; nothing more to follow from here.
            continue

        if isinstance(obj, Commit):
            # Root tree and parent commits.
            _enqueue(obj.tree)
            for parent in obj.parents:
                _enqueue(parent)
        elif isinstance(obj, Tree):
            # Blobs and subtrees referenced by this tree.
            for entry in obj.items():
                _enqueue(entry.sha)
        elif isinstance(obj, Tag):
            # The object the tag points at.
            _enqueue(obj.object[1])

    return visited

109 

110 

def find_unreachable_objects(
    object_store: BaseObjectStore,
    refs_container: RefsContainer,
    include_reflogs: bool = True,
    progress=None,
) -> set[bytes]:
    """Find all unreachable objects in the repository.

    Args:
      object_store: Object store to search
      refs_container: Reference container
      include_reflogs: Whether to include reflog entries
      progress: Optional progress callback

    Returns:
      Set of unreachable object SHAs
    """
    reachable = find_reachable_objects(
        object_store, refs_container, include_reflogs, progress
    )
    # Anything in the store that the ref walk did not visit is unreachable.
    return {sha for sha in object_store if sha not in reachable}

138 

139 

def prune_unreachable_objects(
    object_store: PackBasedObjectStore,
    refs_container: RefsContainer,
    grace_period: Optional[int] = None,
    dry_run: bool = False,
    progress=None,
) -> tuple[set[bytes], int]:
    """Remove unreachable objects from the repository.

    Args:
      object_store: Object store to prune
      refs_container: Reference container
      grace_period: Grace period in seconds (objects newer than this are kept)
      dry_run: If True, only report what would be deleted
      progress: Optional progress callback

    Returns:
      Tuple of (set of pruned object SHAs, total bytes freed)
    """
    candidates = find_unreachable_objects(
        object_store, refs_container, progress=progress
    )

    pruned: set[bytes] = set()
    bytes_freed = 0

    for sha in candidates:
        try:
            obj = object_store[sha]

            # Skip objects still inside the grace period.
            if grace_period is not None:
                try:
                    mtime = object_store.get_object_mtime(sha)
                except KeyError:
                    # No mtime available for this object; skip it.
                    continue
                age = time.time() - mtime
                if age < grace_period:
                    if progress:
                        progress(
                            f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                        )
                    continue

            if progress:
                progress(f"Pruning {sha.decode('ascii', 'replace')}")

            # Measure before deleting so a dry run reports the size too.
            size = len(obj.as_raw_string())

            if not dry_run:
                object_store.delete_loose_object(sha)

            # Reached only when the deletion succeeded (or was skipped by
            # dry_run), so the accounting stays accurate.
            pruned.add(sha)
            bytes_freed += size

        except KeyError:
            # Object already gone.
            pass
        except OSError as e:
            # File-system error while deleting; report and carry on.
            if progress:
                progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}")
    return pruned, bytes_freed

206 

207 

def garbage_collect(
    repo: "BaseRepo",
    auto: bool = False,
    aggressive: bool = False,
    prune: bool = True,
    grace_period: Optional[int] = 1209600,  # 2 weeks default
    dry_run: bool = False,
    progress=None,
) -> GCStats:
    """Run garbage collection on a repository.

    Sequence: size up prune candidates, pack refs, delete loose
    unreachable objects, repack (excluding pruned objects), then prune
    temporary files.

    Args:
      repo: Repository to garbage collect
      auto: Whether this is an automatic gc
      aggressive: Whether to use aggressive settings
      prune: Whether to prune unreachable objects
      grace_period: Grace period for pruning in seconds
      dry_run: If True, only report what would be done
      progress: Optional progress callback

    Returns:
      GCStats object with garbage collection statistics

    Note:
      ``auto`` and ``aggressive`` are accepted for API compatibility but
      are not currently consulted anywhere in this body.
    """
    stats = GCStats()

    object_store = repo.object_store
    refs_container = repo.refs

    # Count initial state
    stats.packs_before = len(list(object_store.packs))
    stats.loose_objects_before = object_store.count_loose_objects()

    # Find unreachable objects to exclude from repacking
    unreachable_to_prune = set()
    if prune:
        if progress:
            progress("Finding unreachable objects")
        unreachable = find_unreachable_objects(
            object_store, refs_container, progress=progress
        )

        # Apply grace period check
        for sha in unreachable:
            try:
                if grace_period is not None:
                    try:
                        mtime = object_store.get_object_mtime(sha)
                        age = time.time() - mtime
                        if age < grace_period:
                            if progress:
                                progress(
                                    f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)"
                                )
                            continue
                    except KeyError:
                        # Object not found, skip it
                        continue

                # Record the candidate and the bytes its deletion will
                # free (counted even on a dry run).
                unreachable_to_prune.add(sha)
                obj = object_store[sha]
                stats.bytes_freed += len(obj.as_raw_string())
            except KeyError:
                # Object disappeared while sizing it; it remains in
                # unreachable_to_prune, which is harmless for the repack
                # exclusion below.
                pass

        stats.pruned_objects = unreachable_to_prune

    # Pack refs
    if progress:
        progress("Packing references")
    if not dry_run:
        repo.refs.pack_refs()

    # Delete loose unreachable objects
    if prune and not dry_run:
        for sha in unreachable_to_prune:
            if object_store.contains_loose(sha):
                try:
                    object_store.delete_loose_object(sha)
                except OSError:
                    # Best-effort: a failed delete just leaves the loose
                    # object for a later gc.
                    pass

    # Repack everything, excluding unreachable objects
    # This handles both loose object packing and pack consolidation
    if progress:
        progress("Repacking repository")
    if not dry_run:
        if prune and unreachable_to_prune:
            # Repack excluding unreachable objects
            object_store.repack(exclude=unreachable_to_prune)
        else:
            # Normal repack
            object_store.repack()

    # Prune orphaned temporary files
    if progress:
        progress("Pruning temporary files")
    if not dry_run:
        object_store.prune(grace_period=grace_period)

    # Count final state
    stats.packs_after = len(list(object_store.packs))
    stats.loose_objects_after = object_store.count_loose_objects()

    return stats

312 

313 

314def should_run_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool: 

315 """Check if automatic garbage collection should run. 

316 

317 Args: 

318 repo: Repository to check 

319 config: Configuration to use (defaults to repo config) 

320 

321 Returns: 

322 True if GC should run, False otherwise 

323 """ 

324 # Check environment variable first 

325 if os.environ.get("GIT_AUTO_GC") == "0": 

326 return False 

327 

328 # Check programmatic disable flag 

329 if getattr(repo, "_autogc_disabled", False): 

330 return False 

331 

332 if config is None: 

333 config = repo.get_config() 

334 

335 # Check if auto GC is disabled 

336 try: 

337 gc_auto = config.get(b"gc", b"auto") 

338 gc_auto_value = int(gc_auto) 

339 except KeyError: 

340 gc_auto_value = DEFAULT_GC_AUTO 

341 

342 if gc_auto_value == 0: 

343 # Auto GC is disabled 

344 return False 

345 

346 # Check loose object count 

347 object_store = repo.object_store 

348 if not isinstance(object_store, DiskObjectStore): 

349 # Can't count loose objects on non-disk stores 

350 return False 

351 

352 loose_count = object_store.count_loose_objects() 

353 if loose_count >= gc_auto_value: 

354 return True 

355 

356 # Check pack file count 

357 try: 

358 gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit") 

359 pack_limit = int(gc_auto_pack_limit) 

360 except KeyError: 

361 pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT 

362 

363 if pack_limit > 0: 

364 pack_count = object_store.count_pack_files() 

365 if pack_count >= pack_limit: 

366 return True 

367 

368 return False 

369 

370 

def maybe_auto_gc(repo: "BaseRepo", config: Optional["Config"] = None) -> bool:
    """Run automatic garbage collection if needed.

    A recent ``gc.log`` file records a previous auto-gc failure; while it
    is younger than ``gc.logExpiry`` its contents are printed and gc is
    skipped instead of being retried.

    Args:
      repo: Repository to potentially GC
      config: Configuration to use (defaults to repo config)

    Returns:
      True if GC was run, False otherwise
    """
    if not should_run_gc(repo, config):
        return False

    # Check for gc.log file - only for disk-based repos
    if not hasattr(repo, "controldir"):
        # For non-disk repos, just run GC without gc.log handling
        garbage_collect(repo, auto=True)
        return True

    gc_log_path = os.path.join(repo.controldir(), "gc.log")
    if os.path.exists(gc_log_path):
        # Check gc.logExpiry
        if config is None:
            config = repo.get_config()
        try:
            log_expiry = config.get(b"gc", b"logExpiry")
        except KeyError:
            # Default to 1 day
            expiry_seconds = 86400
        else:
            # Parse time value (simplified - just support days for now)
            # NOTE(review): only "<n>.day"/"<n>.days" is recognized here;
            # any other spelling silently falls back to the 1-day default.
            if log_expiry.endswith((b".days", b".day")):
                days = int(log_expiry.split(b".")[0])
                expiry_seconds = days * 86400
            else:
                # Default to 1 day
                expiry_seconds = 86400

        # NOTE(review): os.stat can raise FileNotFoundError if gc.log is
        # removed between the exists() check above and this call — confirm
        # whether that race matters for auto-gc callers.
        stat_info = os.stat(gc_log_path)
        if time.time() - stat_info.st_mtime < expiry_seconds:
            # gc.log exists and is not expired - skip GC and surface the
            # recorded failure message instead.
            with open(gc_log_path, "rb") as f:
                print(f.read().decode("utf-8", errors="replace"))
            return False

    # TODO: Support gc.autoDetach to run in background
    # For now, run in foreground

    try:
        # Run GC with auto=True flag
        garbage_collect(repo, auto=True)

        # Remove gc.log on successful completion
        if os.path.exists(gc_log_path):
            try:
                os.unlink(gc_log_path)
            except FileNotFoundError:
                pass

        return True
    except OSError as e:
        # Write error to gc.log
        with open(gc_log_path, "wb") as f:
            f.write(f"Auto GC failed: {e}\n".encode())
        # Don't propagate the error - auto GC failures shouldn't break operations
        return False