Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/dulwich/diff_tree.py: 21%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

340 statements  

1# diff_tree.py -- Utilities for diffing files and trees. 

2# Copyright (C) 2010 Google, Inc. 

3# 

4# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 

5# General Public License as public by the Free Software Foundation; version 2.0 

6# or (at your option) any later version. You can redistribute it and/or 

7# modify it under the terms of either of these two licenses. 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# 

15# You should have received a copy of the licenses; if not, see 

16# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 

17# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 

18# License, Version 2.0. 

19# 

20 

21"""Utilities for diffing files and trees.""" 

22 

23import stat 

24from collections import defaultdict, namedtuple 

25from io import BytesIO 

26from itertools import chain 

27from typing import Dict, List, Optional 

28 

29from .object_store import BaseObjectStore 

30from .objects import S_ISGITLINK, ObjectID, ShaFile, Tree, TreeEntry 

31 

32# TreeChange type constants. 

33CHANGE_ADD = "add" 

34CHANGE_MODIFY = "modify" 

35CHANGE_DELETE = "delete" 

36CHANGE_RENAME = "rename" 

37CHANGE_COPY = "copy" 

38CHANGE_UNCHANGED = "unchanged" 

39 

40RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY) 

41 

42_NULL_ENTRY = TreeEntry(None, None, None) 

43 

44_MAX_SCORE = 100 

45RENAME_THRESHOLD = 60 

46MAX_FILES = 200 

47REWRITE_THRESHOLD = None 

48 

49 

50class TreeChange(namedtuple("TreeChange", ["type", "old", "new"])): 

51 """Named tuple a single change between two trees.""" 

52 

53 @classmethod 

54 def add(cls, new): 

55 return cls(CHANGE_ADD, _NULL_ENTRY, new) 

56 

57 @classmethod 

58 def delete(cls, old): 

59 return cls(CHANGE_DELETE, old, _NULL_ENTRY) 

60 

61 

62def _tree_entries(path: str, tree: Tree) -> List[TreeEntry]: 

63 result: List[TreeEntry] = [] 

64 if not tree: 

65 return result 

66 for entry in tree.iteritems(name_order=True): 

67 result.append(entry.in_path(path)) 

68 return result 

69 

70 

71def _merge_entries(path, tree1, tree2): 

72 """Merge the entries of two trees. 

73 

74 Args: 

75 path: A path to prepend to all tree entry names. 

76 tree1: The first Tree object to iterate, or None. 

77 tree2: The second Tree object to iterate, or None. 

78 

79 Returns: 

80 A list of pairs of TreeEntry objects for each pair of entries in 

81 the trees. If an entry exists in one tree but not the other, the other 

82 entry will have all attributes set to None. If neither entry's path is 

83 None, they are guaranteed to match. 

84 """ 

85 entries1 = _tree_entries(path, tree1) 

86 entries2 = _tree_entries(path, tree2) 

87 i1 = i2 = 0 

88 len1 = len(entries1) 

89 len2 = len(entries2) 

90 

91 result = [] 

92 while i1 < len1 and i2 < len2: 

93 entry1 = entries1[i1] 

94 entry2 = entries2[i2] 

95 if entry1.path < entry2.path: 

96 result.append((entry1, _NULL_ENTRY)) 

97 i1 += 1 

98 elif entry1.path > entry2.path: 

99 result.append((_NULL_ENTRY, entry2)) 

100 i2 += 1 

101 else: 

102 result.append((entry1, entry2)) 

103 i1 += 1 

104 i2 += 1 

105 for i in range(i1, len1): 

106 result.append((entries1[i], _NULL_ENTRY)) 

107 for i in range(i2, len2): 

108 result.append((_NULL_ENTRY, entries2[i])) 

109 return result 

110 

111 

112def _is_tree(entry): 

113 mode = entry.mode 

114 if mode is None: 

115 return False 

116 return stat.S_ISDIR(mode) 

117 

118 

119def walk_trees(store, tree1_id, tree2_id, prune_identical=False): 

120 """Recursively walk all the entries of two trees. 

121 

122 Iteration is depth-first pre-order, as in e.g. os.walk. 

123 

124 Args: 

125 store: An ObjectStore for looking up objects. 

126 tree1_id: The SHA of the first Tree object to iterate, or None. 

127 tree2_id: The SHA of the second Tree object to iterate, or None. 

128 prune_identical: If True, identical subtrees will not be walked. 

129 

130 Returns: 

131 Iterator over Pairs of TreeEntry objects for each pair of entries 

132 in the trees and their subtrees recursively. If an entry exists in one 

133 tree but not the other, the other entry will have all attributes set 

134 to None. If neither entry's path is None, they are guaranteed to 

135 match. 

136 """ 

137 # This could be fairly easily generalized to >2 trees if we find a use 

138 # case. 

139 mode1 = tree1_id and stat.S_IFDIR or None 

140 mode2 = tree2_id and stat.S_IFDIR or None 

141 todo = [(TreeEntry(b"", mode1, tree1_id), TreeEntry(b"", mode2, tree2_id))] 

142 while todo: 

143 entry1, entry2 = todo.pop() 

144 is_tree1 = _is_tree(entry1) 

145 is_tree2 = _is_tree(entry2) 

146 if prune_identical and is_tree1 and is_tree2 and entry1 == entry2: 

147 continue 

148 

149 tree1 = is_tree1 and store[entry1.sha] or None 

150 tree2 = is_tree2 and store[entry2.sha] or None 

151 path = entry1.path or entry2.path 

152 todo.extend(reversed(_merge_entries(path, tree1, tree2))) 

153 yield entry1, entry2 

154 

155 

156def _skip_tree(entry, include_trees): 

157 if entry.mode is None or (not include_trees and stat.S_ISDIR(entry.mode)): 

158 return _NULL_ENTRY 

159 return entry 

160 

161 

162def tree_changes( 

163 store, 

164 tree1_id, 

165 tree2_id, 

166 want_unchanged=False, 

167 rename_detector=None, 

168 include_trees=False, 

169 change_type_same=False, 

170): 

171 """Find the differences between the contents of two trees. 

172 

173 Args: 

174 store: An ObjectStore for looking up objects. 

175 tree1_id: The SHA of the source tree. 

176 tree2_id: The SHA of the target tree. 

177 want_unchanged: If True, include TreeChanges for unmodified entries 

178 as well. 

179 include_trees: Whether to include trees 

180 rename_detector: RenameDetector object for detecting renames. 

181 change_type_same: Whether to report change types in the same 

182 entry or as delete+add. 

183 

184 Returns: 

185 Iterator over TreeChange instances for each change between the 

186 source and target tree. 

187 """ 

188 if rename_detector is not None and tree1_id is not None and tree2_id is not None: 

189 yield from rename_detector.changes_with_renames( 

190 tree1_id, 

191 tree2_id, 

192 want_unchanged=want_unchanged, 

193 include_trees=include_trees, 

194 ) 

195 return 

196 

197 entries = walk_trees( 

198 store, tree1_id, tree2_id, prune_identical=(not want_unchanged) 

199 ) 

200 for entry1, entry2 in entries: 

201 if entry1 == entry2 and not want_unchanged: 

202 continue 

203 

204 # Treat entries for trees as missing. 

205 entry1 = _skip_tree(entry1, include_trees) 

206 entry2 = _skip_tree(entry2, include_trees) 

207 

208 if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY: 

209 if ( 

210 stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode) 

211 and not change_type_same 

212 ): 

213 # File type changed: report as delete/add. 

214 yield TreeChange.delete(entry1) 

215 entry1 = _NULL_ENTRY 

216 change_type = CHANGE_ADD 

217 elif entry1 == entry2: 

218 change_type = CHANGE_UNCHANGED 

219 else: 

220 change_type = CHANGE_MODIFY 

221 elif entry1 != _NULL_ENTRY: 

222 change_type = CHANGE_DELETE 

223 elif entry2 != _NULL_ENTRY: 

224 change_type = CHANGE_ADD 

225 else: 

226 # Both were None because at least one was a tree. 

227 continue 

228 yield TreeChange(change_type, entry1, entry2) 

229 

230 

231def _all_eq(seq, key, value): 

232 for e in seq: 

233 if key(e) != value: 

234 return False 

235 return True 

236 

237 

238def _all_same(seq, key): 

239 return _all_eq(seq[1:], key, key(seq[0])) 

240 

241 

242def tree_changes_for_merge( 

243 store: BaseObjectStore, 

244 parent_tree_ids: List[ObjectID], 

245 tree_id: ObjectID, 

246 rename_detector=None, 

247): 

248 """Get the tree changes for a merge tree relative to all its parents. 

249 

250 Args: 

251 store: An ObjectStore for looking up objects. 

252 parent_tree_ids: An iterable of the SHAs of the parent trees. 

253 tree_id: The SHA of the merge tree. 

254 rename_detector: RenameDetector object for detecting renames. 

255 

256 Returns: 

257 Iterator over lists of TreeChange objects, one per conflicted path 

258 in the merge. 

259 

260 Each list contains one element per parent, with the TreeChange for that 

261 path relative to that parent. An element may be None if it never 

262 existed in one parent and was deleted in two others. 

263 

264 A path is only included in the output if it is a conflict, i.e. its SHA 

265 in the merge tree is not found in any of the parents, or in the case of 

266 deletes, if not all of the old SHAs match. 

267 """ 

268 all_parent_changes = [ 

269 tree_changes(store, t, tree_id, rename_detector=rename_detector) 

270 for t in parent_tree_ids 

271 ] 

272 num_parents = len(parent_tree_ids) 

273 changes_by_path: Dict[str, List[Optional[TreeChange]]] = defaultdict( 

274 lambda: [None] * num_parents 

275 ) 

276 

277 # Organize by path. 

278 for i, parent_changes in enumerate(all_parent_changes): 

279 for change in parent_changes: 

280 if change.type == CHANGE_DELETE: 

281 path = change.old.path 

282 else: 

283 path = change.new.path 

284 changes_by_path[path][i] = change 

285 

286 def old_sha(c): 

287 return c.old.sha 

288 

289 def change_type(c): 

290 return c.type 

291 

292 # Yield only conflicting changes. 

293 for _, changes in sorted(changes_by_path.items()): 

294 assert len(changes) == num_parents 

295 have = [c for c in changes if c is not None] 

296 if _all_eq(have, change_type, CHANGE_DELETE): 

297 if not _all_same(have, old_sha): 

298 yield changes 

299 elif not _all_same(have, change_type): 

300 yield changes 

301 elif None not in changes: 

302 # If no change was found relative to one parent, that means the SHA 

303 # must have matched the SHA in that parent, so it is not a 

304 # conflict. 

305 yield changes 

306 

307 

308_BLOCK_SIZE = 64 

309 

310 

311def _count_blocks(obj: ShaFile) -> Dict[int, int]: 

312 """Count the blocks in an object. 

313 

314 Splits the data into blocks either on lines or <=64-byte chunks of lines. 

315 

316 Args: 

317 obj: The object to count blocks for. 

318 

319 Returns: 

320 A dict of block hashcode -> total bytes occurring. 

321 """ 

322 block_counts: Dict[int, int] = defaultdict(int) 

323 block = BytesIO() 

324 n = 0 

325 

326 # Cache attrs as locals to avoid expensive lookups in the inner loop. 

327 block_write = block.write 

328 block_seek = block.seek 

329 block_truncate = block.truncate 

330 block_getvalue = block.getvalue 

331 

332 for c in chain.from_iterable(obj.as_raw_chunks()): 

333 cb = c.to_bytes(1, "big") 

334 block_write(cb) 

335 n += 1 

336 if cb == b"\n" or n == _BLOCK_SIZE: 

337 value = block_getvalue() 

338 block_counts[hash(value)] += len(value) 

339 block_seek(0) 

340 block_truncate() 

341 n = 0 

342 if n > 0: 

343 last_block = block_getvalue() 

344 block_counts[hash(last_block)] += len(last_block) 

345 return block_counts 

346 

347 

348def _common_bytes(blocks1, blocks2): 

349 """Count the number of common bytes in two block count dicts. 

350 

351 Args: 

352 blocks1: The first dict of block hashcode -> total bytes. 

353 blocks2: The second dict of block hashcode -> total bytes. 

354 

355 Returns: 

356 The number of bytes in common between blocks1 and blocks2. This is 

357 only approximate due to possible hash collisions. 

358 """ 

359 # Iterate over the smaller of the two dicts, since this is symmetrical. 

360 if len(blocks1) > len(blocks2): 

361 blocks1, blocks2 = blocks2, blocks1 

362 score = 0 

363 for block, count1 in blocks1.items(): 

364 count2 = blocks2.get(block) 

365 if count2: 

366 score += min(count1, count2) 

367 return score 

368 

369 

370def _similarity_score(obj1, obj2, block_cache=None): 

371 """Compute a similarity score for two objects. 

372 

373 Args: 

374 obj1: The first object to score. 

375 obj2: The second object to score. 

376 block_cache: An optional dict of SHA to block counts to cache 

377 results between calls. 

378 

379 Returns: 

380 The similarity score between the two objects, defined as the 

381 number of bytes in common between the two objects divided by the 

382 maximum size, scaled to the range 0-100. 

383 """ 

384 if block_cache is None: 

385 block_cache = {} 

386 if obj1.id not in block_cache: 

387 block_cache[obj1.id] = _count_blocks(obj1) 

388 if obj2.id not in block_cache: 

389 block_cache[obj2.id] = _count_blocks(obj2) 

390 

391 common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id]) 

392 max_size = max(obj1.raw_length(), obj2.raw_length()) 

393 if not max_size: 

394 return _MAX_SCORE 

395 return int(float(common_bytes) * _MAX_SCORE / max_size) 

396 

397 

398def _tree_change_key(entry): 

399 # Sort by old path then new path. If only one exists, use it for both keys. 

400 path1 = entry.old.path 

401 path2 = entry.new.path 

402 if path1 is None: 

403 path1 = path2 

404 if path2 is None: 

405 path2 = path1 

406 return (path1, path2) 

407 

408 

409class RenameDetector: 

410 """Object for handling rename detection between two trees.""" 

411 

412 def __init__( 

413 self, 

414 store, 

415 rename_threshold=RENAME_THRESHOLD, 

416 max_files=MAX_FILES, 

417 rewrite_threshold=REWRITE_THRESHOLD, 

418 find_copies_harder=False, 

419 ) -> None: 

420 """Initialize the rename detector. 

421 

422 Args: 

423 store: An ObjectStore for looking up objects. 

424 rename_threshold: The threshold similarity score for considering 

425 an add/delete pair to be a rename/copy; see _similarity_score. 

426 max_files: The maximum number of adds and deletes to consider, 

427 or None for no limit. The detector is guaranteed to compare no more 

428 than max_files ** 2 add/delete pairs. This limit is provided 

429 because rename detection can be quadratic in the project size. If 

430 the limit is exceeded, no content rename detection is attempted. 

431 rewrite_threshold: The threshold similarity score below which a 

432 modify should be considered a delete/add, or None to not break 

433 modifies; see _similarity_score. 

434 find_copies_harder: If True, consider unmodified files when 

435 detecting copies. 

436 """ 

437 self._store = store 

438 self._rename_threshold = rename_threshold 

439 self._rewrite_threshold = rewrite_threshold 

440 self._max_files = max_files 

441 self._find_copies_harder = find_copies_harder 

442 self._want_unchanged = False 

443 

444 def _reset(self): 

445 self._adds = [] 

446 self._deletes = [] 

447 self._changes = [] 

448 

449 def _should_split(self, change): 

450 if ( 

451 self._rewrite_threshold is None 

452 or change.type != CHANGE_MODIFY 

453 or change.old.sha == change.new.sha 

454 ): 

455 return False 

456 old_obj = self._store[change.old.sha] 

457 new_obj = self._store[change.new.sha] 

458 return _similarity_score(old_obj, new_obj) < self._rewrite_threshold 

459 

460 def _add_change(self, change): 

461 if change.type == CHANGE_ADD: 

462 self._adds.append(change) 

463 elif change.type == CHANGE_DELETE: 

464 self._deletes.append(change) 

465 elif self._should_split(change): 

466 self._deletes.append(TreeChange.delete(change.old)) 

467 self._adds.append(TreeChange.add(change.new)) 

468 elif ( 

469 self._find_copies_harder and change.type == CHANGE_UNCHANGED 

470 ) or change.type == CHANGE_MODIFY: 

471 # Treat all modifies as potential deletes for rename detection, 

472 # but don't split them (to avoid spurious renames). Setting 

473 # find_copies_harder means we treat unchanged the same as 

474 # modified. 

475 self._deletes.append(change) 

476 else: 

477 self._changes.append(change) 

478 

479 def _collect_changes(self, tree1_id, tree2_id): 

480 want_unchanged = self._find_copies_harder or self._want_unchanged 

481 for change in tree_changes( 

482 self._store, 

483 tree1_id, 

484 tree2_id, 

485 want_unchanged=want_unchanged, 

486 include_trees=self._include_trees, 

487 ): 

488 self._add_change(change) 

489 

490 def _prune(self, add_paths, delete_paths): 

491 self._adds = [a for a in self._adds if a.new.path not in add_paths] 

492 self._deletes = [d for d in self._deletes if d.old.path not in delete_paths] 

493 

494 def _find_exact_renames(self): 

495 add_map = defaultdict(list) 

496 for add in self._adds: 

497 add_map[add.new.sha].append(add.new) 

498 delete_map = defaultdict(list) 

499 for delete in self._deletes: 

500 # Keep track of whether the delete was actually marked as a delete. 

501 # If not, it needs to be marked as a copy. 

502 is_delete = delete.type == CHANGE_DELETE 

503 delete_map[delete.old.sha].append((delete.old, is_delete)) 

504 

505 add_paths = set() 

506 delete_paths = set() 

507 for sha, sha_deletes in delete_map.items(): 

508 sha_adds = add_map[sha] 

509 for (old, is_delete), new in zip(sha_deletes, sha_adds): 

510 if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode): 

511 continue 

512 if is_delete: 

513 delete_paths.add(old.path) 

514 add_paths.add(new.path) 

515 new_type = is_delete and CHANGE_RENAME or CHANGE_COPY 

516 self._changes.append(TreeChange(new_type, old, new)) 

517 

518 num_extra_adds = len(sha_adds) - len(sha_deletes) 

519 # TODO(dborowitz): Less arbitrary way of dealing with extra copies. 

520 old = sha_deletes[0][0] 

521 if num_extra_adds > 0: 

522 for new in sha_adds[-num_extra_adds:]: 

523 add_paths.add(new.path) 

524 self._changes.append(TreeChange(CHANGE_COPY, old, new)) 

525 self._prune(add_paths, delete_paths) 

526 

527 def _should_find_content_renames(self): 

528 return len(self._adds) * len(self._deletes) <= self._max_files**2 

529 

530 def _rename_type(self, check_paths, delete, add): 

531 if check_paths and delete.old.path == add.new.path: 

532 # If the paths match, this must be a split modify, so make sure it 

533 # comes out as a modify. 

534 return CHANGE_MODIFY 

535 elif delete.type != CHANGE_DELETE: 

536 # If it's in deletes but not marked as a delete, it must have been 

537 # added due to find_copies_harder, and needs to be marked as a 

538 # copy. 

539 return CHANGE_COPY 

540 return CHANGE_RENAME 

541 

542 def _find_content_rename_candidates(self): 

543 candidates = self._candidates = [] 

544 # TODO: Optimizations: 

545 # - Compare object sizes before counting blocks. 

546 # - Skip if delete's S_IFMT differs from all adds. 

547 # - Skip if adds or deletes is empty. 

548 # Match C git's behavior of not attempting to find content renames if 

549 # the matrix size exceeds the threshold. 

550 if not self._should_find_content_renames(): 

551 return 

552 

553 block_cache = {} 

554 check_paths = self._rename_threshold is not None 

555 for delete in self._deletes: 

556 if S_ISGITLINK(delete.old.mode): 

557 continue # Git links don't exist in this repo. 

558 old_sha = delete.old.sha 

559 old_obj = self._store[old_sha] 

560 block_cache[old_sha] = _count_blocks(old_obj) 

561 for add in self._adds: 

562 if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode): 

563 continue 

564 new_obj = self._store[add.new.sha] 

565 score = _similarity_score(old_obj, new_obj, block_cache=block_cache) 

566 if score > self._rename_threshold: 

567 new_type = self._rename_type(check_paths, delete, add) 

568 rename = TreeChange(new_type, delete.old, add.new) 

569 candidates.append((-score, rename)) 

570 

571 def _choose_content_renames(self): 

572 # Sort scores from highest to lowest, but keep names in ascending 

573 # order. 

574 self._candidates.sort() 

575 

576 delete_paths = set() 

577 add_paths = set() 

578 for _, change in self._candidates: 

579 new_path = change.new.path 

580 if new_path in add_paths: 

581 continue 

582 old_path = change.old.path 

583 orig_type = change.type 

584 if old_path in delete_paths: 

585 change = TreeChange(CHANGE_COPY, change.old, change.new) 

586 

587 # If the candidate was originally a copy, that means it came from a 

588 # modified or unchanged path, so we don't want to prune it. 

589 if orig_type != CHANGE_COPY: 

590 delete_paths.add(old_path) 

591 add_paths.add(new_path) 

592 self._changes.append(change) 

593 self._prune(add_paths, delete_paths) 

594 

595 def _join_modifies(self): 

596 if self._rewrite_threshold is None: 

597 return 

598 

599 modifies = {} 

600 delete_map = {d.old.path: d for d in self._deletes} 

601 for add in self._adds: 

602 path = add.new.path 

603 delete = delete_map.get(path) 

604 if delete is not None and stat.S_IFMT(delete.old.mode) == stat.S_IFMT( 

605 add.new.mode 

606 ): 

607 modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new) 

608 

609 self._adds = [a for a in self._adds if a.new.path not in modifies] 

610 self._deletes = [a for a in self._deletes if a.new.path not in modifies] 

611 self._changes += modifies.values() 

612 

613 def _sorted_changes(self): 

614 result = [] 

615 result.extend(self._adds) 

616 result.extend(self._deletes) 

617 result.extend(self._changes) 

618 result.sort(key=_tree_change_key) 

619 return result 

620 

621 def _prune_unchanged(self): 

622 if self._want_unchanged: 

623 return 

624 self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED] 

625 

626 def changes_with_renames( 

627 self, tree1_id, tree2_id, want_unchanged=False, include_trees=False 

628 ): 

629 """Iterate TreeChanges between two tree SHAs, with rename detection.""" 

630 self._reset() 

631 self._want_unchanged = want_unchanged 

632 self._include_trees = include_trees 

633 self._collect_changes(tree1_id, tree2_id) 

634 self._find_exact_renames() 

635 self._find_content_rename_candidates() 

636 self._choose_content_renames() 

637 self._join_modifies() 

638 self._prune_unchanged() 

639 return self._sorted_changes() 

640 

641 

642# Hold on to the pure-python implementations for testing. 

643_is_tree_py = _is_tree 

644_merge_entries_py = _merge_entries 

645_count_blocks_py = _count_blocks 

646try: 

647 # Try to import Rust versions 

648 from dulwich._diff_tree import ( # type: ignore 

649 _count_blocks, 

650 _is_tree, 

651 _merge_entries, 

652 ) 

653except ImportError: 

654 pass