# diff_tree.py -- Utilities for diffing files and trees.
# Copyright (C) 2010 Google, Inc.
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Utilities for diffing files and trees."""

import stat
from collections import defaultdict, namedtuple
from io import BytesIO
from itertools import chain
from typing import Dict, List, Optional

from .object_store import BaseObjectStore
from .objects import S_ISGITLINK, ObjectID, ShaFile, Tree, TreeEntry

# TreeChange type constants.
CHANGE_ADD = "add"
CHANGE_MODIFY = "modify"
CHANGE_DELETE = "delete"
CHANGE_RENAME = "rename"
CHANGE_COPY = "copy"
CHANGE_UNCHANGED = "unchanged"

RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY)

_NULL_ENTRY = TreeEntry(None, None, None)

_MAX_SCORE = 100
RENAME_THRESHOLD = 60
MAX_FILES = 200
REWRITE_THRESHOLD = None


class TreeChange(namedtuple("TreeChange", ["type", "old", "new"])):
    """Named tuple representing a single change between two trees."""

    @classmethod
    def add(cls, new):
        return cls(CHANGE_ADD, _NULL_ENTRY, new)

    @classmethod
    def delete(cls, old):
        return cls(CHANGE_DELETE, old, _NULL_ENTRY)


def _tree_entries(path: bytes, tree: Tree) -> List[TreeEntry]:
    result: List[TreeEntry] = []
    if not tree:
        return result
    for entry in tree.iteritems(name_order=True):
        result.append(entry.in_path(path))
    return result


def _merge_entries(path, tree1, tree2):
    """Merge the entries of two trees.

    Args:
      path: A path to prepend to all tree entry names.
      tree1: The first Tree object to iterate, or None.
      tree2: The second Tree object to iterate, or None.

    Returns:
      A list of pairs of TreeEntry objects for each pair of entries in
      the trees. If an entry exists in one tree but not the other, the other
      entry will have all attributes set to None. If neither entry's path is
      None, they are guaranteed to match.
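
    Example:
      A minimal sketch; ``tree1`` and ``tree2`` are assumed to be Tree
      objects where both contain an entry ``b"a"`` but only ``tree2``
      contains ``b"b"``::

        pairs = _merge_entries(b"", tree1, tree2)
        # -> [(TreeEntry(b"a", ...), TreeEntry(b"a", ...)),
        #     (TreeEntry(None, None, None), TreeEntry(b"b", ...))]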
    """
    entries1 = _tree_entries(path, tree1)
    entries2 = _tree_entries(path, tree2)
    i1 = i2 = 0
    len1 = len(entries1)
    len2 = len(entries2)

    result = []
    while i1 < len1 and i2 < len2:
        entry1 = entries1[i1]
        entry2 = entries2[i2]
        if entry1.path < entry2.path:
            result.append((entry1, _NULL_ENTRY))
            i1 += 1
        elif entry1.path > entry2.path:
            result.append((_NULL_ENTRY, entry2))
            i2 += 1
        else:
            result.append((entry1, entry2))
            i1 += 1
            i2 += 1
    for i in range(i1, len1):
        result.append((entries1[i], _NULL_ENTRY))
    for i in range(i2, len2):
        result.append((_NULL_ENTRY, entries2[i]))
    return result


def _is_tree(entry):
    mode = entry.mode
    if mode is None:
        return False
    return stat.S_ISDIR(mode)


def walk_trees(store, tree1_id, tree2_id, prune_identical=False):
    """Recursively walk all the entries of two trees.

    Iteration is depth-first pre-order, as in e.g. os.walk.

    Args:
      store: An ObjectStore for looking up objects.
      tree1_id: The SHA of the first Tree object to iterate, or None.
      tree2_id: The SHA of the second Tree object to iterate, or None.
      prune_identical: If True, identical subtrees will not be walked.

    Returns:
      Iterator over pairs of TreeEntry objects for each pair of entries
      in the trees and their subtrees recursively. If an entry exists in one
      tree but not the other, the other entry will have all attributes set
      to None. If neither entry's path is None, they are guaranteed to
      match.
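
    Example:
      A minimal sketch; ``store`` is assumed to be an object store (for
      example ``Repo(".").object_store``) and ``tree1_id``/``tree2_id``
      tree SHAs it contains::

        for entry1, entry2 in walk_trees(store, tree1_id, tree2_id,
                                         prune_identical=True):
            print(entry1.path, entry2.path)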
    """
    # This could be fairly easily generalized to >2 trees if we find a use
    # case.
    mode1 = tree1_id and stat.S_IFDIR or None
    mode2 = tree2_id and stat.S_IFDIR or None
    todo = [(TreeEntry(b"", mode1, tree1_id), TreeEntry(b"", mode2, tree2_id))]
    while todo:
        entry1, entry2 = todo.pop()
        is_tree1 = _is_tree(entry1)
        is_tree2 = _is_tree(entry2)
        if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
            continue

        tree1 = is_tree1 and store[entry1.sha] or None
        tree2 = is_tree2 and store[entry2.sha] or None
        path = entry1.path or entry2.path
        todo.extend(reversed(_merge_entries(path, tree1, tree2)))
        yield entry1, entry2


def _skip_tree(entry, include_trees):
    if entry.mode is None or (not include_trees and stat.S_ISDIR(entry.mode)):
        return _NULL_ENTRY
    return entry


def tree_changes(
    store,
    tree1_id,
    tree2_id,
    want_unchanged=False,
    rename_detector=None,
    include_trees=False,
    change_type_same=False,
):
    """Find the differences between the contents of two trees.

    Args:
      store: An ObjectStore for looking up objects.
      tree1_id: The SHA of the source tree.
      tree2_id: The SHA of the target tree.
      want_unchanged: If True, include TreeChanges for unmodified entries
        as well.
      rename_detector: RenameDetector object for detecting renames.
      include_trees: Whether to include entries for trees (subdirectories)
        themselves in the output.
      change_type_same: If True, report a change of file type as a single
        modify of the entry; if False, report it as a delete followed by
        an add.

    Returns:
      Iterator over TreeChange instances for each change between the
      source and target tree.
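
    Example:
      A minimal sketch; ``old_tree`` and ``new_tree`` are assumed to be
      tree SHAs taken from two commits in the repository::

        from dulwich.repo import Repo

        r = Repo(".")
        for change in tree_changes(r.object_store, old_tree, new_tree):
            print(change.type, change.old.path, change.new.path)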
    """
    if rename_detector is not None and tree1_id is not None and tree2_id is not None:
        yield from rename_detector.changes_with_renames(
            tree1_id,
            tree2_id,
            want_unchanged=want_unchanged,
            include_trees=include_trees,
        )
        return

    entries = walk_trees(
        store, tree1_id, tree2_id, prune_identical=(not want_unchanged)
    )
    for entry1, entry2 in entries:
        if entry1 == entry2 and not want_unchanged:
            continue

        # Treat entries for trees as missing.
        entry1 = _skip_tree(entry1, include_trees)
        entry2 = _skip_tree(entry2, include_trees)

        if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
            if (
                stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode)
                and not change_type_same
            ):
                # File type changed: report as delete/add.
                yield TreeChange.delete(entry1)
                entry1 = _NULL_ENTRY
                change_type = CHANGE_ADD
            elif entry1 == entry2:
                change_type = CHANGE_UNCHANGED
            else:
                change_type = CHANGE_MODIFY
        elif entry1 != _NULL_ENTRY:
            change_type = CHANGE_DELETE
        elif entry2 != _NULL_ENTRY:
            change_type = CHANGE_ADD
        else:
            # Both were None because at least one was a tree.
            continue
        yield TreeChange(change_type, entry1, entry2)


def _all_eq(seq, key, value):
    for e in seq:
        if key(e) != value:
            return False
    return True


def _all_same(seq, key):
    return _all_eq(seq[1:], key, key(seq[0]))


def tree_changes_for_merge(
    store: BaseObjectStore,
    parent_tree_ids: List[ObjectID],
    tree_id: ObjectID,
    rename_detector=None,
):
    """Get the tree changes for a merge tree relative to all its parents.

    Args:
      store: An ObjectStore for looking up objects.
      parent_tree_ids: An iterable of the SHAs of the parent trees.
      tree_id: The SHA of the merge tree.
      rename_detector: RenameDetector object for detecting renames.

    Returns:
      Iterator over lists of TreeChange objects, one per conflicted path
      in the merge.

      Each list contains one element per parent, with the TreeChange for that
      path relative to that parent. An element may be None if it never
      existed in one parent and was deleted in two others.

      A path is only included in the output if it is a conflict, i.e. its SHA
      in the merge tree is not found in any of the parents, or in the case of
      deletes, if not all of the old SHAs match.
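
    Example:
      A minimal sketch; ``store`` is assumed to be an object store and
      ``merge_commit`` a Commit object with more than one parent::

        parent_trees = [store[p].tree for p in merge_commit.parents]
        for changes in tree_changes_for_merge(store, parent_trees,
                                              merge_commit.tree):
            # One TreeChange (or None) per parent for each conflicted path.
            print([c and c.type for c in changes])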
    """
    all_parent_changes = [
        tree_changes(store, t, tree_id, rename_detector=rename_detector)
        for t in parent_tree_ids
    ]
    num_parents = len(parent_tree_ids)
    changes_by_path: Dict[bytes, List[Optional[TreeChange]]] = defaultdict(
        lambda: [None] * num_parents
    )

    # Organize by path.
    for i, parent_changes in enumerate(all_parent_changes):
        for change in parent_changes:
            if change.type == CHANGE_DELETE:
                path = change.old.path
            else:
                path = change.new.path
            changes_by_path[path][i] = change

    def old_sha(c):
        return c.old.sha

    def change_type(c):
        return c.type

    # Yield only conflicting changes.
    for _, changes in sorted(changes_by_path.items()):
        assert len(changes) == num_parents
        have = [c for c in changes if c is not None]
        if _all_eq(have, change_type, CHANGE_DELETE):
            if not _all_same(have, old_sha):
                yield changes
        elif not _all_same(have, change_type):
            yield changes
        elif None not in changes:
            # If no change was found relative to one parent, that means the SHA
            # must have matched the SHA in that parent, so it is not a
            # conflict.
            yield changes


_BLOCK_SIZE = 64


def _count_blocks(obj: ShaFile) -> Dict[int, int]:
    """Count the blocks in an object.

    Splits the data into blocks, breaking at newline boundaries or after
    at most _BLOCK_SIZE (64) bytes, whichever comes first.

    Args:
      obj: The object to count blocks for.

    Returns:
      A dict of block hashcode -> total bytes occurring.
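
    Example:
      A rough illustration; a small Blob is constructed directly rather
      than read from a repository::

        from dulwich.objects import Blob

        blob = Blob.from_string(b"x" * 100)
        counts = _count_blocks(blob)
        # Two blocks: a 64-byte run of b"x" and the remaining 36 bytes.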
    """
    block_counts: Dict[int, int] = defaultdict(int)
    block = BytesIO()
    n = 0

    # Cache attrs as locals to avoid expensive lookups in the inner loop.
    block_write = block.write
    block_seek = block.seek
    block_truncate = block.truncate
    block_getvalue = block.getvalue

    for c in chain.from_iterable(obj.as_raw_chunks()):
        cb = c.to_bytes(1, "big")
        block_write(cb)
        n += 1
        if cb == b"\n" or n == _BLOCK_SIZE:
            value = block_getvalue()
            block_counts[hash(value)] += len(value)
            block_seek(0)
            block_truncate()
            n = 0
    if n > 0:
        last_block = block_getvalue()
        block_counts[hash(last_block)] += len(last_block)
    return block_counts


def _common_bytes(blocks1, blocks2):
    """Count the number of common bytes in two block count dicts.

    Args:
      blocks1: The first dict of block hashcode -> total bytes.
      blocks2: The second dict of block hashcode -> total bytes.

    Returns:
      The number of bytes in common between blocks1 and blocks2. This is
      only approximate due to possible hash collisions.
    """
    # Iterate over the smaller of the two dicts, since this is symmetrical.
    if len(blocks1) > len(blocks2):
        blocks1, blocks2 = blocks2, blocks1
    score = 0
    for block, count1 in blocks1.items():
        count2 = blocks2.get(block)
        if count2:
            score += min(count1, count2)
    return score


def _similarity_score(obj1, obj2, block_cache=None):
    """Compute a similarity score for two objects.

    Args:
      obj1: The first object to score.
      obj2: The second object to score.
      block_cache: An optional dict of SHA to block counts to cache
        results between calls.

    Returns:
      The similarity score between the two objects, defined as the
      number of bytes in common between the two objects divided by the
      maximum size, scaled to the range 0-100.
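
    Example:
      A rough illustration; two small Blobs are constructed directly
      rather than read from a repository::

        from dulwich.objects import Blob

        obj1 = Blob.from_string(b"x" * 64 + b"y" * 64)
        obj2 = Blob.from_string(b"x" * 64 + b"z" * 64)
        _similarity_score(obj1, obj2)  # 64 of 128 bytes in common -> 50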
    """
    if block_cache is None:
        block_cache = {}
    if obj1.id not in block_cache:
        block_cache[obj1.id] = _count_blocks(obj1)
    if obj2.id not in block_cache:
        block_cache[obj2.id] = _count_blocks(obj2)

    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
    max_size = max(obj1.raw_length(), obj2.raw_length())
    if not max_size:
        return _MAX_SCORE
    return int(float(common_bytes) * _MAX_SCORE / max_size)


def _tree_change_key(entry):
    # Sort by old path then new path. If only one exists, use it for both keys.
    path1 = entry.old.path
    path2 = entry.new.path
    if path1 is None:
        path1 = path2
    if path2 is None:
        path2 = path1
    return (path1, path2)


class RenameDetector:
    """Object for handling rename detection between two trees."""

    def __init__(
        self,
        store,
        rename_threshold=RENAME_THRESHOLD,
        max_files=MAX_FILES,
        rewrite_threshold=REWRITE_THRESHOLD,
        find_copies_harder=False,
    ) -> None:
        """Initialize the rename detector.

        Args:
          store: An ObjectStore for looking up objects.
          rename_threshold: The threshold similarity score for considering
            an add/delete pair to be a rename/copy; see _similarity_score.
          max_files: The maximum number of adds and deletes to consider,
            or None for no limit. The detector is guaranteed to compare no
            more than max_files ** 2 add/delete pairs. This limit is provided
            because rename detection can be quadratic in the project size. If
            the limit is exceeded, no content rename detection is attempted.
          rewrite_threshold: The threshold similarity score below which a
            modify should be considered a delete/add, or None to not break
            modifies; see _similarity_score.
          find_copies_harder: If True, consider unmodified files when
            detecting copies.
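
        Example:
          A minimal sketch; ``r`` is assumed to be an existing repository
          and ``old_tree``/``new_tree`` tree SHAs from it::

            detector = RenameDetector(r.object_store)
            for change in tree_changes(r.object_store, old_tree, new_tree,
                                       rename_detector=detector):
                if change.type in RENAME_CHANGE_TYPES:
                    print(change.type, change.old.path, change.new.path)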
        """
        self._store = store
        self._rename_threshold = rename_threshold
        self._rewrite_threshold = rewrite_threshold
        self._max_files = max_files
        self._find_copies_harder = find_copies_harder
        self._want_unchanged = False

    def _reset(self):
        self._adds = []
        self._deletes = []
        self._changes = []

    def _should_split(self, change):
        if (
            self._rewrite_threshold is None
            or change.type != CHANGE_MODIFY
            or change.old.sha == change.new.sha
        ):
            return False
        old_obj = self._store[change.old.sha]
        new_obj = self._store[change.new.sha]
        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold

    def _add_change(self, change):
        if change.type == CHANGE_ADD:
            self._adds.append(change)
        elif change.type == CHANGE_DELETE:
            self._deletes.append(change)
        elif self._should_split(change):
            self._deletes.append(TreeChange.delete(change.old))
            self._adds.append(TreeChange.add(change.new))
        elif (
            self._find_copies_harder and change.type == CHANGE_UNCHANGED
        ) or change.type == CHANGE_MODIFY:
            # Treat all modifies as potential deletes for rename detection,
            # but don't split them (to avoid spurious renames). Setting
            # find_copies_harder means we treat unchanged the same as
            # modified.
            self._deletes.append(change)
        else:
            self._changes.append(change)

    def _collect_changes(self, tree1_id, tree2_id):
        want_unchanged = self._find_copies_harder or self._want_unchanged
        for change in tree_changes(
            self._store,
            tree1_id,
            tree2_id,
            want_unchanged=want_unchanged,
            include_trees=self._include_trees,
        ):
            self._add_change(change)

    def _prune(self, add_paths, delete_paths):
        self._adds = [a for a in self._adds if a.new.path not in add_paths]
        self._deletes = [d for d in self._deletes if d.old.path not in delete_paths]

    def _find_exact_renames(self):
        add_map = defaultdict(list)
        for add in self._adds:
            add_map[add.new.sha].append(add.new)
        delete_map = defaultdict(list)
        for delete in self._deletes:
            # Keep track of whether the delete was actually marked as a delete.
            # If not, it needs to be marked as a copy.
            is_delete = delete.type == CHANGE_DELETE
            delete_map[delete.old.sha].append((delete.old, is_delete))

        add_paths = set()
        delete_paths = set()
        for sha, sha_deletes in delete_map.items():
            sha_adds = add_map[sha]
            for (old, is_delete), new in zip(sha_deletes, sha_adds):
                if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
                    continue
                if is_delete:
                    delete_paths.add(old.path)
                add_paths.add(new.path)
                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
                self._changes.append(TreeChange(new_type, old, new))

            num_extra_adds = len(sha_adds) - len(sha_deletes)
            # TODO(dborowitz): Less arbitrary way of dealing with extra copies.
            old = sha_deletes[0][0]
            if num_extra_adds > 0:
                for new in sha_adds[-num_extra_adds:]:
                    add_paths.add(new.path)
                    self._changes.append(TreeChange(CHANGE_COPY, old, new))
        self._prune(add_paths, delete_paths)

    def _should_find_content_renames(self):
        if self._max_files is None:
            return True
        return len(self._adds) * len(self._deletes) <= self._max_files**2

    def _rename_type(self, check_paths, delete, add):
        if check_paths and delete.old.path == add.new.path:
            # If the paths match, this must be a split modify, so make sure it
            # comes out as a modify.
            return CHANGE_MODIFY
        elif delete.type != CHANGE_DELETE:
            # If it's in deletes but not marked as a delete, it must have been
            # added due to find_copies_harder, and needs to be marked as a
            # copy.
            return CHANGE_COPY
        return CHANGE_RENAME

    def _find_content_rename_candidates(self):
        candidates = self._candidates = []
        # TODO: Optimizations:
        # - Compare object sizes before counting blocks.
        # - Skip if delete's S_IFMT differs from all adds.
        # - Skip if adds or deletes is empty.
        # Match C git's behavior of not attempting to find content renames if
        # the matrix size exceeds the threshold.
        if not self._should_find_content_renames():
            return

        block_cache = {}
        check_paths = self._rename_threshold is not None
        for delete in self._deletes:
            if S_ISGITLINK(delete.old.mode):
                continue  # Git links don't exist in this repo.
            old_sha = delete.old.sha
            old_obj = self._store[old_sha]
            block_cache[old_sha] = _count_blocks(old_obj)
            for add in self._adds:
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                    continue
                new_obj = self._store[add.new.sha]
                score = _similarity_score(old_obj, new_obj, block_cache=block_cache)
                if score > self._rename_threshold:
                    new_type = self._rename_type(check_paths, delete, add)
                    rename = TreeChange(new_type, delete.old, add.new)
                    candidates.append((-score, rename))

    def _choose_content_renames(self):
        # Sort scores from highest to lowest, but keep names in ascending
        # order.
        self._candidates.sort()

        delete_paths = set()
        add_paths = set()
        for _, change in self._candidates:
            new_path = change.new.path
            if new_path in add_paths:
                continue
            old_path = change.old.path
            orig_type = change.type
            if old_path in delete_paths:
                change = TreeChange(CHANGE_COPY, change.old, change.new)

            # If the candidate was originally a copy, that means it came from a
            # modified or unchanged path, so we don't want to prune it.
            if orig_type != CHANGE_COPY:
                delete_paths.add(old_path)
            add_paths.add(new_path)
            self._changes.append(change)
        self._prune(add_paths, delete_paths)

    def _join_modifies(self):
        if self._rewrite_threshold is None:
            return

        modifies = {}
        delete_map = {d.old.path: d for d in self._deletes}
        for add in self._adds:
            path = add.new.path
            delete = delete_map.get(path)
            if delete is not None and stat.S_IFMT(delete.old.mode) == stat.S_IFMT(
                add.new.mode
            ):
                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)

        self._adds = [a for a in self._adds if a.new.path not in modifies]
        self._deletes = [d for d in self._deletes if d.old.path not in modifies]
        self._changes += modifies.values()

    def _sorted_changes(self):
        result = []
        result.extend(self._adds)
        result.extend(self._deletes)
        result.extend(self._changes)
        result.sort(key=_tree_change_key)
        return result

    def _prune_unchanged(self):
        if self._want_unchanged:
            return
        self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]

    def changes_with_renames(
        self, tree1_id, tree2_id, want_unchanged=False, include_trees=False
    ):
        """Iterate TreeChanges between two tree SHAs, with rename detection."""
        self._reset()
        self._want_unchanged = want_unchanged
        self._include_trees = include_trees
        self._collect_changes(tree1_id, tree2_id)
        self._find_exact_renames()
        self._find_content_rename_candidates()
        self._choose_content_renames()
        self._join_modifies()
        self._prune_unchanged()
        return self._sorted_changes()


# Hold on to the pure-python implementations for testing.
_is_tree_py = _is_tree
_merge_entries_py = _merge_entries
_count_blocks_py = _count_blocks
try:
    # Try to import Rust versions
    from dulwich._diff_tree import (  # type: ignore
        _count_blocks,
        _is_tree,
        _merge_entries,
    )
except ImportError:
    pass