# diff_tree.py -- Utilities for diffing files and trees.
# Copyright (C) 2010 Google, Inc.
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Utilities for diffing files and trees."""

import stat
from collections import defaultdict, namedtuple
from io import BytesIO
from itertools import chain
from typing import Dict, List, Optional

from .object_store import BaseObjectStore
from .objects import S_ISGITLINK, ObjectID, ShaFile, Tree, TreeEntry

# TreeChange type constants.
CHANGE_ADD = "add"
CHANGE_MODIFY = "modify"
CHANGE_DELETE = "delete"
CHANGE_RENAME = "rename"
CHANGE_COPY = "copy"
CHANGE_UNCHANGED = "unchanged"

RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY)

_NULL_ENTRY = TreeEntry(None, None, None)

_MAX_SCORE = 100
RENAME_THRESHOLD = 60
MAX_FILES = 200
REWRITE_THRESHOLD = None


class TreeChange(namedtuple("TreeChange", ["type", "old", "new"])):
    """Named tuple representing a single change between two trees."""

    @classmethod
    def add(cls, new):
        return cls(CHANGE_ADD, _NULL_ENTRY, new)

    @classmethod
    def delete(cls, old):
        return cls(CHANGE_DELETE, old, _NULL_ENTRY)


def _tree_entries(path: bytes, tree: Tree) -> List[TreeEntry]:
    result: List[TreeEntry] = []
    if not tree:
        return result
    for entry in tree.iteritems(name_order=True):
        result.append(entry.in_path(path))
    return result


def _merge_entries(path, tree1, tree2):
    """Merge the entries of two trees.

    Args:
      path: A path to prepend to all tree entry names.
      tree1: The first Tree object to iterate, or None.
      tree2: The second Tree object to iterate, or None.

    Returns:
      A list of pairs of TreeEntry objects for each pair of entries in
      the trees. If an entry exists in one tree but not the other, the other
      entry will have all attributes set to None. If neither entry's path is
      None, they are guaranteed to match.
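
    Example:
      A minimal sketch; ``tree1`` and ``tree2`` are assumed to be Tree
      objects where both contain an entry ``b"a"`` but only ``tree2``
      contains ``b"b"``::

        pairs = _merge_entries(b"", tree1, tree2)
        # -> [(TreeEntry(b"a", ...), TreeEntry(b"a", ...)),
        #     (TreeEntry(None, None, None), TreeEntry(b"b", ...))]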
    """
    entries1 = _tree_entries(path, tree1)
    entries2 = _tree_entries(path, tree2)
    i1 = i2 = 0
    len1 = len(entries1)
    len2 = len(entries2)

    result = []
    while i1 < len1 and i2 < len2:
        entry1 = entries1[i1]
        entry2 = entries2[i2]
        if entry1.path < entry2.path:
            result.append((entry1, _NULL_ENTRY))
            i1 += 1
        elif entry1.path > entry2.path:
            result.append((_NULL_ENTRY, entry2))
            i2 += 1
        else:
            result.append((entry1, entry2))
            i1 += 1
            i2 += 1
    for i in range(i1, len1):
        result.append((entries1[i], _NULL_ENTRY))
    for i in range(i2, len2):
        result.append((_NULL_ENTRY, entries2[i]))
    return result


def _is_tree(entry):
    mode = entry.mode
    if mode is None:
        return False
    return stat.S_ISDIR(mode)


def walk_trees(store, tree1_id, tree2_id, prune_identical=False):
    """Recursively walk all the entries of two trees.

    Iteration is depth-first pre-order, as in e.g. os.walk.

    Args:
      store: An ObjectStore for looking up objects.
      tree1_id: The SHA of the first Tree object to iterate, or None.
      tree2_id: The SHA of the second Tree object to iterate, or None.
      prune_identical: If True, identical subtrees will not be walked.

    Returns:
      Iterator over pairs of TreeEntry objects for each pair of entries
      in the trees and their subtrees recursively. If an entry exists in one
      tree but not the other, the other entry will have all attributes set
      to None. If neither entry's path is None, they are guaranteed to
      match.
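
    Example:
      A minimal sketch; ``store`` is assumed to be an object store (for
      example ``Repo(".").object_store``) and ``tree1_id``/``tree2_id``
      tree SHAs it contains::

        for entry1, entry2 in walk_trees(store, tree1_id, tree2_id,
                                         prune_identical=True):
            print(entry1.path, entry2.path)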
    """
    # This could be fairly easily generalized to >2 trees if we find a use
    # case.
    mode1 = tree1_id and stat.S_IFDIR or None
    mode2 = tree2_id and stat.S_IFDIR or None
    todo = [(TreeEntry(b"", mode1, tree1_id), TreeEntry(b"", mode2, tree2_id))]
    while todo:
        entry1, entry2 = todo.pop()
        is_tree1 = _is_tree(entry1)
        is_tree2 = _is_tree(entry2)
        if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
            continue

        tree1 = is_tree1 and store[entry1.sha] or None
        tree2 = is_tree2 and store[entry2.sha] or None
        path = entry1.path or entry2.path
        todo.extend(reversed(_merge_entries(path, tree1, tree2)))
        yield entry1, entry2


def _skip_tree(entry, include_trees):
    if entry.mode is None or (not include_trees and stat.S_ISDIR(entry.mode)):
        return _NULL_ENTRY
    return entry


def tree_changes(
    store,
    tree1_id,
    tree2_id,
    want_unchanged=False,
    rename_detector=None,
    include_trees=False,
    change_type_same=False,
):
    """Find the differences between the contents of two trees.

    Args:
      store: An ObjectStore for looking up objects.
      tree1_id: The SHA of the source tree.
      tree2_id: The SHA of the target tree.
      want_unchanged: If True, include TreeChanges for unmodified entries
        as well.
      rename_detector: RenameDetector object for detecting renames.
      include_trees: Whether to include entries for trees (subdirectories)
        themselves in the output.
      change_type_same: If True, report a change of file type as a single
        modify of the entry; if False, report it as a delete followed by
        an add.

    Returns:
      Iterator over TreeChange instances for each change between the
      source and target tree.
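
    Example:
      A minimal sketch; ``old_tree`` and ``new_tree`` are assumed to be
      tree SHAs taken from two commits in the repository::

        from dulwich.repo import Repo

        r = Repo(".")
        for change in tree_changes(r.object_store, old_tree, new_tree):
            print(change.type, change.old.path, change.new.path)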
    """
    if rename_detector is not None and tree1_id is not None and tree2_id is not None:
        yield from rename_detector.changes_with_renames(
            tree1_id,
            tree2_id,
            want_unchanged=want_unchanged,
            include_trees=include_trees,
        )
        return

    entries = walk_trees(
        store, tree1_id, tree2_id, prune_identical=(not want_unchanged)
    )
    for entry1, entry2 in entries:
        if entry1 == entry2 and not want_unchanged:
            continue

        # Treat entries for trees as missing.
        entry1 = _skip_tree(entry1, include_trees)
        entry2 = _skip_tree(entry2, include_trees)

        if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
            if (
                stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode)
                and not change_type_same
            ):
                # File type changed: report as delete/add.
                yield TreeChange.delete(entry1)
                entry1 = _NULL_ENTRY
                change_type = CHANGE_ADD
            elif entry1 == entry2:
                change_type = CHANGE_UNCHANGED
            else:
                change_type = CHANGE_MODIFY
        elif entry1 != _NULL_ENTRY:
            change_type = CHANGE_DELETE
        elif entry2 != _NULL_ENTRY:
            change_type = CHANGE_ADD
        else:
            # Both were None because at least one was a tree.
            continue
        yield TreeChange(change_type, entry1, entry2)


def _all_eq(seq, key, value):
    for e in seq:
        if key(e) != value:
            return False
    return True


def _all_same(seq, key):
    return _all_eq(seq[1:], key, key(seq[0]))


def tree_changes_for_merge(
    store: BaseObjectStore,
    parent_tree_ids: List[ObjectID],
    tree_id: ObjectID,
    rename_detector=None,
):
    """Get the tree changes for a merge tree relative to all its parents.

    Args:
      store: An ObjectStore for looking up objects.
      parent_tree_ids: An iterable of the SHAs of the parent trees.
      tree_id: The SHA of the merge tree.
      rename_detector: RenameDetector object for detecting renames.

    Returns:
      Iterator over lists of TreeChange objects, one per conflicted path
      in the merge.

      Each list contains one element per parent, with the TreeChange for that
      path relative to that parent. An element may be None if it never
      existed in one parent and was deleted in two others.

      A path is only included in the output if it is a conflict, i.e. its SHA
      in the merge tree is not found in any of the parents, or in the case of
      deletes, if not all of the old SHAs match.
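
    Example:
      A minimal sketch; ``store`` is assumed to be an object store and
      ``merge_commit`` a Commit object with more than one parent::

        parent_trees = [store[p].tree for p in merge_commit.parents]
        for changes in tree_changes_for_merge(store, parent_trees,
                                              merge_commit.tree):
            # One TreeChange (or None) per parent for each conflicted path.
            print([c and c.type for c in changes])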
    """
    all_parent_changes = [
        tree_changes(store, t, tree_id, rename_detector=rename_detector)
        for t in parent_tree_ids
    ]
    num_parents = len(parent_tree_ids)
    changes_by_path: Dict[bytes, List[Optional[TreeChange]]] = defaultdict(
        lambda: [None] * num_parents
    )

    # Organize by path.
    for i, parent_changes in enumerate(all_parent_changes):
        for change in parent_changes:
            if change.type == CHANGE_DELETE:
                path = change.old.path
            else:
                path = change.new.path
            changes_by_path[path][i] = change

    def old_sha(c):
        return c.old.sha

    def change_type(c):
        return c.type

    # Yield only conflicting changes.
    for _, changes in sorted(changes_by_path.items()):
        assert len(changes) == num_parents
        have = [c for c in changes if c is not None]
        if _all_eq(have, change_type, CHANGE_DELETE):
            if not _all_same(have, old_sha):
                yield changes
        elif not _all_same(have, change_type):
            yield changes
        elif None not in changes:
            # If no change was found relative to one parent, that means the SHA
            # must have matched the SHA in that parent, so it is not a
            # conflict.
            yield changes


_BLOCK_SIZE = 64


def _count_blocks(obj: ShaFile) -> Dict[int, int]:
    """Count the blocks in an object.

    Splits the data into blocks, breaking at newline boundaries or after
    at most _BLOCK_SIZE (64) bytes, whichever comes first.

    Args:
      obj: The object to count blocks for.

    Returns:
      A dict of block hashcode -> total bytes occurring.
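
    Example:
      A rough illustration; a small Blob is constructed directly rather
      than read from a repository::

        from dulwich.objects import Blob

        blob = Blob.from_string(b"x" * 100)
        counts = _count_blocks(blob)
        # Two blocks: a 64-byte run of b"x" and the remaining 36 bytes.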
    """
    block_counts: Dict[int, int] = defaultdict(int)
    block = BytesIO()
    n = 0

    # Cache attrs as locals to avoid expensive lookups in the inner loop.
    block_write = block.write
    block_seek = block.seek
    block_truncate = block.truncate
    block_getvalue = block.getvalue

    for c in chain.from_iterable(obj.as_raw_chunks()):
        cb = c.to_bytes(1, "big")
        block_write(cb)
        n += 1
        if cb == b"\n" or n == _BLOCK_SIZE:
            value = block_getvalue()
            block_counts[hash(value)] += len(value)
            block_seek(0)
            block_truncate()
            n = 0
    if n > 0:
        last_block = block_getvalue()
        block_counts[hash(last_block)] += len(last_block)
    return block_counts


def _common_bytes(blocks1, blocks2):
    """Count the number of common bytes in two block count dicts.

    Args:
      blocks1: The first dict of block hashcode -> total bytes.
      blocks2: The second dict of block hashcode -> total bytes.

    Returns:
      The number of bytes in common between blocks1 and blocks2. This is
      only approximate due to possible hash collisions.
    """
    # Iterate over the smaller of the two dicts, since this is symmetrical.
    if len(blocks1) > len(blocks2):
        blocks1, blocks2 = blocks2, blocks1
    score = 0
    for block, count1 in blocks1.items():
        count2 = blocks2.get(block)
        if count2:
            score += min(count1, count2)
    return score


def _similarity_score(obj1, obj2, block_cache=None):
    """Compute a similarity score for two objects.

    Args:
      obj1: The first object to score.
      obj2: The second object to score.
      block_cache: An optional dict of SHA to block counts to cache
        results between calls.

    Returns:
      The similarity score between the two objects, defined as the
      number of bytes in common between the two objects divided by the
      maximum size, scaled to the range 0-100.
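
    Example:
      A rough illustration; two small Blobs are constructed directly
      rather than read from a repository::

        from dulwich.objects import Blob

        obj1 = Blob.from_string(b"x" * 64 + b"y" * 64)
        obj2 = Blob.from_string(b"x" * 64 + b"z" * 64)
        _similarity_score(obj1, obj2)  # 64 of 128 bytes in common -> 50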
    """
    if block_cache is None:
        block_cache = {}
    if obj1.id not in block_cache:
        block_cache[obj1.id] = _count_blocks(obj1)
    if obj2.id not in block_cache:
        block_cache[obj2.id] = _count_blocks(obj2)

    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
    max_size = max(obj1.raw_length(), obj2.raw_length())
    if not max_size:
        return _MAX_SCORE
    return int(float(common_bytes) * _MAX_SCORE / max_size)


def _tree_change_key(entry):
    # Sort by old path then new path. If only one exists, use it for both keys.
    path1 = entry.old.path
    path2 = entry.new.path
    if path1 is None:
        path1 = path2
    if path2 is None:
        path2 = path1
    return (path1, path2)


class RenameDetector:
    """Object for handling rename detection between two trees."""

    def __init__(
        self,
        store,
        rename_threshold=RENAME_THRESHOLD,
        max_files=MAX_FILES,
        rewrite_threshold=REWRITE_THRESHOLD,
        find_copies_harder=False,
    ) -> None:
        """Initialize the rename detector.

        Args:
          store: An ObjectStore for looking up objects.
          rename_threshold: The threshold similarity score for considering
            an add/delete pair to be a rename/copy; see _similarity_score.
          max_files: The maximum number of adds and deletes to consider,
            or None for no limit. The detector is guaranteed to compare no
            more than max_files ** 2 add/delete pairs. This limit is provided
            because rename detection can be quadratic in the project size. If
            the limit is exceeded, no content rename detection is attempted.
          rewrite_threshold: The threshold similarity score below which a
            modify should be considered a delete/add, or None to not break
            modifies; see _similarity_score.
          find_copies_harder: If True, consider unmodified files when
            detecting copies.
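
        Example:
          A minimal sketch; ``r`` is assumed to be an existing repository
          and ``old_tree``/``new_tree`` tree SHAs from it::

            detector = RenameDetector(r.object_store)
            for change in tree_changes(r.object_store, old_tree, new_tree,
                                       rename_detector=detector):
                if change.type in RENAME_CHANGE_TYPES:
                    print(change.type, change.old.path, change.new.path)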
        """
        self._store = store
        self._rename_threshold = rename_threshold
        self._rewrite_threshold = rewrite_threshold
        self._max_files = max_files
        self._find_copies_harder = find_copies_harder
        self._want_unchanged = False

    def _reset(self):
        self._adds = []
        self._deletes = []
        self._changes = []

    def _should_split(self, change):
        if (
            self._rewrite_threshold is None
            or change.type != CHANGE_MODIFY
            or change.old.sha == change.new.sha
        ):
            return False
        old_obj = self._store[change.old.sha]
        new_obj = self._store[change.new.sha]
        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold

    def _add_change(self, change):
        if change.type == CHANGE_ADD:
            self._adds.append(change)
        elif change.type == CHANGE_DELETE:
            self._deletes.append(change)
        elif self._should_split(change):
            self._deletes.append(TreeChange.delete(change.old))
            self._adds.append(TreeChange.add(change.new))
        elif (
            self._find_copies_harder and change.type == CHANGE_UNCHANGED
        ) or change.type == CHANGE_MODIFY:
            # Treat all modifies as potential deletes for rename detection,
            # but don't split them (to avoid spurious renames). Setting
            # find_copies_harder means we treat unchanged the same as
            # modified.
            self._deletes.append(change)
        else:
            self._changes.append(change)

    def _collect_changes(self, tree1_id, tree2_id):
        want_unchanged = self._find_copies_harder or self._want_unchanged
        for change in tree_changes(
            self._store,
            tree1_id,
            tree2_id,
            want_unchanged=want_unchanged,
            include_trees=self._include_trees,
        ):
            self._add_change(change)

    def _prune(self, add_paths, delete_paths):
        self._adds = [a for a in self._adds if a.new.path not in add_paths]
        self._deletes = [d for d in self._deletes if d.old.path not in delete_paths]

    def _find_exact_renames(self):
        add_map = defaultdict(list)
        for add in self._adds:
            add_map[add.new.sha].append(add.new)
        delete_map = defaultdict(list)
        for delete in self._deletes:
            # Keep track of whether the delete was actually marked as a delete.
            # If not, it needs to be marked as a copy.
            is_delete = delete.type == CHANGE_DELETE
            delete_map[delete.old.sha].append((delete.old, is_delete))

        add_paths = set()
        delete_paths = set()
        for sha, sha_deletes in delete_map.items():
            sha_adds = add_map[sha]
            for (old, is_delete), new in zip(sha_deletes, sha_adds):
                if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
                    continue
                if is_delete:
                    delete_paths.add(old.path)
                add_paths.add(new.path)
                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
                self._changes.append(TreeChange(new_type, old, new))

            num_extra_adds = len(sha_adds) - len(sha_deletes)
            # TODO(dborowitz): Less arbitrary way of dealing with extra copies.
            old = sha_deletes[0][0]
            if num_extra_adds > 0:
                for new in sha_adds[-num_extra_adds:]:
                    add_paths.add(new.path)
                    self._changes.append(TreeChange(CHANGE_COPY, old, new))
        self._prune(add_paths, delete_paths)

    def _should_find_content_renames(self):
        if self._max_files is None:
            return True
        return len(self._adds) * len(self._deletes) <= self._max_files**2

    def _rename_type(self, check_paths, delete, add):
        if check_paths and delete.old.path == add.new.path:
            # If the paths match, this must be a split modify, so make sure it
            # comes out as a modify.
            return CHANGE_MODIFY
        elif delete.type != CHANGE_DELETE:
            # If it's in deletes but not marked as a delete, it must have been
            # added due to find_copies_harder, and needs to be marked as a
            # copy.
            return CHANGE_COPY
        return CHANGE_RENAME

    def _find_content_rename_candidates(self):
        candidates = self._candidates = []
        # TODO: Optimizations:
        # - Compare object sizes before counting blocks.
        # - Skip if delete's S_IFMT differs from all adds.
        # - Skip if adds or deletes is empty.
        # Match C git's behavior of not attempting to find content renames if
        # the matrix size exceeds the threshold.
        if not self._should_find_content_renames():
            return

        block_cache = {}
        check_paths = self._rename_threshold is not None
        for delete in self._deletes:
            if S_ISGITLINK(delete.old.mode):
                continue  # Git links don't exist in this repo.
            old_sha = delete.old.sha
            old_obj = self._store[old_sha]
            block_cache[old_sha] = _count_blocks(old_obj)
            for add in self._adds:
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                    continue
                new_obj = self._store[add.new.sha]
                score = _similarity_score(old_obj, new_obj, block_cache=block_cache)
                if score > self._rename_threshold:
                    new_type = self._rename_type(check_paths, delete, add)
                    rename = TreeChange(new_type, delete.old, add.new)
                    candidates.append((-score, rename))

    def _choose_content_renames(self):
        # Sort scores from highest to lowest, but keep names in ascending
        # order.
        self._candidates.sort()

        delete_paths = set()
        add_paths = set()
        for _, change in self._candidates:
            new_path = change.new.path
            if new_path in add_paths:
                continue
            old_path = change.old.path
            orig_type = change.type
            if old_path in delete_paths:
                change = TreeChange(CHANGE_COPY, change.old, change.new)

            # If the candidate was originally a copy, that means it came from a
            # modified or unchanged path, so we don't want to prune it.
            if orig_type != CHANGE_COPY:
                delete_paths.add(old_path)
            add_paths.add(new_path)
            self._changes.append(change)
        self._prune(add_paths, delete_paths)

    def _join_modifies(self):
        if self._rewrite_threshold is None:
            return

        modifies = {}
        delete_map = {d.old.path: d for d in self._deletes}
        for add in self._adds:
            path = add.new.path
            delete = delete_map.get(path)
            if delete is not None and stat.S_IFMT(delete.old.mode) == stat.S_IFMT(
                add.new.mode
            ):
                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)

        self._adds = [a for a in self._adds if a.new.path not in modifies]
        self._deletes = [d for d in self._deletes if d.old.path not in modifies]
        self._changes += modifies.values()

    def _sorted_changes(self):
        result = []
        result.extend(self._adds)
        result.extend(self._deletes)
        result.extend(self._changes)
        result.sort(key=_tree_change_key)
        return result

    def _prune_unchanged(self):
        if self._want_unchanged:
            return
        self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]

    def changes_with_renames(
        self, tree1_id, tree2_id, want_unchanged=False, include_trees=False
    ):
        """Iterate TreeChanges between two tree SHAs, with rename detection."""
        self._reset()
        self._want_unchanged = want_unchanged
        self._include_trees = include_trees
        self._collect_changes(tree1_id, tree2_id)
        self._find_exact_renames()
        self._find_content_rename_candidates()
        self._choose_content_renames()
        self._join_modifies()
        self._prune_unchanged()
        return self._sorted_changes()


# Hold on to the pure-python implementations for testing.
_is_tree_py = _is_tree
_merge_entries_py = _merge_entries
_count_blocks_py = _count_blocks
try:
    # Try to import Rust versions
    from dulwich._diff_tree import (  # type: ignore
        _count_blocks,
        _is_tree,
        _merge_entries,
    )
except ImportError:
    pass