/src/git/diffcore-rename.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * |
3 | | * Copyright (C) 2005 Junio C Hamano |
4 | | */ |
5 | | |
6 | | #define USE_THE_REPOSITORY_VARIABLE |
7 | | |
8 | | #include "git-compat-util.h" |
9 | | #include "diff.h" |
10 | | #include "diffcore.h" |
11 | | #include "object-store-ll.h" |
12 | | #include "hashmap.h" |
13 | | #include "mem-pool.h" |
14 | | #include "oid-array.h" |
15 | | #include "progress.h" |
16 | | #include "promisor-remote.h" |
17 | | #include "string-list.h" |
18 | | #include "strmap.h" |
19 | | #include "trace2.h" |
20 | | |
21 | | /* Table of rename/copy destinations */ |
22 | | |
23 | | static struct diff_rename_dst { |
24 | | struct diff_filepair *p; |
25 | | struct diff_filespec *filespec_to_free; |
26 | | int is_rename; /* false -> just a create; true -> rename or copy */ |
27 | | } *rename_dst; |
28 | | static int rename_dst_nr, rename_dst_alloc; |
29 | | /* Mapping from break source pathname to break destination index */ |
30 | | static struct strintmap *break_idx = NULL; |
31 | | |
32 | | static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p) |
33 | 0 | { |
34 | | /* Lookup by p->ONE->path */ |
35 | 0 | int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1; |
36 | 0 | return (idx == -1) ? NULL : &rename_dst[idx]; |
37 | 0 | } |
38 | | |
39 | | /* |
40 | | * Returns 0 on success, -1 if we found a duplicate. |
41 | | */ |
42 | | static int add_rename_dst(struct diff_filepair *p) |
43 | 0 | { |
44 | 0 | ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); |
45 | 0 | rename_dst[rename_dst_nr].p = p; |
46 | 0 | rename_dst[rename_dst_nr].filespec_to_free = NULL; |
47 | 0 | rename_dst[rename_dst_nr].is_rename = 0; |
48 | 0 | rename_dst_nr++; |
49 | 0 | return 0; |
50 | 0 | } |
51 | | |
52 | | /* Table of rename/copy src files */ |
53 | | static struct diff_rename_src { |
54 | | struct diff_filepair *p; |
55 | | unsigned short score; /* to remember the break score */ |
56 | | } *rename_src; |
57 | | static int rename_src_nr, rename_src_alloc; |
58 | | |
59 | | static void register_rename_src(struct diff_filepair *p) |
60 | 0 | { |
61 | 0 | if (p->broken_pair) { |
62 | 0 | if (!break_idx) { |
63 | 0 | break_idx = xmalloc(sizeof(*break_idx)); |
64 | 0 | strintmap_init_with_options(break_idx, -1, NULL, 0); |
65 | 0 | } |
66 | 0 | strintmap_set(break_idx, p->one->path, rename_dst_nr); |
67 | 0 | } |
68 | |
|
69 | 0 | ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc); |
70 | 0 | rename_src[rename_src_nr].p = p; |
71 | 0 | rename_src[rename_src_nr].score = p->score; |
72 | 0 | rename_src_nr++; |
73 | 0 | } |
74 | | |
75 | | static int basename_same(struct diff_filespec *src, struct diff_filespec *dst) |
76 | 0 | { |
77 | 0 | int src_len = strlen(src->path), dst_len = strlen(dst->path); |
78 | 0 | while (src_len && dst_len) { |
79 | 0 | char c1 = src->path[--src_len]; |
80 | 0 | char c2 = dst->path[--dst_len]; |
81 | 0 | if (c1 != c2) |
82 | 0 | return 0; |
83 | 0 | if (c1 == '/') |
84 | 0 | return 1; |
85 | 0 | } |
86 | 0 | return (!src_len || src->path[src_len - 1] == '/') && |
87 | 0 | (!dst_len || dst->path[dst_len - 1] == '/'); |
88 | 0 | } |
89 | | |
90 | | struct diff_score { |
91 | | int src; /* index in rename_src */ |
92 | | int dst; /* index in rename_dst */ |
93 | | unsigned short score; |
94 | | short name_score; |
95 | | }; |
96 | | |
97 | | struct inexact_prefetch_options { |
98 | | struct repository *repo; |
99 | | int skip_unmodified; |
100 | | }; |
101 | | static void inexact_prefetch(void *prefetch_options) |
102 | 0 | { |
103 | 0 | struct inexact_prefetch_options *options = prefetch_options; |
104 | 0 | int i; |
105 | 0 | struct oid_array to_fetch = OID_ARRAY_INIT; |
106 | |
|
107 | 0 | for (i = 0; i < rename_dst_nr; i++) { |
108 | 0 | if (rename_dst[i].p->renamed_pair) |
109 | | /* |
110 | | * The loop in diffcore_rename() will not need these |
111 | | * blobs, so skip prefetching. |
112 | | */ |
113 | 0 | continue; /* already found exact match */ |
114 | 0 | diff_add_if_missing(options->repo, &to_fetch, |
115 | 0 | rename_dst[i].p->two); |
116 | 0 | } |
117 | 0 | for (i = 0; i < rename_src_nr; i++) { |
118 | 0 | if (options->skip_unmodified && |
119 | 0 | diff_unmodified_pair(rename_src[i].p)) |
120 | | /* |
121 | | * The loop in diffcore_rename() will not need these |
122 | | * blobs, so skip prefetching. |
123 | | */ |
124 | 0 | continue; |
125 | 0 | diff_add_if_missing(options->repo, &to_fetch, |
126 | 0 | rename_src[i].p->one); |
127 | 0 | } |
128 | 0 | promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr); |
129 | 0 | oid_array_clear(&to_fetch); |
130 | 0 | } |
131 | | |
132 | | static int estimate_similarity(struct repository *r, |
133 | | struct diff_filespec *src, |
134 | | struct diff_filespec *dst, |
135 | | int minimum_score, |
136 | | struct diff_populate_filespec_options *dpf_opt) |
137 | 0 | { |
138 | | /* src points at a file that existed in the original tree (or |
139 | | * optionally a file in the destination tree) and dst points |
140 | | * at a newly created file. They may be quite similar, in which |
141 | | * case we want to say src is renamed to dst or src is copied into |
142 | | * dst, and then some edit has been applied to dst. |
143 | | * |
144 | | * Compare them and return how similar they are, representing |
145 | | * the score as an integer between 0 and MAX_SCORE. |
146 | | * |
147 | | * When there is an exact match, it is considered a better |
148 | | * match than anything else; the destination does not even |
149 | | * call into this function in that case. |
150 | | */ |
151 | 0 | unsigned long max_size, delta_size, base_size, src_copied, literal_added; |
152 | 0 | int score; |
153 | | |
154 | | /* We deal only with regular files. Symlink renames are handled |
155 | | * only when they are exact matches --- in other words, no edits |
156 | | * after renaming. |
157 | | */ |
158 | 0 | if (!S_ISREG(src->mode) || !S_ISREG(dst->mode)) |
159 | 0 | return 0; |
160 | | |
161 | | /* |
162 | | * Need to check that source and destination sizes are |
163 | | * filled in before comparing them. |
164 | | * |
165 | | * If we already have "cnt_data" filled in, we know it's |
166 | | * all good (avoid checking the size for zero, as that |
167 | | * is a possible size - we really should have a flag to |
168 | | * say whether the size is valid or not!) |
169 | | */ |
170 | 0 | dpf_opt->check_size_only = 1; |
171 | |
|
172 | 0 | if (!src->cnt_data && |
173 | 0 | diff_populate_filespec(r, src, dpf_opt)) |
174 | 0 | return 0; |
175 | 0 | if (!dst->cnt_data && |
176 | 0 | diff_populate_filespec(r, dst, dpf_opt)) |
177 | 0 | return 0; |
178 | | |
179 | 0 | max_size = ((src->size > dst->size) ? src->size : dst->size); |
180 | 0 | base_size = ((src->size < dst->size) ? src->size : dst->size); |
181 | 0 | delta_size = max_size - base_size; |
182 | | |
183 | | /* We would not consider edits that change the file size so |
184 | | * drastically. delta_size must be smaller than |
185 | | * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size). |
186 | | * |
187 | | * Note that base_size == 0 case is handled here already |
188 | | * and the final score computation below would not have a |
189 | | * divide-by-zero issue. |
190 | | */ |
191 | 0 | if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) |
192 | 0 | return 0; |
193 | | |
194 | 0 | dpf_opt->check_size_only = 0; |
195 | |
|
196 | 0 | if (!src->cnt_data && diff_populate_filespec(r, src, dpf_opt)) |
197 | 0 | return 0; |
198 | 0 | if (!dst->cnt_data && diff_populate_filespec(r, dst, dpf_opt)) |
199 | 0 | return 0; |
200 | | |
201 | 0 | if (diffcore_count_changes(r, src, dst, |
202 | 0 | &src->cnt_data, &dst->cnt_data, |
203 | 0 | &src_copied, &literal_added)) |
204 | 0 | return 0; |
205 | | |
206 | | /* How similar are they? |
207 | | * what percentage of material in dst are from source? |
208 | | */ |
209 | 0 | if (!dst->size) |
210 | 0 | score = 0; /* should not happen */ |
211 | 0 | else |
212 | 0 | score = (int)(src_copied * MAX_SCORE / max_size); |
213 | 0 | return score; |
214 | 0 | } |
215 | | |
216 | | static void record_rename_pair(int dst_index, int src_index, int score) |
217 | 0 | { |
218 | 0 | struct diff_filepair *src = rename_src[src_index].p; |
219 | 0 | struct diff_filepair *dst = rename_dst[dst_index].p; |
220 | |
|
221 | 0 | if (dst->renamed_pair) |
222 | 0 | die("internal error: dst already matched."); |
223 | | |
224 | 0 | src->one->rename_used++; |
225 | 0 | src->one->count++; |
226 | |
|
227 | 0 | rename_dst[dst_index].filespec_to_free = dst->one; |
228 | 0 | rename_dst[dst_index].is_rename = 1; |
229 | |
|
230 | 0 | dst->one = src->one; |
231 | 0 | dst->renamed_pair = 1; |
232 | 0 | if (!strcmp(dst->one->path, dst->two->path)) |
233 | 0 | dst->score = rename_src[src_index].score; |
234 | 0 | else |
235 | 0 | dst->score = score; |
236 | 0 | } |
237 | | |
238 | | /* |
239 | | * We sort the rename similarity matrix with the score, in descending |
240 | | * order (the most similar first). |
241 | | */ |
242 | | static int score_compare(const void *a_, const void *b_) |
243 | 0 | { |
244 | 0 | const struct diff_score *a = a_, *b = b_; |
245 | | |
246 | | /* sink the unused ones to the bottom */ |
247 | 0 | if (a->dst < 0) |
248 | 0 | return (0 <= b->dst); |
249 | 0 | else if (b->dst < 0) |
250 | 0 | return -1; |
251 | | |
252 | 0 | if (a->score == b->score) |
253 | 0 | return b->name_score - a->name_score; |
254 | | |
255 | 0 | return b->score - a->score; |
256 | 0 | } |
257 | | |
258 | | struct file_similarity { |
259 | | struct hashmap_entry entry; |
260 | | int index; |
261 | | struct diff_filespec *filespec; |
262 | | }; |
263 | | |
264 | | static unsigned int hash_filespec(struct repository *r, |
265 | | struct diff_filespec *filespec) |
266 | 0 | { |
267 | 0 | if (!filespec->oid_valid) { |
268 | 0 | if (diff_populate_filespec(r, filespec, NULL)) |
269 | 0 | return 0; |
270 | 0 | hash_object_file(r->hash_algo, filespec->data, filespec->size, |
271 | 0 | OBJ_BLOB, &filespec->oid); |
272 | 0 | } |
273 | 0 | return oidhash(&filespec->oid); |
274 | 0 | } |
275 | | |
276 | | static int find_identical_files(struct hashmap *srcs, |
277 | | int dst_index, |
278 | | struct diff_options *options) |
279 | 0 | { |
280 | 0 | int renames = 0; |
281 | 0 | struct diff_filespec *target = rename_dst[dst_index].p->two; |
282 | 0 | struct file_similarity *p, *best = NULL; |
283 | 0 | int i = 100, best_score = -1; |
284 | 0 | unsigned int hash = hash_filespec(options->repo, target); |
285 | | |
286 | | /* |
287 | | * Find the best source match for specified destination. |
288 | | */ |
289 | 0 | p = hashmap_get_entry_from_hash(srcs, hash, NULL, |
290 | 0 | struct file_similarity, entry); |
291 | 0 | hashmap_for_each_entry_from(srcs, p, entry) { |
292 | 0 | int score; |
293 | 0 | struct diff_filespec *source = p->filespec; |
294 | | |
295 | | /* False hash collision? */ |
296 | 0 | if (!oideq(&source->oid, &target->oid)) |
297 | 0 | continue; |
298 | | /* Non-regular files? If so, the modes must match! */ |
299 | 0 | if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) { |
300 | 0 | if (source->mode != target->mode) |
301 | 0 | continue; |
302 | 0 | } |
303 | | /* Give higher scores to sources that haven't been used already */ |
304 | 0 | score = !source->rename_used; |
305 | 0 | if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY) |
306 | 0 | continue; |
307 | 0 | score += basename_same(source, target); |
308 | 0 | if (score > best_score) { |
309 | 0 | best = p; |
310 | 0 | best_score = score; |
311 | 0 | if (score == 2) |
312 | 0 | break; |
313 | 0 | } |
314 | | |
315 | | /* Too many identical alternatives? Pick one */ |
316 | 0 | if (!--i) |
317 | 0 | break; |
318 | 0 | } |
319 | 0 | if (best) { |
320 | 0 | record_rename_pair(dst_index, best->index, MAX_SCORE); |
321 | 0 | renames++; |
322 | 0 | } |
323 | 0 | return renames; |
324 | 0 | } |
325 | | |
326 | | static void insert_file_table(struct repository *r, |
327 | | struct mem_pool *pool, |
328 | | struct hashmap *table, int index, |
329 | | struct diff_filespec *filespec) |
330 | 0 | { |
331 | 0 | struct file_similarity *entry = mem_pool_alloc(pool, sizeof(*entry)); |
332 | |
|
333 | 0 | entry->index = index; |
334 | 0 | entry->filespec = filespec; |
335 | |
|
336 | 0 | hashmap_entry_init(&entry->entry, hash_filespec(r, filespec)); |
337 | 0 | hashmap_add(table, &entry->entry); |
338 | 0 | } |
339 | | |
340 | | /* |
341 | | * Find exact renames first. |
342 | | * |
343 | | * The first round matches up the up-to-date entries, |
344 | | * and then during the second round we try to match |
345 | | * cache-dirty entries as well. |
346 | | */ |
347 | | static int find_exact_renames(struct diff_options *options, |
348 | | struct mem_pool *pool) |
349 | 0 | { |
350 | 0 | int i, renames = 0; |
351 | 0 | struct hashmap file_table; |
352 | | |
353 | | /* Add all sources to the hash table in reverse order, because |
354 | | * later on they will be retrieved in LIFO order. |
355 | | */ |
356 | 0 | hashmap_init(&file_table, NULL, NULL, rename_src_nr); |
357 | 0 | for (i = rename_src_nr-1; i >= 0; i--) |
358 | 0 | insert_file_table(options->repo, pool, |
359 | 0 | &file_table, i, |
360 | 0 | rename_src[i].p->one); |
361 | | |
362 | | /* Walk the destinations and find best source match */ |
363 | 0 | for (i = 0; i < rename_dst_nr; i++) |
364 | 0 | renames += find_identical_files(&file_table, i, options); |
365 | | |
366 | | /* Free the hash data structure (entries will be freed with the pool) */ |
367 | 0 | hashmap_clear(&file_table); |
368 | |
|
369 | 0 | return renames; |
370 | 0 | } |
371 | | |
372 | | struct dir_rename_info { |
373 | | struct strintmap idx_map; |
374 | | struct strmap dir_rename_guess; |
375 | | struct strmap *dir_rename_count; |
376 | | struct strintmap *relevant_source_dirs; |
377 | | unsigned setup; |
378 | | }; |
379 | | |
380 | | static char *get_dirname(const char *filename) |
381 | 0 | { |
382 | 0 | char *slash = strrchr(filename, '/'); |
383 | 0 | return slash ? xstrndup(filename, slash - filename) : xstrdup(""); |
384 | 0 | } |
385 | | |
386 | | static void dirname_munge(char *filename) |
387 | 0 | { |
388 | 0 | char *slash = strrchr(filename, '/'); |
389 | 0 | if (!slash) |
390 | 0 | slash = filename; |
391 | 0 | *slash = '\0'; |
392 | 0 | } |
393 | | |
394 | | static const char *get_highest_rename_path(struct strintmap *counts) |
395 | 0 | { |
396 | 0 | int highest_count = 0; |
397 | 0 | const char *highest_destination_dir = NULL; |
398 | 0 | struct hashmap_iter iter; |
399 | 0 | struct strmap_entry *entry; |
400 | |
|
401 | 0 | strintmap_for_each_entry(counts, &iter, entry) { |
402 | 0 | const char *destination_dir = entry->key; |
403 | 0 | intptr_t count = (intptr_t)entry->value; |
404 | 0 | if (count > highest_count) { |
405 | 0 | highest_count = count; |
406 | 0 | highest_destination_dir = destination_dir; |
407 | 0 | } |
408 | 0 | } |
409 | 0 | return highest_destination_dir; |
410 | 0 | } |
411 | | |
412 | | static const char *UNKNOWN_DIR = "/"; /* placeholder -- short, illegal directory */ |
413 | | |
414 | | static int dir_rename_already_determinable(struct strintmap *counts) |
415 | 0 | { |
416 | 0 | struct hashmap_iter iter; |
417 | 0 | struct strmap_entry *entry; |
418 | 0 | int first = 0, second = 0, unknown = 0; |
419 | 0 | strintmap_for_each_entry(counts, &iter, entry) { |
420 | 0 | const char *destination_dir = entry->key; |
421 | 0 | intptr_t count = (intptr_t)entry->value; |
422 | 0 | if (!strcmp(destination_dir, UNKNOWN_DIR)) { |
423 | 0 | unknown = count; |
424 | 0 | } else if (count >= first) { |
425 | 0 | second = first; |
426 | 0 | first = count; |
427 | 0 | } else if (count >= second) { |
428 | 0 | second = count; |
429 | 0 | } |
430 | 0 | } |
431 | 0 | return first > second + unknown; |
432 | 0 | } |
433 | | |
434 | | static void increment_count(struct dir_rename_info *info, |
435 | | const char *old_dir, |
436 | | const char *new_dir) |
437 | 0 | { |
438 | 0 | struct strintmap *counts; |
439 | 0 | struct strmap_entry *e; |
440 | | |
441 | | /* Get the {new_dirs -> counts} mapping using old_dir */ |
442 | 0 | e = strmap_get_entry(info->dir_rename_count, old_dir); |
443 | 0 | if (e) { |
444 | 0 | counts = e->value; |
445 | 0 | } else { |
446 | 0 | counts = xmalloc(sizeof(*counts)); |
447 | 0 | strintmap_init_with_options(counts, 0, NULL, 1); |
448 | 0 | strmap_put(info->dir_rename_count, old_dir, counts); |
449 | 0 | } |
450 | | |
451 | | /* Increment the count for new_dir */ |
452 | 0 | strintmap_incr(counts, new_dir, 1); |
453 | 0 | } |
454 | | |
455 | | static void update_dir_rename_counts(struct dir_rename_info *info, |
456 | | struct strintmap *dirs_removed, |
457 | | const char *oldname, |
458 | | const char *newname) |
459 | 0 | { |
460 | 0 | char *old_dir; |
461 | 0 | char *new_dir; |
462 | 0 | const char new_dir_first_char = newname[0]; |
463 | 0 | int first_time_in_loop = 1; |
464 | |
|
465 | 0 | if (!info->setup) |
466 | | /* |
467 | | * info->setup is 0 here in two cases: (1) all auxiliary |
468 | | * vars (like dirs_removed) were NULL so |
469 | | * initialize_dir_rename_info() returned early, or (2) |
470 | | * either break detection or copy detection are active so |
471 | | * that we never called initialize_dir_rename_info(). In |
472 | | * the former case, we don't have enough info to know if |
473 | | * directories were renamed (because dirs_removed lets us |
474 | | * know about a necessary prerequisite, namely if they were |
475 | | * removed), and in the latter, we don't care about |
476 | | * directory renames or find_basename_matches. |
477 | | * |
478 | | * This matters because both basename and inexact matching |
479 | | * will also call update_dir_rename_counts(). In either of |
480 | | * the above two cases info->dir_rename_counts will not |
481 | | * have been properly initialized which prevents us from |
482 | | * updating it, but in these two cases we don't care about |
483 | | * dir_rename_counts anyway, so we can just exit early. |
484 | | */ |
485 | 0 | return; |
486 | | |
487 | | |
488 | 0 | old_dir = xstrdup(oldname); |
489 | 0 | new_dir = xstrdup(newname); |
490 | |
|
491 | 0 | while (1) { |
492 | 0 | int drd_flag = NOT_RELEVANT; |
493 | | |
494 | | /* Get old_dir, skip if its directory isn't relevant. */ |
495 | 0 | dirname_munge(old_dir); |
496 | 0 | if (info->relevant_source_dirs && |
497 | 0 | !strintmap_contains(info->relevant_source_dirs, old_dir)) |
498 | 0 | break; |
499 | | |
500 | | /* Get new_dir */ |
501 | 0 | dirname_munge(new_dir); |
502 | | |
503 | | /* |
504 | | * When renaming |
505 | | * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c" |
506 | | * then this suggests that both |
507 | | * a/b/c/d/e/ => a/b/some/thing/else/e/ |
508 | | * a/b/c/d/ => a/b/some/thing/else/ |
509 | | * so we want to increment counters for both. We do NOT, |
510 | | * however, also want to suggest that there was the following |
511 | | * rename: |
512 | | * a/b/c/ => a/b/some/thing/ |
513 | | * so we need to quit at that point. |
514 | | * |
515 | | * Note the when first_time_in_loop, we only strip off the |
516 | | * basename, and we don't care if that's different. |
517 | | */ |
518 | 0 | if (!first_time_in_loop) { |
519 | 0 | char *old_sub_dir = strchr(old_dir, '\0')+1; |
520 | 0 | char *new_sub_dir = strchr(new_dir, '\0')+1; |
521 | 0 | if (!*new_dir) { |
522 | | /* |
523 | | * Special case when renaming to root directory, |
524 | | * i.e. when new_dir == "". In this case, we had |
525 | | * something like |
526 | | * a/b/subdir => subdir |
527 | | * and so dirname_munge() sets things up so that |
528 | | * old_dir = "a/b\0subdir\0" |
529 | | * new_dir = "\0ubdir\0" |
530 | | * We didn't have a '/' to overwrite a '\0' onto |
531 | | * in new_dir, so we have to compare differently. |
532 | | */ |
533 | 0 | if (new_dir_first_char != old_sub_dir[0] || |
534 | 0 | strcmp(old_sub_dir+1, new_sub_dir)) |
535 | 0 | break; |
536 | 0 | } else { |
537 | 0 | if (strcmp(old_sub_dir, new_sub_dir)) |
538 | 0 | break; |
539 | 0 | } |
540 | 0 | } |
541 | | |
542 | | /* |
543 | | * Above we suggested that we'd keep recording renames for |
544 | | * all ancestor directories where the trailing directories |
545 | | * matched, i.e. for |
546 | | * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c" |
547 | | * we'd increment rename counts for each of |
548 | | * a/b/c/d/e/ => a/b/some/thing/else/e/ |
549 | | * a/b/c/d/ => a/b/some/thing/else/ |
550 | | * However, we only need the rename counts for directories |
551 | | * in dirs_removed whose value is RELEVANT_FOR_SELF. |
552 | | * However, we add one special case of also recording it for |
553 | | * first_time_in_loop because find_basename_matches() can |
554 | | * use that as a hint to find a good pairing. |
555 | | */ |
556 | 0 | if (dirs_removed) |
557 | 0 | drd_flag = strintmap_get(dirs_removed, old_dir); |
558 | 0 | if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop) |
559 | 0 | increment_count(info, old_dir, new_dir); |
560 | |
|
561 | 0 | first_time_in_loop = 0; |
562 | 0 | if (drd_flag == NOT_RELEVANT) |
563 | 0 | break; |
564 | | /* If we hit toplevel directory ("") for old or new dir, quit */ |
565 | 0 | if (!*old_dir || !*new_dir) |
566 | 0 | break; |
567 | 0 | } |
568 | | |
569 | | /* Free resources we don't need anymore */ |
570 | 0 | free(old_dir); |
571 | 0 | free(new_dir); |
572 | 0 | } |
573 | | |
574 | | static void initialize_dir_rename_info(struct dir_rename_info *info, |
575 | | struct strintmap *relevant_sources, |
576 | | struct strintmap *dirs_removed, |
577 | | struct strmap *dir_rename_count, |
578 | | struct strmap *cached_pairs) |
579 | 0 | { |
580 | 0 | struct hashmap_iter iter; |
581 | 0 | struct strmap_entry *entry; |
582 | 0 | int i; |
583 | |
|
584 | 0 | if (!dirs_removed && !relevant_sources) { |
585 | 0 | info->setup = 0; |
586 | 0 | return; |
587 | 0 | } |
588 | 0 | info->setup = 1; |
589 | |
|
590 | 0 | info->dir_rename_count = dir_rename_count; |
591 | 0 | if (!info->dir_rename_count) { |
592 | 0 | info->dir_rename_count = xmalloc(sizeof(*dir_rename_count)); |
593 | 0 | strmap_init(info->dir_rename_count); |
594 | 0 | } |
595 | 0 | strintmap_init_with_options(&info->idx_map, -1, NULL, 0); |
596 | 0 | strmap_init_with_options(&info->dir_rename_guess, NULL, 0); |
597 | | |
598 | | /* Setup info->relevant_source_dirs */ |
599 | 0 | info->relevant_source_dirs = NULL; |
600 | 0 | if (dirs_removed || !relevant_sources) { |
601 | 0 | info->relevant_source_dirs = dirs_removed; /* might be NULL */ |
602 | 0 | } else { |
603 | 0 | info->relevant_source_dirs = xmalloc(sizeof(struct strintmap)); |
604 | 0 | strintmap_init(info->relevant_source_dirs, 0 /* unused */); |
605 | 0 | strintmap_for_each_entry(relevant_sources, &iter, entry) { |
606 | 0 | char *dirname = get_dirname(entry->key); |
607 | 0 | if (!dirs_removed || |
608 | 0 | strintmap_contains(dirs_removed, dirname)) |
609 | 0 | strintmap_set(info->relevant_source_dirs, |
610 | 0 | dirname, 0 /* value irrelevant */); |
611 | 0 | free(dirname); |
612 | 0 | } |
613 | 0 | } |
614 | | |
615 | | /* |
616 | | * Loop setting up both info->idx_map, and doing setup of |
617 | | * info->dir_rename_count. |
618 | | */ |
619 | 0 | for (i = 0; i < rename_dst_nr; ++i) { |
620 | | /* |
621 | | * For non-renamed files, make idx_map contain mapping of |
622 | | * filename -> index (index within rename_dst, that is) |
623 | | */ |
624 | 0 | if (!rename_dst[i].is_rename) { |
625 | 0 | char *filename = rename_dst[i].p->two->path; |
626 | 0 | strintmap_set(&info->idx_map, filename, i); |
627 | 0 | continue; |
628 | 0 | } |
629 | | |
630 | | /* |
631 | | * For everything else (i.e. renamed files), make |
632 | | * dir_rename_count contain a map of a map: |
633 | | * old_directory -> {new_directory -> count} |
634 | | * In other words, for every pair look at the directories for |
635 | | * the old filename and the new filename and count how many |
636 | | * times that pairing occurs. |
637 | | */ |
638 | 0 | update_dir_rename_counts(info, dirs_removed, |
639 | 0 | rename_dst[i].p->one->path, |
640 | 0 | rename_dst[i].p->two->path); |
641 | 0 | } |
642 | | |
643 | | /* Add cached_pairs to counts */ |
644 | 0 | strmap_for_each_entry(cached_pairs, &iter, entry) { |
645 | 0 | const char *old_name = entry->key; |
646 | 0 | const char *new_name = entry->value; |
647 | 0 | if (!new_name) |
648 | | /* known delete; ignore it */ |
649 | 0 | continue; |
650 | | |
651 | 0 | update_dir_rename_counts(info, dirs_removed, old_name, new_name); |
652 | 0 | } |
653 | | |
654 | | /* |
655 | | * Now we collapse |
656 | | * dir_rename_count: old_directory -> {new_directory -> count} |
657 | | * down to |
658 | | * dir_rename_guess: old_directory -> best_new_directory |
659 | | * where best_new_directory is the one with the highest count. |
660 | | */ |
661 | 0 | strmap_for_each_entry(info->dir_rename_count, &iter, entry) { |
662 | | /* entry->key is source_dir */ |
663 | 0 | struct strintmap *counts = entry->value; |
664 | 0 | char *best_newdir; |
665 | |
|
666 | 0 | best_newdir = xstrdup(get_highest_rename_path(counts)); |
667 | 0 | strmap_put(&info->dir_rename_guess, entry->key, |
668 | 0 | best_newdir); |
669 | 0 | } |
670 | 0 | } |
671 | | |
672 | | void partial_clear_dir_rename_count(struct strmap *dir_rename_count) |
673 | 0 | { |
674 | 0 | struct hashmap_iter iter; |
675 | 0 | struct strmap_entry *entry; |
676 | |
|
677 | 0 | strmap_for_each_entry(dir_rename_count, &iter, entry) { |
678 | 0 | struct strintmap *counts = entry->value; |
679 | 0 | strintmap_clear(counts); |
680 | 0 | } |
681 | 0 | strmap_partial_clear(dir_rename_count, 1); |
682 | 0 | } |
683 | | |
684 | | static void cleanup_dir_rename_info(struct dir_rename_info *info, |
685 | | struct strintmap *dirs_removed, |
686 | | int keep_dir_rename_count) |
687 | 0 | { |
688 | 0 | struct hashmap_iter iter; |
689 | 0 | struct strmap_entry *entry; |
690 | 0 | struct string_list to_remove = STRING_LIST_INIT_NODUP; |
691 | 0 | int i; |
692 | |
|
693 | 0 | if (!info->setup) |
694 | 0 | return; |
695 | | |
696 | | /* idx_map */ |
697 | 0 | strintmap_clear(&info->idx_map); |
698 | | |
699 | | /* dir_rename_guess */ |
700 | 0 | strmap_clear(&info->dir_rename_guess, 1); |
701 | | |
702 | | /* relevant_source_dirs */ |
703 | 0 | if (info->relevant_source_dirs && |
704 | 0 | info->relevant_source_dirs != dirs_removed) { |
705 | 0 | strintmap_clear(info->relevant_source_dirs); |
706 | 0 | FREE_AND_NULL(info->relevant_source_dirs); |
707 | 0 | } |
708 | | |
709 | | /* dir_rename_count */ |
710 | 0 | if (!keep_dir_rename_count) { |
711 | 0 | partial_clear_dir_rename_count(info->dir_rename_count); |
712 | 0 | strmap_clear(info->dir_rename_count, 1); |
713 | 0 | FREE_AND_NULL(info->dir_rename_count); |
714 | 0 | return; |
715 | 0 | } |
716 | | |
717 | | /* |
718 | | * Although dir_rename_count was passed in |
719 | | * diffcore_rename_extended() and we want to keep it around and |
720 | | * return it to that caller, we first want to remove any counts in |
721 | | * the maps associated with UNKNOWN_DIR entries and any data |
722 | | * associated with directories that weren't renamed. |
723 | | */ |
724 | 0 | strmap_for_each_entry(info->dir_rename_count, &iter, entry) { |
725 | 0 | const char *source_dir = entry->key; |
726 | 0 | struct strintmap *counts = entry->value; |
727 | |
|
728 | 0 | if (!strintmap_get(dirs_removed, source_dir)) { |
729 | 0 | string_list_append(&to_remove, source_dir); |
730 | 0 | strintmap_clear(counts); |
731 | 0 | continue; |
732 | 0 | } |
733 | | |
734 | 0 | if (strintmap_contains(counts, UNKNOWN_DIR)) |
735 | 0 | strintmap_remove(counts, UNKNOWN_DIR); |
736 | 0 | } |
737 | 0 | for (i = 0; i < to_remove.nr; ++i) |
738 | 0 | strmap_remove(info->dir_rename_count, |
739 | 0 | to_remove.items[i].string, 1); |
740 | 0 | string_list_clear(&to_remove, 0); |
741 | 0 | } |
742 | | |
743 | | static const char *get_basename(const char *filename) |
744 | 0 | { |
745 | | /* |
746 | | * gitbasename() has to worry about special drives, multiple |
747 | | * directory separator characters, trailing slashes, NULL or |
748 | | * empty strings, etc. We only work on filenames as stored in |
749 | | * git, and thus get to ignore all those complications. |
750 | | */ |
751 | 0 | const char *base = strrchr(filename, '/'); |
752 | 0 | return base ? base + 1 : filename; |
753 | 0 | } |
754 | | |
755 | | static int idx_possible_rename(char *filename, struct dir_rename_info *info) |
756 | 0 | { |
757 | | /* |
758 | | * Our comparison of files with the same basename (see |
759 | | * find_basename_matches() below), is only helpful when after exact |
760 | | * rename detection we have exactly one file with a given basename |
761 | | * among the rename sources and also only exactly one file with |
762 | | * that basename among the rename destinations. When we have |
763 | | * multiple files with the same basename in either set, we do not |
764 | | * know which to compare against. However, there are some |
765 | | * filenames that occur in large numbers (particularly |
766 | | * build-related filenames such as 'Makefile', '.gitignore', or |
767 | | * 'build.gradle' that potentially exist within every single |
768 | | * subdirectory), and for performance we want to be able to quickly |
769 | | * find renames for these files too. |
770 | | * |
771 | | * The reason basename comparisons are a useful heuristic was that it |
772 | | * is common for people to move files across directories while keeping |
773 | | * their filename the same. If we had a way of determining or even |
774 | | * making a good educated guess about which directory these non-unique |
775 | | * basename files had moved the file to, we could check it. |
776 | | * Luckily... |
777 | | * |
778 | | * When an entire directory is in fact renamed, we have two factors |
779 | | * helping us out: |
780 | | * (a) the original directory disappeared giving us a hint |
781 | | * about when we can apply an extra heuristic. |
782 | | * (a) we often have several files within that directory and |
783 | | * subdirectories that are renamed without changes |
784 | | * So, rules for a heuristic: |
785 | | * (0) If there basename matches are non-unique (the condition under |
786 | | * which this function is called) AND |
787 | | * (1) the directory in which the file was found has disappeared |
788 | | * (i.e. dirs_removed is non-NULL and has a relevant entry) THEN |
789 | | * (2) use exact renames of files within the directory to determine |
790 | | * where the directory is likely to have been renamed to. IF |
791 | | * there is at least one exact rename from within that |
792 | | * directory, we can proceed. |
793 | | * (3) If there are multiple places the directory could have been |
794 | | * renamed to based on exact renames, ignore all but one of them. |
795 | | * Just use the destination with the most renames going to it. |
796 | | * (4) Check if applying that directory rename to the original file |
797 | | * would result in a destination filename that is in the |
798 | | * potential rename set. If so, return the index of the |
799 | | * destination file (the index within rename_dst). |
800 | | * (5) Compare the original file and returned destination for |
801 | | * similarity, and if they are sufficiently similar, record the |
802 | | * rename. |
803 | | * |
804 | | * This function, idx_possible_rename(), is only responsible for (4). |
805 | | * The conditions/steps in (1)-(3) are handled via setting up |
806 | | * dir_rename_count and dir_rename_guess in |
807 | | * initialize_dir_rename_info(). Steps (0) and (5) are handled by |
808 | | * the caller of this function. |
809 | | */ |
810 | 0 | char *old_dir, *new_dir; |
811 | 0 | struct strbuf new_path = STRBUF_INIT; |
812 | 0 | int idx; |
813 | |
|
814 | 0 | if (!info->setup) |
815 | 0 | return -1; |
816 | | |
817 | 0 | old_dir = get_dirname(filename); |
818 | 0 | new_dir = strmap_get(&info->dir_rename_guess, old_dir); |
819 | 0 | free(old_dir); |
820 | 0 | if (!new_dir) |
821 | 0 | return -1; |
822 | | |
823 | 0 | strbuf_addstr(&new_path, new_dir); |
824 | 0 | strbuf_addch(&new_path, '/'); |
825 | 0 | strbuf_addstr(&new_path, get_basename(filename)); |
826 | |
|
827 | 0 | idx = strintmap_get(&info->idx_map, new_path.buf); |
828 | 0 | strbuf_release(&new_path); |
829 | 0 | return idx; |
830 | 0 | } |
831 | | |
832 | | struct basename_prefetch_options { |
833 | | struct repository *repo; |
834 | | struct strintmap *relevant_sources; |
835 | | struct strintmap *sources; |
836 | | struct strintmap *dests; |
837 | | struct dir_rename_info *info; |
838 | | }; |
839 | | static void basename_prefetch(void *prefetch_options) |
840 | 0 | { |
841 | 0 | struct basename_prefetch_options *options = prefetch_options; |
842 | 0 | struct strintmap *relevant_sources = options->relevant_sources; |
843 | 0 | struct strintmap *sources = options->sources; |
844 | 0 | struct strintmap *dests = options->dests; |
845 | 0 | struct dir_rename_info *info = options->info; |
846 | 0 | int i; |
847 | 0 | struct oid_array to_fetch = OID_ARRAY_INIT; |
848 | | |
849 | | /* |
850 | | * TODO: The following loops mirror the code/logic from |
851 | | * find_basename_matches(), though not quite exactly. Maybe |
852 | | * abstract the iteration logic out somehow? |
853 | | */ |
854 | 0 | for (i = 0; i < rename_src_nr; ++i) { |
855 | 0 | char *filename = rename_src[i].p->one->path; |
856 | 0 | const char *base = NULL; |
857 | 0 | intptr_t src_index; |
858 | 0 | intptr_t dst_index; |
859 | | |
860 | | /* Skip irrelevant sources */ |
861 | 0 | if (relevant_sources && |
862 | 0 | !strintmap_contains(relevant_sources, filename)) |
863 | 0 | continue; |
864 | | |
865 | | /* |
866 | | * If the basename is unique among remaining sources, then |
867 | | * src_index will equal 'i' and we can attempt to match it |
868 | | * to a unique basename in the destinations. Otherwise, |
869 | | * use directory rename heuristics, if possible. |
870 | | */ |
871 | 0 | base = get_basename(filename); |
872 | 0 | src_index = strintmap_get(sources, base); |
873 | 0 | assert(src_index == -1 || src_index == i); |
874 | | |
875 | 0 | if (strintmap_contains(dests, base)) { |
876 | 0 | struct diff_filespec *one, *two; |
877 | | |
878 | | /* Find a matching destination, if possible */ |
879 | 0 | dst_index = strintmap_get(dests, base); |
880 | 0 | if (src_index == -1 || dst_index == -1) { |
881 | 0 | src_index = i; |
882 | 0 | dst_index = idx_possible_rename(filename, info); |
883 | 0 | } |
884 | 0 | if (dst_index == -1) |
885 | 0 | continue; |
886 | | |
887 | | /* Ignore this dest if already used in a rename */ |
888 | 0 | if (rename_dst[dst_index].is_rename) |
889 | 0 | continue; /* already used previously */ |
890 | | |
891 | 0 | one = rename_src[src_index].p->one; |
892 | 0 | two = rename_dst[dst_index].p->two; |
893 | | |
894 | | /* Add the pairs */ |
895 | 0 | diff_add_if_missing(options->repo, &to_fetch, two); |
896 | 0 | diff_add_if_missing(options->repo, &to_fetch, one); |
897 | 0 | } |
898 | 0 | } |
899 | | |
900 | 0 | promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr); |
901 | 0 | oid_array_clear(&to_fetch); |
902 | 0 | } |
903 | | |
904 | | static int find_basename_matches(struct diff_options *options, |
905 | | int minimum_score, |
906 | | struct dir_rename_info *info, |
907 | | struct strintmap *relevant_sources, |
908 | | struct strintmap *dirs_removed) |
909 | 0 | { |
910 | | /* |
911 | | * When I checked in early 2020, over 76% of file renames in linux |
912 | | * just moved files to a different directory but kept the same |
913 | | * basename. gcc did that with over 64% of renames, gecko did it |
914 | | * with over 79%, and WebKit did it with over 89%. |
915 | | * |
916 | | * Therefore we can bypass the normal exhaustive NxM matrix |
917 | | * comparison of similarities between all potential rename sources |
918 | | * and destinations by instead using file basename as a hint (i.e. |
919 | | * the portion of the filename after the last '/'), checking for |
920 | | * similarity between files with the same basename, and if we find |
921 | | * a pair that are sufficiently similar, record the rename pair and |
922 | | * exclude those two from the NxM matrix. |
923 | | * |
924 | | * This *might* cause us to find a less than optimal pairing (if |
925 | | * there is another file that we are even more similar to but has a |
926 | | * different basename). Given the huge performance advantage |
927 | | * basename matching provides, and given the frequency with which |
928 | | * people use the same basename in real world projects, that's a |
929 | | * trade-off we are willing to accept when doing just rename |
930 | | * detection. |
931 | | * |
932 | | * If someone wants copy detection that implies they are willing to |
933 | | * spend more cycles to find similarities between files, so it may |
934 | | * be less likely that this heuristic is wanted. If someone is |
935 | | * doing break detection, that means they do not want filename |
936 | | * similarity to imply any form of content similiarity, and thus |
937 | | * this heuristic would definitely be incompatible. |
938 | | */ |
939 | |
|
940 | 0 | int i, renames = 0; |
941 | 0 | struct strintmap sources; |
942 | 0 | struct strintmap dests; |
943 | 0 | struct diff_populate_filespec_options dpf_options = { |
944 | 0 | .check_binary = 0, |
945 | 0 | .missing_object_cb = NULL, |
946 | 0 | .missing_object_data = NULL |
947 | 0 | }; |
948 | 0 | struct basename_prefetch_options prefetch_options = { |
949 | 0 | .repo = options->repo, |
950 | 0 | .relevant_sources = relevant_sources, |
951 | 0 | .sources = &sources, |
952 | 0 | .dests = &dests, |
953 | 0 | .info = info |
954 | 0 | }; |
955 | | |
956 | | /* |
957 | | * Create maps of basename -> fullname(s) for remaining sources and |
958 | | * dests. |
959 | | */ |
960 | 0 | strintmap_init_with_options(&sources, -1, NULL, 0); |
961 | 0 | strintmap_init_with_options(&dests, -1, NULL, 0); |
962 | 0 | for (i = 0; i < rename_src_nr; ++i) { |
963 | 0 | char *filename = rename_src[i].p->one->path; |
964 | 0 | const char *base; |
965 | | |
966 | | /* exact renames removed in remove_unneeded_paths_from_src() */ |
967 | 0 | assert(!rename_src[i].p->one->rename_used); |
968 | | |
969 | | /* Record index within rename_src (i) if basename is unique */ |
970 | 0 | base = get_basename(filename); |
971 | 0 | if (strintmap_contains(&sources, base)) |
972 | 0 | strintmap_set(&sources, base, -1); |
973 | 0 | else |
974 | 0 | strintmap_set(&sources, base, i); |
975 | 0 | } |
976 | 0 | for (i = 0; i < rename_dst_nr; ++i) { |
977 | 0 | char *filename = rename_dst[i].p->two->path; |
978 | 0 | const char *base; |
979 | |
|
980 | 0 | if (rename_dst[i].is_rename) |
981 | 0 | continue; /* involved in exact match already. */ |
982 | | |
983 | | /* Record index within rename_dst (i) if basename is unique */ |
984 | 0 | base = get_basename(filename); |
985 | 0 | if (strintmap_contains(&dests, base)) |
986 | 0 | strintmap_set(&dests, base, -1); |
987 | 0 | else |
988 | 0 | strintmap_set(&dests, base, i); |
989 | 0 | } |
990 | |
|
991 | 0 | if (options->repo == the_repository && repo_has_promisor_remote(the_repository)) { |
992 | 0 | dpf_options.missing_object_cb = basename_prefetch; |
993 | 0 | dpf_options.missing_object_data = &prefetch_options; |
994 | 0 | } |
995 | | |
996 | | /* Now look for basename matchups and do similarity estimation */ |
997 | 0 | for (i = 0; i < rename_src_nr; ++i) { |
998 | 0 | char *filename = rename_src[i].p->one->path; |
999 | 0 | const char *base = NULL; |
1000 | 0 | intptr_t src_index; |
1001 | 0 | intptr_t dst_index; |
1002 | | |
1003 | | /* Skip irrelevant sources */ |
1004 | 0 | if (relevant_sources && |
1005 | 0 | !strintmap_contains(relevant_sources, filename)) |
1006 | 0 | continue; |
1007 | | |
1008 | | /* |
1009 | | * If the basename is unique among remaining sources, then |
1010 | | * src_index will equal 'i' and we can attempt to match it |
1011 | | * to a unique basename in the destinations. Otherwise, |
1012 | | * use directory rename heuristics, if possible. |
1013 | | */ |
1014 | 0 | base = get_basename(filename); |
1015 | 0 | src_index = strintmap_get(&sources, base); |
1016 | 0 | assert(src_index == -1 || src_index == i); |
1017 | | |
1018 | 0 | if (strintmap_contains(&dests, base)) { |
1019 | 0 | struct diff_filespec *one, *two; |
1020 | 0 | int score; |
1021 | | |
1022 | | /* Find a matching destination, if possible */ |
1023 | 0 | dst_index = strintmap_get(&dests, base); |
1024 | 0 | if (src_index == -1 || dst_index == -1) { |
1025 | 0 | src_index = i; |
1026 | 0 | dst_index = idx_possible_rename(filename, info); |
1027 | 0 | } |
1028 | 0 | if (dst_index == -1) |
1029 | 0 | continue; |
1030 | | |
1031 | | /* Ignore this dest if already used in a rename */ |
1032 | 0 | if (rename_dst[dst_index].is_rename) |
1033 | 0 | continue; /* already used previously */ |
1034 | | |
1035 | | /* Estimate the similarity */ |
1036 | 0 | one = rename_src[src_index].p->one; |
1037 | 0 | two = rename_dst[dst_index].p->two; |
1038 | 0 | score = estimate_similarity(options->repo, one, two, |
1039 | 0 | minimum_score, &dpf_options); |
1040 | | |
1041 | | /* If sufficiently similar, record as rename pair */ |
1042 | 0 | if (score < minimum_score) |
1043 | 0 | continue; |
1044 | 0 | record_rename_pair(dst_index, src_index, score); |
1045 | 0 | renames++; |
1046 | 0 | update_dir_rename_counts(info, dirs_removed, |
1047 | 0 | one->path, two->path); |
1048 | | |
1049 | | /* |
1050 | | * Found a rename so don't need text anymore; if we |
1051 | | * didn't find a rename, the filespec_blob would get |
1052 | | * re-used when doing the matrix of comparisons. |
1053 | | */ |
1054 | 0 | diff_free_filespec_blob(one); |
1055 | 0 | diff_free_filespec_blob(two); |
1056 | 0 | } |
1057 | 0 | } |
1058 | | |
1059 | 0 | strintmap_clear(&sources); |
1060 | 0 | strintmap_clear(&dests); |
1061 | |
|
1062 | 0 | return renames; |
1063 | 0 | } |
1064 | | |
1065 | 0 | #define NUM_CANDIDATE_PER_DST 4 |
1066 | | static void record_if_better(struct diff_score m[], struct diff_score *o) |
1067 | 0 | { |
1068 | 0 | int i, worst; |
1069 | | |
1070 | | /* find the worst one */ |
1071 | 0 | worst = 0; |
1072 | 0 | for (i = 1; i < NUM_CANDIDATE_PER_DST; i++) |
1073 | 0 | if (score_compare(&m[i], &m[worst]) > 0) |
1074 | 0 | worst = i; |
1075 | | |
1076 | | /* is it better than the worst one? */ |
1077 | 0 | if (score_compare(&m[worst], o) > 0) |
1078 | 0 | m[worst] = *o; |
1079 | 0 | } |
1080 | | |
1081 | | /* |
1082 | | * Returns: |
1083 | | * 0 if we are under the limit; |
1084 | | * 1 if we need to disable inexact rename detection; |
1085 | | * 2 if we would be under the limit if we were given -C instead of -C -C. |
1086 | | */ |
1087 | | static int too_many_rename_candidates(int num_destinations, int num_sources, |
1088 | | struct diff_options *options) |
1089 | 0 | { |
1090 | 0 | int rename_limit = options->rename_limit; |
1091 | 0 | int i, limited_sources; |
1092 | |
|
1093 | 0 | options->needed_rename_limit = 0; |
1094 | | |
1095 | | /* |
1096 | | * This basically does a test for the rename matrix not |
1097 | | * growing larger than a "rename_limit" square matrix, ie: |
1098 | | * |
1099 | | * num_destinations * num_sources > rename_limit * rename_limit |
1100 | | * |
1101 | | * We use st_mult() to check overflow conditions; in the |
1102 | | * exceptional circumstance that size_t isn't large enough to hold |
1103 | | * the multiplication, the system won't be able to allocate enough |
1104 | | * memory for the matrix anyway. |
1105 | | */ |
1106 | 0 | if (rename_limit <= 0) |
1107 | 0 | return 0; /* treat as unlimited */ |
1108 | 0 | if (st_mult(num_destinations, num_sources) |
1109 | 0 | <= st_mult(rename_limit, rename_limit)) |
1110 | 0 | return 0; |
1111 | | |
1112 | 0 | options->needed_rename_limit = |
1113 | 0 | num_sources > num_destinations ? num_sources : num_destinations; |
1114 | | |
1115 | | /* Are we running under -C -C? */ |
1116 | 0 | if (!options->flags.find_copies_harder) |
1117 | 0 | return 1; |
1118 | | |
1119 | | /* Would we bust the limit if we were running under -C? */ |
1120 | 0 | for (limited_sources = i = 0; i < num_sources; i++) { |
1121 | 0 | if (diff_unmodified_pair(rename_src[i].p)) |
1122 | 0 | continue; |
1123 | 0 | limited_sources++; |
1124 | 0 | } |
1125 | 0 | if (st_mult(num_destinations, limited_sources) |
1126 | 0 | <= st_mult(rename_limit, rename_limit)) |
1127 | 0 | return 2; |
1128 | 0 | return 1; |
1129 | 0 | } |
1130 | | |
1131 | | static int find_renames(struct diff_score *mx, |
1132 | | int dst_cnt, |
1133 | | int minimum_score, |
1134 | | int copies, |
1135 | | struct dir_rename_info *info, |
1136 | | struct strintmap *dirs_removed) |
1137 | 0 | { |
1138 | 0 | int count = 0, i; |
1139 | |
|
1140 | 0 | for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { |
1141 | 0 | struct diff_rename_dst *dst; |
1142 | |
|
1143 | 0 | if ((mx[i].dst < 0) || |
1144 | 0 | (mx[i].score < minimum_score)) |
1145 | 0 | break; /* there is no more usable pair. */ |
1146 | 0 | dst = &rename_dst[mx[i].dst]; |
1147 | 0 | if (dst->is_rename) |
1148 | 0 | continue; /* already done, either exact or fuzzy. */ |
1149 | 0 | if (!copies && rename_src[mx[i].src].p->one->rename_used) |
1150 | 0 | continue; |
1151 | 0 | record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); |
1152 | 0 | count++; |
1153 | 0 | update_dir_rename_counts(info, dirs_removed, |
1154 | 0 | rename_src[mx[i].src].p->one->path, |
1155 | 0 | rename_dst[mx[i].dst].p->two->path); |
1156 | 0 | } |
1157 | 0 | return count; |
1158 | 0 | } |
1159 | | |
1160 | | static void remove_unneeded_paths_from_src(int detecting_copies, |
1161 | | struct strintmap *interesting) |
1162 | 0 | { |
1163 | 0 | int i, new_num_src; |
1164 | |
|
1165 | 0 | if (detecting_copies && !interesting) |
1166 | 0 | return; /* nothing to remove */ |
1167 | 0 | if (break_idx) |
1168 | 0 | return; /* culling incompatible with break detection */ |
1169 | | |
1170 | | /* |
1171 | | * Note on reasons why we cull unneeded sources but not destinations: |
1172 | | * 1) Pairings are stored in rename_dst (not rename_src), which we |
1173 | | * need to keep around. So, we just can't cull rename_dst even |
1174 | | * if we wanted to. But doing so wouldn't help because... |
1175 | | * |
1176 | | * 2) There is a matrix pairwise comparison that follows the |
1177 | | * "Performing inexact rename detection" progress message. |
1178 | | * Iterating over the destinations is done in the outer loop, |
1179 | | * hence we only iterate over each of those once and we can |
1180 | | * easily skip the outer loop early if the destination isn't |
1181 | | * relevant. That's only one check per destination path to |
1182 | | * skip. |
1183 | | * |
1184 | | * By contrast, the sources are iterated in the inner loop; if |
1185 | | * we check whether a source can be skipped, then we'll be |
1186 | | * checking it N separate times, once for each destination. |
1187 | | * We don't want to have to iterate over known-not-needed |
1188 | | * sources N times each, so avoid that by removing the sources |
1189 | | * from rename_src here. |
1190 | | */ |
1191 | 0 | for (i = 0, new_num_src = 0; i < rename_src_nr; i++) { |
1192 | 0 | struct diff_filespec *one = rename_src[i].p->one; |
1193 | | |
1194 | | /* |
1195 | | * renames are stored in rename_dst, so if a rename has |
1196 | | * already been detected using this source, we can just |
1197 | | * remove the source knowing rename_dst has its info. |
1198 | | */ |
1199 | 0 | if (!detecting_copies && one->rename_used) |
1200 | 0 | continue; |
1201 | | |
1202 | | /* If we don't care about the source path, skip it */ |
1203 | 0 | if (interesting && !strintmap_contains(interesting, one->path)) |
1204 | 0 | continue; |
1205 | | |
1206 | 0 | if (new_num_src < i) |
1207 | 0 | memcpy(&rename_src[new_num_src], &rename_src[i], |
1208 | 0 | sizeof(struct diff_rename_src)); |
1209 | 0 | new_num_src++; |
1210 | 0 | } |
1211 | |
|
1212 | 0 | rename_src_nr = new_num_src; |
1213 | 0 | } |
1214 | | |
1215 | | static void handle_early_known_dir_renames(struct dir_rename_info *info, |
1216 | | struct strintmap *relevant_sources, |
1217 | | struct strintmap *dirs_removed) |
1218 | 0 | { |
1219 | | /* |
1220 | | * Directory renames are determined via an aggregate of all renames |
1221 | | * under them and using a "majority wins" rule. The fact that |
1222 | | * "majority wins", though, means we don't need all the renames |
1223 | | * under the given directory, we only need enough to ensure we have |
1224 | | * a majority. |
1225 | | */ |
1226 | |
|
1227 | 0 | int i, new_num_src; |
1228 | 0 | struct hashmap_iter iter; |
1229 | 0 | struct strmap_entry *entry; |
1230 | |
|
1231 | 0 | if (!dirs_removed || !relevant_sources) |
1232 | 0 | return; /* nothing to cull */ |
1233 | 0 | if (break_idx) |
1234 | 0 | return; /* culling incompatbile with break detection */ |
1235 | | |
1236 | | /* |
1237 | | * Supplement dir_rename_count with number of potential renames, |
1238 | | * marking all potential rename sources as mapping to UNKNOWN_DIR. |
1239 | | */ |
1240 | 0 | for (i = 0; i < rename_src_nr; i++) { |
1241 | 0 | char *old_dir; |
1242 | 0 | struct diff_filespec *one = rename_src[i].p->one; |
1243 | | |
1244 | | /* |
1245 | | * sources that are part of a rename will have already been |
1246 | | * removed by a prior call to remove_unneeded_paths_from_src() |
1247 | | */ |
1248 | 0 | assert(!one->rename_used); |
1249 | | |
1250 | 0 | old_dir = get_dirname(one->path); |
1251 | 0 | while (*old_dir != '\0' && |
1252 | 0 | NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) { |
1253 | 0 | char *freeme = old_dir; |
1254 | |
|
1255 | 0 | increment_count(info, old_dir, UNKNOWN_DIR); |
1256 | 0 | old_dir = get_dirname(old_dir); |
1257 | | |
1258 | | /* Free resources we don't need anymore */ |
1259 | 0 | free(freeme); |
1260 | 0 | } |
1261 | | /* |
1262 | | * old_dir and new_dir free'd in increment_count, but |
1263 | | * get_dirname() gives us a new pointer we need to free for |
1264 | | * old_dir. Also, if the loop runs 0 times we need old_dir |
1265 | | * to be freed. |
1266 | | */ |
1267 | 0 | free(old_dir); |
1268 | 0 | } |
1269 | | |
1270 | | /* |
1271 | | * For any directory which we need a potential rename detected for |
1272 | | * (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check |
1273 | | * whether we have enough renames to satisfy the "majority rules" |
1274 | | * requirement such that detecting any more renames of files under |
1275 | | * it won't change the result. For any such directory, mark that |
1276 | | * we no longer need to detect a rename for it. However, since we |
1277 | | * might need to still detect renames for an ancestor of that |
1278 | | * directory, use RELEVANT_FOR_ANCESTOR. |
1279 | | */ |
1280 | 0 | strmap_for_each_entry(info->dir_rename_count, &iter, entry) { |
1281 | | /* entry->key is source_dir */ |
1282 | 0 | struct strintmap *counts = entry->value; |
1283 | |
|
1284 | 0 | if (strintmap_get(dirs_removed, entry->key) == |
1285 | 0 | RELEVANT_FOR_SELF && |
1286 | 0 | dir_rename_already_determinable(counts)) { |
1287 | 0 | strintmap_set(dirs_removed, entry->key, |
1288 | 0 | RELEVANT_FOR_ANCESTOR); |
1289 | 0 | } |
1290 | 0 | } |
1291 | |
|
1292 | 0 | for (i = 0, new_num_src = 0; i < rename_src_nr; i++) { |
1293 | 0 | struct diff_filespec *one = rename_src[i].p->one; |
1294 | 0 | int val; |
1295 | |
|
1296 | 0 | val = strintmap_get(relevant_sources, one->path); |
1297 | | |
1298 | | /* |
1299 | | * sources that were not found in relevant_sources should |
1300 | | * have already been removed by a prior call to |
1301 | | * remove_unneeded_paths_from_src() |
1302 | | */ |
1303 | 0 | assert(val != -1); |
1304 | | |
1305 | 0 | if (val == RELEVANT_LOCATION) { |
1306 | 0 | int removable = 1; |
1307 | 0 | char *dir = get_dirname(one->path); |
1308 | 0 | while (1) { |
1309 | 0 | char *freeme = dir; |
1310 | 0 | int res = strintmap_get(dirs_removed, dir); |
1311 | | |
1312 | | /* Quit if not found or irrelevant */ |
1313 | 0 | if (res == NOT_RELEVANT) |
1314 | 0 | break; |
1315 | | /* If RELEVANT_FOR_SELF, can't remove */ |
1316 | 0 | if (res == RELEVANT_FOR_SELF) { |
1317 | 0 | removable = 0; |
1318 | 0 | break; |
1319 | 0 | } |
1320 | | /* Else continue searching upwards */ |
1321 | 0 | assert(res == RELEVANT_FOR_ANCESTOR); |
1322 | 0 | dir = get_dirname(dir); |
1323 | 0 | free(freeme); |
1324 | 0 | } |
1325 | 0 | free(dir); |
1326 | 0 | if (removable) { |
1327 | 0 | strintmap_set(relevant_sources, one->path, |
1328 | 0 | RELEVANT_NO_MORE); |
1329 | 0 | continue; |
1330 | 0 | } |
1331 | 0 | } |
1332 | | |
1333 | 0 | if (new_num_src < i) |
1334 | 0 | memcpy(&rename_src[new_num_src], &rename_src[i], |
1335 | 0 | sizeof(struct diff_rename_src)); |
1336 | 0 | new_num_src++; |
1337 | 0 | } |
1338 | | |
1339 | 0 | rename_src_nr = new_num_src; |
1340 | 0 | } |
1341 | | |
1342 | | static void free_filespec_data(struct diff_filespec *spec) |
1343 | 0 | { |
1344 | 0 | if (!--spec->count) |
1345 | 0 | diff_free_filespec_data(spec); |
1346 | 0 | } |
1347 | | |
1348 | | static void pool_free_filespec(struct mem_pool *pool, |
1349 | | struct diff_filespec *spec) |
1350 | 0 | { |
1351 | 0 | if (!pool) { |
1352 | 0 | free_filespec(spec); |
1353 | 0 | return; |
1354 | 0 | } |
1355 | | |
1356 | | /* |
1357 | | * Similar to free_filespec(), but only frees the data. The spec |
1358 | | * itself was allocated in the pool and should not be individually |
1359 | | * freed. |
1360 | | */ |
1361 | 0 | free_filespec_data(spec); |
1362 | 0 | } |
1363 | | |
1364 | | void pool_diff_free_filepair(struct mem_pool *pool, |
1365 | | struct diff_filepair *p) |
1366 | 0 | { |
1367 | 0 | if (!pool) { |
1368 | 0 | diff_free_filepair(p); |
1369 | 0 | return; |
1370 | 0 | } |
1371 | | |
1372 | | /* |
1373 | | * Similar to diff_free_filepair() but only frees the data from the |
1374 | | * filespecs; not the filespecs or the filepair which were |
1375 | | * allocated from the pool. |
1376 | | */ |
1377 | 0 | free_filespec_data(p->one); |
1378 | 0 | free_filespec_data(p->two); |
1379 | 0 | } |
1380 | | |
1381 | | void diffcore_rename_extended(struct diff_options *options, |
1382 | | struct mem_pool *pool, |
1383 | | struct strintmap *relevant_sources, |
1384 | | struct strintmap *dirs_removed, |
1385 | | struct strmap *dir_rename_count, |
1386 | | struct strmap *cached_pairs) |
1387 | 0 | { |
1388 | 0 | int detect_rename = options->detect_rename; |
1389 | 0 | int minimum_score = options->rename_score; |
1390 | 0 | struct diff_queue_struct *q = &diff_queued_diff; |
1391 | 0 | struct diff_queue_struct outq; |
1392 | 0 | struct diff_score *mx; |
1393 | 0 | int i, j, rename_count, skip_unmodified = 0; |
1394 | 0 | int num_destinations, dst_cnt; |
1395 | 0 | int num_sources, want_copies; |
1396 | 0 | struct progress *progress = NULL; |
1397 | 0 | struct mem_pool local_pool; |
1398 | 0 | struct dir_rename_info info; |
1399 | 0 | struct diff_populate_filespec_options dpf_options = { |
1400 | 0 | .check_binary = 0, |
1401 | 0 | .missing_object_cb = NULL, |
1402 | 0 | .missing_object_data = NULL |
1403 | 0 | }; |
1404 | 0 | struct inexact_prefetch_options prefetch_options = { |
1405 | 0 | .repo = options->repo |
1406 | 0 | }; |
1407 | |
|
1408 | 0 | trace2_region_enter("diff", "setup", options->repo); |
1409 | 0 | info.setup = 0; |
1410 | 0 | assert(!dir_rename_count || strmap_empty(dir_rename_count)); |
1411 | 0 | want_copies = (detect_rename == DIFF_DETECT_COPY); |
1412 | 0 | if (dirs_removed && (break_idx || want_copies)) |
1413 | 0 | BUG("dirs_removed incompatible with break/copy detection"); |
1414 | 0 | if (break_idx && relevant_sources) |
1415 | 0 | BUG("break detection incompatible with source specification"); |
1416 | 0 | if (!minimum_score) |
1417 | 0 | minimum_score = DEFAULT_RENAME_SCORE; |
1418 | |
|
1419 | 0 | for (i = 0; i < q->nr; i++) { |
1420 | 0 | struct diff_filepair *p = q->queue[i]; |
1421 | 0 | if (!DIFF_FILE_VALID(p->one)) { |
1422 | 0 | if (!DIFF_FILE_VALID(p->two)) |
1423 | 0 | continue; /* unmerged */ |
1424 | 0 | else if (options->single_follow && |
1425 | 0 | strcmp(options->single_follow, p->two->path)) |
1426 | 0 | continue; /* not interested */ |
1427 | 0 | else if (!options->flags.rename_empty && |
1428 | 0 | is_empty_blob_oid(&p->two->oid, the_repository->hash_algo)) |
1429 | 0 | continue; |
1430 | 0 | else if (add_rename_dst(p) < 0) { |
1431 | 0 | warning("skipping rename detection, detected" |
1432 | 0 | " duplicate destination '%s'", |
1433 | 0 | p->two->path); |
1434 | 0 | goto cleanup; |
1435 | 0 | } |
1436 | 0 | } |
1437 | 0 | else if (!options->flags.rename_empty && |
1438 | 0 | is_empty_blob_oid(&p->one->oid, the_repository->hash_algo)) |
1439 | 0 | continue; |
1440 | 0 | else if (!DIFF_PAIR_UNMERGED(p) && !DIFF_FILE_VALID(p->two)) { |
1441 | | /* |
1442 | | * If the source is a broken "delete", and |
1443 | | * they did not really want to get broken, |
1444 | | * that means the source actually stays. |
1445 | | * So we increment the "rename_used" score |
1446 | | * by one, to indicate ourselves as a user |
1447 | | */ |
1448 | 0 | if (p->broken_pair && !p->score) |
1449 | 0 | p->one->rename_used++; |
1450 | 0 | register_rename_src(p); |
1451 | 0 | } |
1452 | 0 | else if (want_copies) { |
1453 | | /* |
1454 | | * Increment the "rename_used" score by |
1455 | | * one, to indicate ourselves as a user. |
1456 | | */ |
1457 | 0 | p->one->rename_used++; |
1458 | 0 | register_rename_src(p); |
1459 | 0 | } |
1460 | 0 | } |
1461 | 0 | trace2_region_leave("diff", "setup", options->repo); |
1462 | 0 | if (rename_dst_nr == 0 || rename_src_nr == 0) |
1463 | 0 | goto cleanup; /* nothing to do */ |
1464 | | |
1465 | 0 | trace2_region_enter("diff", "exact renames", options->repo); |
1466 | 0 | mem_pool_init(&local_pool, 32*1024); |
1467 | | /* |
1468 | | * We really want to cull the candidates list early |
1469 | | * with cheap tests in order to avoid doing deltas. |
1470 | | */ |
1471 | 0 | rename_count = find_exact_renames(options, &local_pool); |
1472 | | /* |
1473 | | * Discard local_pool immediately instead of at "cleanup:" in order |
1474 | | * to reduce maximum memory usage; inexact rename detection uses up |
1475 | | * a fair amount of memory, and mem_pools can too. |
1476 | | */ |
1477 | 0 | mem_pool_discard(&local_pool, 0); |
1478 | 0 | trace2_region_leave("diff", "exact renames", options->repo); |
1479 | | |
1480 | | /* Did we only want exact renames? */ |
1481 | 0 | if (minimum_score == MAX_SCORE) |
1482 | 0 | goto cleanup; |
1483 | | |
1484 | 0 | num_sources = rename_src_nr; |
1485 | |
|
1486 | 0 | if (want_copies || break_idx) { |
1487 | | /* |
1488 | | * Cull sources: |
1489 | | * - remove ones corresponding to exact renames |
1490 | | * - remove ones not found in relevant_sources |
1491 | | */ |
1492 | 0 | trace2_region_enter("diff", "cull after exact", options->repo); |
1493 | 0 | remove_unneeded_paths_from_src(want_copies, relevant_sources); |
1494 | 0 | trace2_region_leave("diff", "cull after exact", options->repo); |
1495 | 0 | } else { |
1496 | | /* Determine minimum score to match basenames */ |
1497 | 0 | double factor = 0.5; |
1498 | 0 | char *basename_factor = getenv("GIT_BASENAME_FACTOR"); |
1499 | 0 | int min_basename_score; |
1500 | |
|
1501 | 0 | if (basename_factor) |
1502 | 0 | factor = strtol(basename_factor, NULL, 10)/100.0; |
1503 | 0 | assert(factor >= 0.0 && factor <= 1.0); |
1504 | 0 | min_basename_score = minimum_score + |
1505 | 0 | (int)(factor * (MAX_SCORE - minimum_score)); |
1506 | | |
1507 | | /* |
1508 | | * Cull sources: |
1509 | | * - remove ones involved in renames (found via exact match) |
1510 | | */ |
1511 | 0 | trace2_region_enter("diff", "cull after exact", options->repo); |
1512 | 0 | remove_unneeded_paths_from_src(want_copies, NULL); |
1513 | 0 | trace2_region_leave("diff", "cull after exact", options->repo); |
1514 | | |
1515 | | /* Preparation for basename-driven matching. */ |
1516 | 0 | trace2_region_enter("diff", "dir rename setup", options->repo); |
1517 | 0 | initialize_dir_rename_info(&info, relevant_sources, |
1518 | 0 | dirs_removed, dir_rename_count, |
1519 | 0 | cached_pairs); |
1520 | 0 | trace2_region_leave("diff", "dir rename setup", options->repo); |
1521 | | |
1522 | | /* Utilize file basenames to quickly find renames. */ |
1523 | 0 | trace2_region_enter("diff", "basename matches", options->repo); |
1524 | 0 | rename_count += find_basename_matches(options, |
1525 | 0 | min_basename_score, |
1526 | 0 | &info, |
1527 | 0 | relevant_sources, |
1528 | 0 | dirs_removed); |
1529 | 0 | trace2_region_leave("diff", "basename matches", options->repo); |
1530 | | |
1531 | | /* |
1532 | | * Cull sources, again: |
1533 | | * - remove ones involved in renames (found via basenames) |
1534 | | * - remove ones not found in relevant_sources |
1535 | | * and |
1536 | | * - remove ones in relevant_sources which are needed only |
1537 | | * for directory renames IF no ancestory directory |
1538 | | * actually needs to know any more individual path |
1539 | | * renames under them |
1540 | | */ |
1541 | 0 | trace2_region_enter("diff", "cull basename", options->repo); |
1542 | 0 | remove_unneeded_paths_from_src(want_copies, relevant_sources); |
1543 | 0 | handle_early_known_dir_renames(&info, relevant_sources, |
1544 | 0 | dirs_removed); |
1545 | 0 | trace2_region_leave("diff", "cull basename", options->repo); |
1546 | 0 | } |
1547 | | |
1548 | | /* Calculate how many rename destinations are left */ |
1549 | 0 | num_destinations = (rename_dst_nr - rename_count); |
1550 | 0 | num_sources = rename_src_nr; /* rename_src_nr reflects lower number */ |
1551 | | |
1552 | | /* All done? */ |
1553 | 0 | if (!num_destinations || !num_sources) |
1554 | 0 | goto cleanup; |
1555 | | |
1556 | 0 | switch (too_many_rename_candidates(num_destinations, num_sources, |
1557 | 0 | options)) { |
1558 | 0 | case 1: |
1559 | 0 | goto cleanup; |
1560 | 0 | case 2: |
1561 | 0 | options->degraded_cc_to_c = 1; |
1562 | 0 | skip_unmodified = 1; |
1563 | 0 | break; |
1564 | 0 | default: |
1565 | 0 | break; |
1566 | 0 | } |
1567 | | |
1568 | 0 | trace2_region_enter("diff", "inexact renames", options->repo); |
1569 | 0 | if (options->show_rename_progress) { |
1570 | 0 | progress = start_delayed_progress( |
1571 | 0 | _("Performing inexact rename detection"), |
1572 | 0 | (uint64_t)num_destinations * (uint64_t)num_sources); |
1573 | 0 | } |
1574 | | |
1575 | | /* Finish setting up dpf_options */ |
1576 | 0 | prefetch_options.skip_unmodified = skip_unmodified; |
1577 | 0 | if (options->repo == the_repository && repo_has_promisor_remote(the_repository)) { |
1578 | 0 | dpf_options.missing_object_cb = inexact_prefetch; |
1579 | 0 | dpf_options.missing_object_data = &prefetch_options; |
1580 | 0 | } |
1581 | |
|
1582 | 0 | CALLOC_ARRAY(mx, st_mult(NUM_CANDIDATE_PER_DST, num_destinations)); |
1583 | 0 | for (dst_cnt = i = 0; i < rename_dst_nr; i++) { |
1584 | 0 | struct diff_filespec *two = rename_dst[i].p->two; |
1585 | 0 | struct diff_score *m; |
1586 | |
|
1587 | 0 | if (rename_dst[i].is_rename) |
1588 | 0 | continue; /* exact or basename match already handled */ |
1589 | | |
1590 | 0 | m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST]; |
1591 | 0 | for (j = 0; j < NUM_CANDIDATE_PER_DST; j++) |
1592 | 0 | m[j].dst = -1; |
1593 | |
|
1594 | 0 | for (j = 0; j < rename_src_nr; j++) { |
1595 | 0 | struct diff_filespec *one = rename_src[j].p->one; |
1596 | 0 | struct diff_score this_src; |
1597 | |
|
1598 | 0 | assert(!one->rename_used || want_copies || break_idx); |
1599 | | |
1600 | 0 | if (skip_unmodified && |
1601 | 0 | diff_unmodified_pair(rename_src[j].p)) |
1602 | 0 | continue; |
1603 | | |
1604 | 0 | this_src.score = estimate_similarity(options->repo, |
1605 | 0 | one, two, |
1606 | 0 | minimum_score, |
1607 | 0 | &dpf_options); |
1608 | 0 | this_src.name_score = basename_same(one, two); |
1609 | 0 | this_src.dst = i; |
1610 | 0 | this_src.src = j; |
1611 | 0 | record_if_better(m, &this_src); |
1612 | | /* |
1613 | | * Once we run estimate_similarity, |
1614 | | * We do not need the text anymore. |
1615 | | */ |
1616 | 0 | diff_free_filespec_blob(one); |
1617 | 0 | diff_free_filespec_blob(two); |
1618 | 0 | } |
1619 | 0 | dst_cnt++; |
1620 | 0 | display_progress(progress, |
1621 | 0 | (uint64_t)dst_cnt * (uint64_t)num_sources); |
1622 | 0 | } |
1623 | 0 | stop_progress(&progress); |
1624 | | |
1625 | | /* cost matrix sorted by most to least similar pair */ |
1626 | 0 | STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare); |
1627 | |
|
1628 | 0 | rename_count += find_renames(mx, dst_cnt, minimum_score, 0, |
1629 | 0 | &info, dirs_removed); |
1630 | 0 | if (want_copies) |
1631 | 0 | rename_count += find_renames(mx, dst_cnt, minimum_score, 1, |
1632 | 0 | &info, dirs_removed); |
1633 | 0 | free(mx); |
1634 | 0 | trace2_region_leave("diff", "inexact renames", options->repo); |
1635 | |
|
1636 | 0 | cleanup: |
1637 | | /* At this point, we have found some renames and copies and they |
1638 | | * are recorded in rename_dst. The original list is still in *q. |
1639 | | */ |
1640 | 0 | trace2_region_enter("diff", "write back to queue", options->repo); |
1641 | 0 | DIFF_QUEUE_CLEAR(&outq); |
1642 | 0 | for (i = 0; i < q->nr; i++) { |
1643 | 0 | struct diff_filepair *p = q->queue[i]; |
1644 | 0 | struct diff_filepair *pair_to_free = NULL; |
1645 | |
|
1646 | 0 | if (DIFF_PAIR_UNMERGED(p)) { |
1647 | 0 | diff_q(&outq, p); |
1648 | 0 | } |
1649 | 0 | else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { |
1650 | | /* Creation */ |
1651 | 0 | diff_q(&outq, p); |
1652 | 0 | } |
1653 | 0 | else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) { |
1654 | | /* |
1655 | | * Deletion |
1656 | | * |
1657 | | * We would output this delete record if: |
1658 | | * |
1659 | | * (1) this is a broken delete and the counterpart |
1660 | | * broken create remains in the output; or |
1661 | | * (2) this is not a broken delete, and rename_dst |
1662 | | * does not have a rename/copy to move p->one->path |
1663 | | * out of existence. |
1664 | | * |
1665 | | * Otherwise, the counterpart broken create |
1666 | | * has been turned into a rename-edit; or |
1667 | | * delete did not have a matching create to |
1668 | | * begin with. |
1669 | | */ |
1670 | 0 | if (DIFF_PAIR_BROKEN(p)) { |
1671 | | /* broken delete */ |
1672 | 0 | struct diff_rename_dst *dst = locate_rename_dst(p); |
1673 | 0 | if (!dst) |
1674 | 0 | BUG("tracking failed somehow; failed to find associated dst for broken pair"); |
1675 | 0 | if (dst->is_rename) |
1676 | | /* counterpart is now rename/copy */ |
1677 | 0 | pair_to_free = p; |
1678 | 0 | } |
1679 | 0 | else { |
1680 | 0 | if (p->one->rename_used) |
1681 | | /* this path remains */ |
1682 | 0 | pair_to_free = p; |
1683 | 0 | } |
1684 | | |
1685 | 0 | if (!pair_to_free) |
1686 | 0 | diff_q(&outq, p); |
1687 | 0 | } |
1688 | 0 | else if (!diff_unmodified_pair(p)) |
1689 | | /* all the usual ones need to be kept */ |
1690 | 0 | diff_q(&outq, p); |
1691 | 0 | else |
1692 | | /* no need to keep unmodified pairs */ |
1693 | 0 | pair_to_free = p; |
1694 | | |
1695 | 0 | if (pair_to_free) |
1696 | 0 | pool_diff_free_filepair(pool, pair_to_free); |
1697 | 0 | } |
1698 | 0 | diff_debug_queue("done copying original", &outq); |
1699 | |
|
1700 | 0 | free(q->queue); |
1701 | 0 | *q = outq; |
1702 | 0 | diff_debug_queue("done collapsing", q); |
1703 | |
|
1704 | 0 | for (i = 0; i < rename_dst_nr; i++) |
1705 | 0 | if (rename_dst[i].filespec_to_free) |
1706 | 0 | pool_free_filespec(pool, rename_dst[i].filespec_to_free); |
1707 | |
|
1708 | 0 | cleanup_dir_rename_info(&info, dirs_removed, dir_rename_count != NULL); |
1709 | 0 | FREE_AND_NULL(rename_dst); |
1710 | 0 | rename_dst_nr = rename_dst_alloc = 0; |
1711 | 0 | FREE_AND_NULL(rename_src); |
1712 | 0 | rename_src_nr = rename_src_alloc = 0; |
1713 | 0 | if (break_idx) { |
1714 | 0 | strintmap_clear(break_idx); |
1715 | 0 | FREE_AND_NULL(break_idx); |
1716 | 0 | } |
1717 | 0 | trace2_region_leave("diff", "write back to queue", options->repo); |
1718 | 0 | return; |
1719 | 0 | } |
1720 | | |
1721 | | void diffcore_rename(struct diff_options *options) |
1722 | 0 | { |
1723 | 0 | diffcore_rename_extended(options, NULL, NULL, NULL, NULL, NULL); |
1724 | 0 | } |