diff --git a/src/diff_print.c b/src/diff_print.c index 30f221a62..0de548813 100644 --- a/src/diff_print.c +++ b/src/diff_print.c @@ -46,7 +46,7 @@ static char diff_pick_suffix(int mode) { if (S_ISDIR(mode)) return '/'; - else if (mode & 0100) //-V536 + else if (mode & 0100) /* -V536 */ /* in git, modes are very regular, so we must have 0100755 mode */ return '*'; else @@ -162,7 +162,7 @@ static int diff_print_one_raw( if (delta->similarity > 0) git_buf_printf(out, "%03u", delta->similarity); - if (delta->status == GIT_DELTA_RENAMED || delta->status == GIT_DELTA_COPIED) + if (delta->old_file.path != delta->new_file.path) git_buf_printf( out, "\t%s %s\n", delta->old_file.path, delta->new_file.path); else diff --git a/src/diff_tform.c b/src/diff_tform.c index 64746e7dd..8c4e96ecf 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -673,6 +673,15 @@ GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta) delta->status == GIT_DELTA_IGNORED); } +GIT_INLINE(void) delta_make_rename( + git_diff_delta *to, const git_diff_delta *from, uint32_t similarity) +{ + to->status = GIT_DELTA_RENAMED; + to->similarity = similarity; + memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; +} + typedef struct { uint32_t idx; uint32_t similarity; @@ -682,13 +691,15 @@ int git_diff_find_similar( git_diff_list *diff, git_diff_find_options *given_opts) { - size_t i, j, cache_size; + size_t i, j, sigcache_size; int error = 0, similarity; git_diff_delta *from, *to; git_diff_find_options opts; - size_t num_rewrites = 0, num_updates = 0; - void **cache; /* cache of similarity metric file signatures */ - diff_find_match *match_sources, *match_targets; /* cache of best matches */ + size_t num_srcs = 0, num_tgts = 0, tried_srcs = 0, tried_tgts = 0; + size_t num_rewrites = 0, num_updates = 0, num_bumped = 0; + void **sigcache; /* cache of similarity metric file signatures */ + diff_find_match *match_srcs = NULL, *match_tgts = NULL, *best_match; + git_diff_file swap; if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) return error; @@ -697,70 +708,109 @@ int git_diff_find_similar( if (!git__is_uint32(diff->deltas.length)) return 0; - cache_size = diff->deltas.length * 2; /* must store b/c length may change */ - cache = git__calloc(cache_size, sizeof(void *)); - GITERR_CHECK_ALLOC(cache); - - match_sources = git__calloc(diff->deltas.length, sizeof(diff_find_match)); - match_targets = git__calloc(diff->deltas.length, sizeof(diff_find_match)); - GITERR_CHECK_ALLOC(match_sources); - GITERR_CHECK_ALLOC(match_targets); - - /* next find the most similar delta for each rename / copy candidate */ + sigcache_size = diff->deltas.length * 2; /* keep size b/c diff may change */ + sigcache = git__calloc(sigcache_size, sizeof(void *)); + GITERR_CHECK_ALLOC(sigcache); + /* Label rename sources and targets + * + * This will also set self-similarity scores for MODIFIED files and + * mark them for splitting if break-rewrites is enabled + */ git_vector_foreach(&diff->deltas, i, to) { - size_t tried_sources = 0; + if (is_rename_source(diff, &opts, i, sigcache)) + ++num_srcs; - match_targets[i].idx = (uint32_t)i; - match_targets[i].similarity = 0; - - /* skip things that are not rename targets */ - if (!is_rename_target(diff, &opts, i, cache)) - continue; - - git_vector_foreach(&diff->deltas, j, from) { - if (i == j) - continue; - - /* skip things that are not rename sources */ - if (!is_rename_source(diff, &opts, j, cache)) - continue; - - /* cap on maximum targets we'll examine (per "to" file) */ - if (++tried_sources > opts.rename_limit) - break; - - /* calculate similarity for this pair and find best match */ - if ((error = similarity_measure( - &similarity, diff, &opts, cache, 2 * j, 2 * i + 1)) < 0) - goto cleanup; - - if (similarity < 0) { /* not actually comparable */ - --tried_sources; - continue; - } - - if (match_targets[i].similarity < (uint32_t)similarity && - match_sources[j].similarity < (uint32_t)similarity) { - match_targets[i].similarity = (uint32_t)similarity; - match_sources[j].similarity = (uint32_t)similarity; - match_targets[i].idx = (uint32_t)j; - match_sources[j].idx = (uint32_t)i; - } - } + if (is_rename_target(diff, &opts, i, sigcache)) + ++num_tgts; } - /* next rewrite the diffs with renames / copies */ + /* if there are no candidate srcs or tgts, we're done */ + if (!num_srcs || !num_tgts) + goto cleanup; + + match_tgts = git__calloc(diff->deltas.length, sizeof(diff_find_match)); + GITERR_CHECK_ALLOC(match_tgts); + match_srcs = git__calloc(diff->deltas.length, sizeof(diff_find_match)); + GITERR_CHECK_ALLOC(match_srcs); + + /* + * Find best-fit matches for rename / copy candidates + */ + +find_best_matches: + tried_tgts = num_bumped = 0; git_vector_foreach(&diff->deltas, i, to) { - /* check if this delta was the target of a similarity */ - if ((similarity = (int)match_targets[i].similarity) <= 0) + /* skip things that are not rename targets */ + if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) continue; - assert(to && (to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) != 0); + tried_srcs = 0; - from = GIT_VECTOR_GET(&diff->deltas, match_targets[i].idx); - assert(from && (from->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) != 0); + git_vector_foreach(&diff->deltas, j, from) { + /* skip things that are not rename sources */ + if ((from->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0) + continue; + + /* calculate similarity for this pair and find best match */ + if (i == j) + similarity = -1; /* don't measure self-similarity here */ + else if ((error = similarity_measure( + &similarity, diff, &opts, sigcache, 2 * j, 2 * i + 1)) < 0) + goto cleanup; + + /* if this pairing is better for the src and the tgt, keep it */ + if (similarity > 0 && + match_tgts[i].similarity < (uint32_t)similarity && + match_srcs[j].similarity < (uint32_t)similarity) + { + if (match_tgts[i].similarity > 0) { + match_tgts[match_srcs[j].idx].similarity = 0; + match_srcs[match_tgts[i].idx].similarity = 0; + ++num_bumped; + } + + match_tgts[i].similarity = (uint32_t)similarity; + match_tgts[i].idx = (uint32_t)j; + + match_srcs[j].similarity = (uint32_t)similarity; + match_srcs[j].idx = (uint32_t)i; + } + + if (++tried_srcs >= num_srcs) + break; + + /* cap on maximum targets we'll examine (per "to" file) */ + if (tried_srcs > opts.rename_limit) + break; + } + + if (++tried_tgts >= num_tgts) + break; + } + + if (num_bumped > 0) /* try again if we bumped some items */ + goto find_best_matches; + + /* + * Rewrite the diffs with renames / copies + */ + + tried_tgts = 0; + + git_vector_foreach(&diff->deltas, i, to) { + /* skip things that are not rename targets */ + if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) + continue; + + /* check if this delta was the target of a similarity */ + best_match = &match_tgts[i]; + if (!best_match->similarity) + continue; + + j = best_match->idx; + from = GIT_VECTOR_GET(&diff->deltas, j); /* possible scenarios: * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME @@ -774,99 +824,84 @@ int git_diff_find_similar( if (delta_is_new_only(to)) { - if (similarity < (int)opts.rename_threshold) + if (best_match->similarity < opts.rename_threshold) continue; - from->status = GIT_DELTA_RENAMED; - from->similarity = (uint32_t)similarity; - memcpy(&from->new_file, &to->new_file, sizeof(from->new_file)); - - to->flags |= GIT_DIFF_FLAG__TO_DELETE; + delta_make_rename(to, from, best_match->similarity); + from->flags |= GIT_DIFF_FLAG__TO_DELETE; num_rewrites++; } else { assert(delta_is_split(to)); - if (similarity < (int)opts.rename_from_rewrite_threshold) + if (best_match->similarity < opts.rename_from_rewrite_threshold) continue; - from->status = GIT_DELTA_RENAMED; - from->similarity = (uint32_t)similarity; - memcpy(&from->new_file, &to->new_file, sizeof(from->new_file)); + memcpy(&swap, &to->old_file, sizeof(swap)); - to->status = GIT_DELTA_DELETED; - memset(&to->new_file, 0, sizeof(to->new_file)); - to->new_file.path = to->old_file.path; - to->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; - if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) { - to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; - num_rewrites--; - } + delta_make_rename(to, from, best_match->similarity); + num_rewrites--; + + from->status = GIT_DELTA_DELETED; + memcpy(&from->old_file, &swap, sizeof(from->old_file)); + memset(&from->new_file, 0, sizeof(from->new_file)); + from->new_file.path = from->old_file.path; + from->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; num_updates++; } } else if (delta_is_split(from)) { - git_diff_file swap; if (delta_is_new_only(to)) { - if (similarity < (int)opts.rename_threshold) + if (best_match->similarity < opts.rename_threshold) continue; - memcpy(&swap, &from->new_file, sizeof(swap)); + delta_make_rename(to, from, best_match->similarity); - from->status = GIT_DELTA_RENAMED; - from->similarity = (uint32_t)similarity; - memcpy(&from->new_file, &to->new_file, sizeof(from->new_file)); - if ((from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) { - from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; - num_rewrites--; - } - - to->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ? + from->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ? GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED; - memcpy(&to->new_file, &swap, sizeof(to->new_file)); - to->old_file.path = to->new_file.path; + memset(&from->old_file, 0, sizeof(from->old_file)); + from->old_file.path = from->new_file.path; + from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; + + from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; + num_rewrites--; num_updates++; } else { assert(delta_is_split(from)); - if (similarity < (int)opts.rename_from_rewrite_threshold) + if (best_match->similarity < opts.rename_from_rewrite_threshold) continue; - memcpy(&swap, &to->new_file, sizeof(swap)); + memcpy(&swap, &to->old_file, sizeof(swap)); - to->status = GIT_DELTA_RENAMED; - to->similarity = (uint32_t)similarity; - memcpy(&to->new_file, &from->new_file, sizeof(to->new_file)); - if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) { - to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; - num_rewrites--; - } + delta_make_rename(to, from, best_match->similarity); + num_rewrites--; + num_updates++; - memcpy(&from->new_file, &swap, sizeof(from->new_file)); - if ((from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) { - from->flags |= GIT_DIFF_FLAG__TO_SPLIT; - num_rewrites++; - } + memcpy(&from->old_file, &swap, sizeof(from->old_file)); - /* in the off chance that we've just swapped the new - * element into the correct place, clear the SPLIT flag + /* if we've just swapped the new element into the correct + * place, clear the SPLIT flag */ - if (match_targets[match_targets[i].idx].idx == i && - match_targets[match_targets[i].idx].similarity > + if (match_tgts[j].idx == i && + match_tgts[j].similarity > opts.rename_from_rewrite_threshold) { - from->status = GIT_DELTA_RENAMED; - from->similarity = - (uint32_t)match_targets[match_targets[i].idx].similarity; - match_targets[match_targets[i].idx].similarity = 0; + from->status = GIT_DELTA_RENAMED; + from->similarity = match_tgts[j].similarity; + match_tgts[j].similarity = 0; from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; num_rewrites--; } + /* otherwise, if we just overwrote a source, update mapping */ + else if (j > i && match_srcs[i].similarity > 0) { + match_tgts[match_srcs[i].idx].idx = j; + } num_updates++; } @@ -874,31 +909,35 @@ int git_diff_find_similar( else if (delta_is_new_only(to)) { if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES) || - similarity < (int)opts.copy_threshold) + best_match->similarity < opts.copy_threshold) continue; - to->status = GIT_DELTA_COPIED; - to->similarity = (uint32_t)similarity; + to->status = GIT_DELTA_COPIED; + to->similarity = best_match->similarity; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); num_updates++; } } + /* + * Actually split and delete entries as needed + */ + if (num_rewrites > 0 || num_updates > 0) error = apply_splits_and_deletes( diff, diff->deltas.length - num_rewrites, FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES)); cleanup: - git__free(match_sources); - git__free(match_targets); + git__free(match_srcs); + git__free(match_tgts); - for (i = 0; i < cache_size; ++i) { - if (cache[i] != NULL) - opts.metric->free_signature(cache[i], opts.metric->payload); + for (i = 0; i < sigcache_size; ++i) { + if (sigcache[i] != NULL) + opts.metric->free_signature(sigcache[i], opts.metric->payload); } - git__free(cache); + git__free(sigcache); if (!given_opts || !given_opts->metric) git__free(opts.metric); diff --git a/tests-clar/diff/rename.c b/tests-clar/diff/rename.c index e6bf72c17..79f407057 100644 --- a/tests-clar/diff/rename.c +++ b/tests-clar/diff/rename.c @@ -755,7 +755,7 @@ void test_diff_rename__file_partial_exchange(void) git_buf_free(&c2); } -void test_diff_rename__file_split(void) +void test_diff_rename__rename_and_copy_from_same_source(void) { git_buf c1 = GIT_BUF_INIT, c2 = GIT_BUF_INIT; git_index *index; @@ -947,6 +947,7 @@ void test_diff_rename__rejected_match_can_match_others(void) cl_git_pass( git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts)); + cl_git_pass(git_diff_find_similar(diff, &findopts)); cl_git_pass( @@ -967,10 +968,10 @@ static void write_similarity_file_two(const char *filename, size_t b_lines) size_t i; for (i = 0; i < b_lines; i++) - git_buf_printf(&contents, "%0.2d - bbbbb\r\n", (i+1)); + git_buf_printf(&contents, "%0.2d - bbbbb\r\n", (int)(i+1)); for (i = b_lines; i < 50; i++) - git_buf_printf(&contents, "%0.2d - aaaaa%s", (i+1), (i == 49 ? "" : "\r\n")); + git_buf_printf(&contents, "%0.2d - aaaaa%s", (int)(i+1), (i == 49 ? "" : "\r\n")); cl_git_pass( git_futils_writebuffer(&contents, filename, O_RDWR|O_CREAT, 0777)); @@ -1018,6 +1019,7 @@ void test_diff_rename__rejected_match_can_match_others_two(void) cl_git_pass( git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts)); + cl_git_pass(git_diff_find_similar(diff, &findopts)); cl_git_pass(