From 2123a17f83cc95adee4e7259d0686b07f074de4c Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 2 Dec 2013 13:27:06 -0800 Subject: [PATCH 1/3] Fix bug making split deltas a COPIED targets When FIND_COPIES is used in combination with BREAK_REWRITES for rename detection, there was a bug where the split MODIFIED delta was only used as a target for RENAME records and not for COPIED records. This fixes that, converting the split into a pair of DELETED and COPIED deltas when that circumstance arises. --- src/diff_tform.c | 46 +++++++++++++++++++++++++++--------------- tests/diff/rename.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/diff_tform.c b/src/diff_tform.c index 28a9cc70d..3b5575d28 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -366,12 +366,28 @@ static int normalize_find_opts( return 0; } +static int insert_delete_side_of_split( + git_diff *diff, git_vector *onto, const git_diff_delta *delta) +{ + /* make new record for DELETED side of split */ + git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool); + GITERR_CHECK_ALLOC(deleted); + + deleted->status = GIT_DELTA_DELETED; + deleted->nfiles = 1; + memset(&deleted->new_file, 0, sizeof(deleted->new_file)); + deleted->new_file.path = deleted->old_file.path; + deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; + + return git_vector_insert(onto, deleted); +} + static int apply_splits_and_deletes( git_diff *diff, size_t expected_size, bool actually_split) { git_vector onto = GIT_VECTOR_INIT; size_t i; - git_diff_delta *delta, *deleted; + git_diff_delta *delta; if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) return -1; @@ -384,17 +400,7 @@ static int apply_splits_and_deletes( if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) { delta->similarity = 0; - /* make new record for DELETED side of split */ - if (!(deleted = diff_delta__dup(delta, &diff->pool))) - goto on_error; - - deleted->status = GIT_DELTA_DELETED; - deleted->nfiles = 1; - memset(&deleted->new_file, 0, sizeof(deleted->new_file)); - deleted->new_file.path = deleted->old_file.path; - deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; - - if (git_vector_insert(&onto, deleted) < 0) + if (insert_delete_side_of_split(diff, &onto, delta) < 0) goto on_error; if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) @@ -1058,10 +1064,7 @@ find_best_matches: } } - else if (delta_is_new_only(tgt)) { - if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) - continue; - + else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) { if (tgt2src_copy[t].similarity < opts.copy_threshold) continue; @@ -1069,10 +1072,21 @@ find_best_matches: best_match = &tgt2src_copy[t]; src = GIT_VECTOR_GET(&diff->deltas, best_match->idx); + if (delta_is_split(tgt)) { + error = insert_delete_side_of_split(diff, &diff->deltas, tgt); + if (error < 0) + goto cleanup; + num_rewrites--; + } + + if (!delta_is_split(tgt) && !delta_is_new_only(tgt)) + continue; + tgt->status = GIT_DELTA_COPIED; tgt->similarity = best_match->similarity; tgt->nfiles = 2; memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file)); + tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; num_updates++; } diff --git a/tests/diff/rename.c b/tests/diff/rename.c index 42bb65aa8..7f033936e 100644 --- a/tests/diff/rename.c +++ b/tests/diff/rename.c @@ -1284,3 +1284,52 @@ void test_diff_rename__rewrite_on_single_file(void) git_diff_free(diff); git_index_free(index); } + +void test_diff_rename__can_find_copy_to_split(void) +{ + git_buf c1 = GIT_BUF_INIT; + git_index *index; + git_tree *tree; + git_diff *diff; + git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT; + git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT; + diff_expects exp; + + cl_git_pass(git_futils_readbuffer(&c1, "renames/songof7cities.txt")); + cl_git_pass(git_futils_writebuffer(&c1, "renames/untimely.txt", 0, 0)); + + cl_git_pass( + git_revparse_single((git_object **)&tree, g_repo, "HEAD^{tree}")); + + cl_git_pass(git_repository_index(&index, g_repo)); + cl_git_pass(git_index_read_tree(index, tree)); + cl_git_pass(git_index_add_bypath(index, "untimely.txt")); + + diffopts.flags = GIT_DIFF_INCLUDE_UNMODIFIED; + + cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts)); + + memset(&exp, 0, sizeof(exp)); + cl_git_pass(git_diff_foreach( + diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); + cl_assert_equal_i(4, exp.files); + cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]); + cl_assert_equal_i(3, exp.file_status[GIT_DELTA_UNMODIFIED]); + + opts.flags = GIT_DIFF_FIND_ALL; + cl_git_pass(git_diff_find_similar(diff, &opts)); + + memset(&exp, 0, sizeof(exp)); + cl_git_pass(git_diff_foreach( + diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); + cl_assert_equal_i(5, exp.files); + cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]); + cl_assert_equal_i(1, exp.file_status[GIT_DELTA_COPIED]); + cl_assert_equal_i(3, exp.file_status[GIT_DELTA_UNMODIFIED]); + + git_diff_free(diff); + git_tree_free(tree); + git_index_free(index); + + git_buf_free(&c1); +} From 97ad85b88d7a2f6d1c31a07de3c3c0b4b504d4ee Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 2 Dec 2013 13:30:05 -0800 Subject: [PATCH 2/3] Add GIT_DIFF_FIND_DELETE_UNMODIFIED flag When doing copy detection, it is often necessary to include UNMODIFIED records in the git_diff so they are available as source records for GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED. Yet in the final diff, often you will not want to have these UNMODIFIED records. This adds a flag which marks these UNMODIFIED records for deletion from the diff list so they will be removed after the rename detect phase is over. --- include/git2/diff.h | 58 ++++++++++++++++++++++++++++++++++----------- src/diff_tform.c | 2 ++ tests/diff/rename.c | 48 +++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 14 deletions(-) diff --git a/include/git2/diff.h b/include/git2/diff.h index 61d288380..51ad349e3 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -468,41 +468,71 @@ typedef int (*git_diff_line_cb)( * Flags to control the behavior of diff rename/copy detection. */ typedef enum { - /** look for renames? (`--find-renames`) */ + /** Look for renames? (`--find-renames`) */ GIT_DIFF_FIND_RENAMES = (1u << 0), - /** consider old side of modified for renames? (`--break-rewrites=N`) */ + + /** Consider old side of MODIFIED for renames? (`--break-rewrites=N`) */ GIT_DIFF_FIND_RENAMES_FROM_REWRITES = (1u << 1), - /** look for copies? (a la `--find-copies`) */ + /** Look for copies? (a la `--find-copies`). */ GIT_DIFF_FIND_COPIES = (1u << 2), - /** consider unmodified as copy sources? (`--find-copies-harder`) */ + + /** Consider UNMODIFIED as copy sources? (`--find-copies-harder`). + * + * For this to work correctly, use GIT_DIFF_INCLUDE_UNMODIFIED when + * the initial `git_diff` is being generated. + */ GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED = (1u << 3), - /** mark large rewrites for split (`--break-rewrites=/M`) */ + /** Mark significant rewrites for split (`--break-rewrites=/M`) */ GIT_DIFF_FIND_REWRITES = (1u << 4), - /** actually split large rewrites into delete/add pairs */ + /** Actually split large rewrites into delete/add pairs */ GIT_DIFF_BREAK_REWRITES = (1u << 5), - /** mark rewrites for split and break into delete/add pairs */ + /** Mark rewrites for split and break into delete/add pairs */ GIT_DIFF_FIND_AND_BREAK_REWRITES = (GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES), - /** find renames/copies for untracked items in working directory */ + /** Find renames/copies for UNTRACKED items in working directory. + * + * For this to work correctly, use GIT_DIFF_INCLUDE_UNTRACKED when the + * initial `git_diff` is being generated (and obviously the diff must + * be against the working directory for this to make sense). + */ GIT_DIFF_FIND_FOR_UNTRACKED = (1u << 6), - /** turn on all finding features */ + /** Turn on all finding features. */ GIT_DIFF_FIND_ALL = (0x0ff), - /** measure similarity ignoring leading whitespace (default) */ + /** Measure similarity ignoring leading whitespace (default) */ GIT_DIFF_FIND_IGNORE_LEADING_WHITESPACE = 0, - /** measure similarity ignoring all whitespace */ + /** Measure similarity ignoring all whitespace */ GIT_DIFF_FIND_IGNORE_WHITESPACE = (1u << 12), - /** measure similarity including all data */ + /** Measure similarity including all data */ GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1u << 13), - /** measure similarity only by comparing SHAs (fast and cheap) */ + /** Measure similarity only by comparing SHAs (fast and cheap) */ GIT_DIFF_FIND_EXACT_MATCH_ONLY = (1u << 14), - /** do not break rewrites unless they contribute to a rename */ + /** Do not break rewrites unless they contribute to a rename. + * + * Normally, GIT_DIFF_FIND_AND_BREAK_REWRITES will measure the self- + * similarity of modified files and split the ones that have changed a + * lot into a DELETE / ADD pair. Then the sides of that pair will be + * considered candidates for rename and copy detection. + * + * If you add this flag in and the split pair is *not* used for an + * actual rename or copy, then the modified record will be restored to + * a regular MODIFIED record instead of being split. + */ GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY = (1u << 15), + + /** Delete any UNMODIFIED records after find_similar is done. + * + * Using GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED to emulate the + * --find-copies-harder behavior requires building a diff with the + * GIT_DIFF_INCLUDE_UNMODIFIED flag. If you do not want UNMODIFIED + * records in the final result, pass this flag to have them removed. + */ + GIT_DIFF_FIND_DELETE_UNMODIFIED = (1u << 16), } git_diff_find_t; /** diff --git a/src/diff_tform.c b/src/diff_tform.c index 3b5575d28..bff821dc3 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -746,6 +746,8 @@ static bool is_rename_source( case GIT_DELTA_UNMODIFIED: if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) return false; + if (FLAG_SET(opts, GIT_DIFF_FIND_DELETE_UNMODIFIED)) + delta->flags |= GIT_DIFF_FLAG__TO_DELETE; break; default: /* MODIFIED, RENAMED, COPIED */ diff --git a/tests/diff/rename.c b/tests/diff/rename.c index 7f033936e..e081d616f 100644 --- a/tests/diff/rename.c +++ b/tests/diff/rename.c @@ -1333,3 +1333,51 @@ void test_diff_rename__can_find_copy_to_split(void) git_buf_free(&c1); } + +void test_diff_rename__can_delete_unmodified_deltas(void) +{ + git_buf c1 = GIT_BUF_INIT; + git_index *index; + git_tree *tree; + git_diff *diff; + git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT; + git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT; + diff_expects exp; + + cl_git_pass(git_futils_readbuffer(&c1, "renames/songof7cities.txt")); + cl_git_pass(git_futils_writebuffer(&c1, "renames/untimely.txt", 0, 0)); + + cl_git_pass( + git_revparse_single((git_object **)&tree, g_repo, "HEAD^{tree}")); + + cl_git_pass(git_repository_index(&index, g_repo)); + cl_git_pass(git_index_read_tree(index, tree)); + cl_git_pass(git_index_add_bypath(index, "untimely.txt")); + + diffopts.flags = GIT_DIFF_INCLUDE_UNMODIFIED; + + cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts)); + + memset(&exp, 0, sizeof(exp)); + cl_git_pass(git_diff_foreach( + diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); + cl_assert_equal_i(4, exp.files); + cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]); + cl_assert_equal_i(3, exp.file_status[GIT_DELTA_UNMODIFIED]); + + opts.flags = GIT_DIFF_FIND_ALL | GIT_DIFF_FIND_DELETE_UNMODIFIED; + cl_git_pass(git_diff_find_similar(diff, &opts)); + + memset(&exp, 0, sizeof(exp)); + cl_git_pass(git_diff_foreach( + diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); + cl_assert_equal_i(2, exp.files); + cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]); + cl_assert_equal_i(1, exp.file_status[GIT_DELTA_COPIED]); + + git_diff_free(diff); + git_tree_free(tree); + git_index_free(index); + + git_buf_free(&c1); +} From f62c174d0d88c3e491fe3f4e3490959edb7cae1b Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 2 Dec 2013 13:49:58 -0800 Subject: [PATCH 3/3] GIT_DIFF_FIND_REMOVE_UNMODIFIED sounds better --- include/git2/diff.h | 4 ++-- src/diff_tform.c | 2 +- tests/diff/rename.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/git2/diff.h b/include/git2/diff.h index 51ad349e3..db6bce2eb 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -525,14 +525,14 @@ typedef enum { */ GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY = (1u << 15), - /** Delete any UNMODIFIED records after find_similar is done. + /** Remove any UNMODIFIED deltas after find_similar is done. * * Using GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED to emulate the * --find-copies-harder behavior requires building a diff with the * GIT_DIFF_INCLUDE_UNMODIFIED flag. If you do not want UNMODIFIED * records in the final result, pass this flag to have them removed. */ - GIT_DIFF_FIND_DELETE_UNMODIFIED = (1u << 16), + GIT_DIFF_FIND_REMOVE_UNMODIFIED = (1u << 16), } git_diff_find_t; /** diff --git a/src/diff_tform.c b/src/diff_tform.c index bff821dc3..0a28e58c7 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -746,7 +746,7 @@ static bool is_rename_source( case GIT_DELTA_UNMODIFIED: if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) return false; - if (FLAG_SET(opts, GIT_DIFF_FIND_DELETE_UNMODIFIED)) + if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED)) delta->flags |= GIT_DIFF_FLAG__TO_DELETE; break; diff --git a/tests/diff/rename.c b/tests/diff/rename.c index e081d616f..919f5139a 100644 --- a/tests/diff/rename.c +++ b/tests/diff/rename.c @@ -1365,7 +1365,7 @@ void test_diff_rename__can_delete_unmodified_deltas(void) cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]); cl_assert_equal_i(3, exp.file_status[GIT_DELTA_UNMODIFIED]); - opts.flags = GIT_DIFF_FIND_ALL | GIT_DIFF_FIND_DELETE_UNMODIFIED; + opts.flags = GIT_DIFF_FIND_ALL | GIT_DIFF_FIND_REMOVE_UNMODIFIED; cl_git_pass(git_diff_find_similar(diff, &opts)); memset(&exp, 0, sizeof(exp));