diff --git a/include/git2/diff.h b/include/git2/diff.h index 54966f14a..172aa118b 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -243,6 +243,19 @@ typedef struct { * `NOT_BINARY` flag set to avoid examining file contents if you do not pass * in hunk and/or line callbacks to the diff foreach iteration function. It * will just use the git attributes for those files. + * + * The similarity score is zero unless you call `git_diff_find_similar()` + * which does a similarity analysis of files in the diff. Use that + * function to do rename and copy detection, and to split heavily modified + * files into add/delete pairs. After that call, deltas with a status of + * GIT_DELTA_RENAMED or GIT_DELTA_COPIED will have a similarity score + * between 0 and 100 indicating how similar the old and new sides are. + * + * If you ask `git_diff_find_similar` to find heavily modified files to + * break, but to not *actually* break the records, then GIT_DELTA_MODIFIED + * records may have a non-zero similarity score if the self-similarity is + * below the split threshold. To display this value like core Git, invert + * the score (a la `printf("M%03d", 100 - delta->similarity)`). */ typedef struct { git_diff_file old_file; @@ -408,18 +421,26 @@ typedef enum { /** consider unmodified as copy sources? 
(`--find-copies-harder`) */ GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED = (1 << 3), - /** split large rewrites into delete/add pairs (`--break-rewrites=/M`) */ - GIT_DIFF_FIND_AND_BREAK_REWRITES = (1 << 4), + /** mark large rewrites for split (`--break-rewrites=/M`) */ + GIT_DIFF_FIND_REWRITES = (1 << 4), + /** actually split large rewrites into delete/add pairs */ + GIT_DIFF_BREAK_REWRITES = (1 << 5), + /** mark rewrites for split and break into delete/add pairs */ + GIT_DIFF_FIND_AND_BREAK_REWRITES = + (GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES), + + /** consider untracked files as rename/copy targets */ + GIT_DIFF_FIND_FROM_UNTRACKED = (1 << 6), /** turn on all finding features */ - GIT_DIFF_FIND_ALL = (0x1f), + GIT_DIFF_FIND_ALL = (0x0ff), /** measure similarity ignoring leading whitespace (default) */ GIT_DIFF_FIND_IGNORE_LEADING_WHITESPACE = 0, /** measure similarity ignoring all whitespace */ - GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 6), + GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12), /** measure similarity including all data */ - GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 7), + GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13), } git_diff_find_t; /** @@ -446,7 +467,7 @@ typedef struct { * - `copy_threshold` is the same as the -C option with a value * - `rename_from_rewrite_threshold` matches the top of the -B option * - `break_rewrite_threshold` matches the bottom of the -B option - * - `target_limit` matches the -l option + * - `target_limit` matches the -l option (approximately) * * The `metric` option allows you to plug in a custom similarity metric. 
* Set it to NULL for the default internal metric which is based on sampling @@ -461,18 +482,18 @@ typedef struct { unsigned int flags; /** Similarity to consider a file renamed (default 50) */ - unsigned int rename_threshold; + uint16_t rename_threshold; /** Similarity of modified to be eligible rename source (default 50) */ - unsigned int rename_from_rewrite_threshold; + uint16_t rename_from_rewrite_threshold; /** Similarity to consider a file a copy (default 50) */ - unsigned int copy_threshold; + uint16_t copy_threshold; /** Similarity to split modify into delete/add pair (default 60) */ - unsigned int break_rewrite_threshold; + uint16_t break_rewrite_threshold; /** Maximum similarity sources to examine (a la diff's `-l` option or * the `diff.renameLimit` config) (default 200) */ - unsigned int target_limit; + size_t target_limit; /** Pluggable similarity metric; pass NULL to use internal metric */ git_diff_similarity_metric *metric; diff --git a/src/diff_tform.c b/src/diff_tform.c index 84650a37b..33268e403 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -19,11 +19,13 @@ static git_diff_delta *diff_delta__dup( memcpy(delta, d, sizeof(git_diff_delta)); - delta->old_file.path = git_pool_strdup(pool, d->old_file.path); - if (delta->old_file.path == NULL) - goto fail; + if (d->old_file.path != NULL) { + delta->old_file.path = git_pool_strdup(pool, d->old_file.path); + if (delta->old_file.path == NULL) + goto fail; + } - if (d->new_file.path != d->old_file.path) { + if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) { delta->new_file.path = git_pool_strdup(pool, d->new_file.path); if (delta->new_file.path == NULL) goto fail; @@ -259,6 +261,9 @@ static int normalize_find_opts( if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED) opts->flags |= GIT_DIFF_FIND_COPIES; + if (opts->flags & GIT_DIFF_BREAK_REWRITES) + opts->flags |= GIT_DIFF_FIND_REWRITES; + #define USE_DEFAULT(X) ((X) == 0 || (X) > 100) if (USE_DEFAULT(opts->rename_threshold)) 
@@ -307,11 +312,33 @@ static int normalize_find_opts( return 0; } -static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) +static void validate_delta(git_diff_delta *delta) +{ + assert(delta); + return; +/* + switch (delta->status) { + case GIT_DELTA_ADDED: + case GIT_DELTA_UNTRACKED: + case GIT_DELTA_IGNORED: + assert(delta->new_file.path); + break; + case GIT_DELTA_DELETED: + assert(delta->old_file.path); + break; + default: + assert(delta->old_file.path && delta->new_file.path); + break; + } +*/ +} + +static int apply_splits_and_deletes( + git_diff_list *diff, size_t expected_size, bool actually_split) { git_vector onto = GIT_VECTOR_INIT; size_t i; - git_diff_delta *delta; + git_diff_delta *delta, *deleted; if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) return -1; @@ -322,14 +349,26 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) continue; if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) { - git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool); - if (!deleted) + + /* just leave delta flagged with score if not actually splitting */ + if (!actually_split) { + delta->flags = (delta->flags & ~GIT_DIFF_FLAG__TO_SPLIT); + if (delta->status != GIT_DELTA_MODIFIED) + delta->similarity = 0; + continue; + } + + delta->similarity = 0; + + /* make new record for DELETED side of split */ + if (!(deleted = diff_delta__dup(delta, &diff->pool))) goto on_error; deleted->status = GIT_DELTA_DELETED; memset(&deleted->new_file, 0, sizeof(deleted->new_file)); deleted->new_file.path = deleted->old_file.path; deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; + validate_delta(deleted); if (git_vector_insert(&onto, deleted) < 0) goto on_error; @@ -338,6 +377,7 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) memset(&delta->old_file, 0, sizeof(delta->old_file)); delta->old_file.path = delta->new_file.path; delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; + 
validate_delta(delta); } if (git_vector_insert(&onto, delta) < 0) @@ -350,7 +390,6 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) git__free(delta); /* swap new delta list into place */ - git_vector_sort(&onto); git_vector_swap(&diff->deltas, &onto); git_vector_free(&onto); @@ -359,7 +398,6 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) on_error: git_vector_foreach(&onto, i, delta) git__free(delta); - git_vector_free(&onto); return -1; @@ -379,7 +417,7 @@ static int similarity_calc( { int error = 0; git_diff_file *file = similarity_get_file(diff, file_idx); - git_iterator_type_t src = (file_idx & 1) ? diff->old_src : diff->new_src; + git_iterator_type_t src = (file_idx & 1) ? diff->new_src : diff->old_src; if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */ git_buf path = GIT_BUF_INIT; @@ -455,8 +493,8 @@ static int similarity_measure( return -1; /* clip score */ - if (score < 0) - score = 0; + if (score < 1) + score = 1; /* zero means uncomparable, so use 1 for least similar */ else if (score > 100) score = 100; @@ -465,36 +503,50 @@ static int similarity_measure( #define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) +typedef struct { + uint32_t idx; + uint32_t similarity; +} diff_find_match; + int git_diff_find_similar( git_diff_list *diff, git_diff_find_options *given_opts) { - size_t i, j, cache_size, *matches; + size_t i, j, cache_size; int error = 0, similarity; git_diff_delta *from, *to; git_diff_find_options opts; - size_t tried_targets, num_rewrites = 0; - void **cache; + size_t num_rewrites = 0, num_updates = 0; + void **cache; /* cache of similarity metric file signatures */ + diff_find_match *matches; /* cache of best matches */ if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) return error; /* TODO: maybe abort if deltas.length > target_limit ??? 
*/ + if (!git__is_uint32(diff->deltas.length)) + return 0; cache_size = diff->deltas.length * 2; /* must store b/c length may change */ cache = git__calloc(cache_size, sizeof(void *)); GITERR_CHECK_ALLOC(cache); - matches = git__calloc(diff->deltas.length, sizeof(size_t)); + matches = git__calloc(diff->deltas.length, sizeof(diff_find_match)); GITERR_CHECK_ALLOC(matches); - /* first break MODIFIED records that are too different (if requested) */ + /* first mark MODIFIED deltas to split if too different (if requested) */ - if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) { + if (FLAG_SET(opts, GIT_DIFF_FIND_REWRITES)) { git_vector_foreach(&diff->deltas, i, from) { if (from->status != GIT_DELTA_MODIFIED) continue; + /* skip things that aren't plain blobs */ + if (GIT_MODE_TYPE(from->old_file.mode) != + GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) + continue; + + /* measure similarity from old_file to new_file */ similarity = similarity_measure( diff, &opts, cache, 2 * i, 2 * i + 1); @@ -503,7 +555,9 @@ int git_diff_find_similar( goto cleanup; } - if ((unsigned int)similarity < opts.break_rewrite_threshold) { + if (similarity > 0 && + similarity < (int)opts.break_rewrite_threshold) { + from->similarity = (uint32_t)similarity; from->flags |= GIT_DIFF_FLAG__TO_SPLIT; num_rewrites++; } @@ -513,9 +567,12 @@ int git_diff_find_similar( /* next find the most similar delta for each rename / copy candidate */ git_vector_foreach(&diff->deltas, i, from) { - tried_targets = 0; + size_t tried_targets = 0; - /* skip things that aren't blobs */ + matches[i].idx = i; + matches[i].similarity = 0; + + /* skip things that aren't plain blobs */ if (GIT_MODE_TYPE(from->old_file.mode) != GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) continue; @@ -525,7 +582,13 @@ int git_diff_find_similar( !FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) continue; - /* skip all but DELETED files unless copy detection is on */ + /* don't check UNTRACKED files as source unless given option */ + if ((from->status == 
GIT_DELTA_UNTRACKED || + from->status == GIT_DELTA_IGNORED) && + !FLAG_SET(opts, GIT_DIFF_FIND_FROM_UNTRACKED)) + continue; + + /* only use DELETED (or split MODIFIED) unless copy detection on */ if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES) && from->status != GIT_DELTA_DELETED && (from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) @@ -540,9 +603,11 @@ int git_diff_find_similar( GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) continue; + /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as + * targets; maybe include UNTRACKED and IGNORED if requested. + */ switch (to->status) { case GIT_DELTA_ADDED: - case GIT_DELTA_UNTRACKED: case GIT_DELTA_RENAMED: case GIT_DELTA_COPIED: break; @@ -550,18 +615,21 @@ int git_diff_find_similar( if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) continue; break; + case GIT_DELTA_UNTRACKED: + case GIT_DELTA_IGNORED: + if (!FLAG_SET(opts, GIT_DIFF_FIND_FROM_UNTRACKED)) + continue; + break; default: - /* only the above status values should be checked */ + /* all other status values will be skipped */ continue; } - /* cap on maximum files we'll examine (per "from" file) */ + /* cap on maximum targets we'll examine (per "from" file) */ if (++tried_targets > opts.target_limit) break; - /* calculate similarity and see if this pair beats the - * similarity score of the current best pair. 
- */ + /* calculate similarity for this pair and find best match */ similarity = similarity_measure( diff, &opts, cache, 2 * i, 2 * j + 1); @@ -570,112 +638,133 @@ int git_diff_find_similar( goto cleanup; } - if (to->similarity < (unsigned int)similarity) { - to->similarity = (unsigned int)similarity; - matches[j] = i + 1; + if (matches[i].similarity < (uint32_t)similarity) { + matches[i].similarity = (uint32_t)similarity; + matches[i].idx = j; } } } /* next rewrite the diffs with renames / copies */ - git_vector_foreach(&diff->deltas, j, to) { - if (!matches[j]) { - assert(to->similarity == 0); + git_vector_foreach(&diff->deltas, i, from) { + if (!matches[i].similarity) continue; - } - i = matches[j] - 1; - from = GIT_VECTOR_GET(&diff->deltas, i); - assert(from); + to = GIT_VECTOR_GET(&diff->deltas, matches[i].idx); + assert(to); - /* four possible outcomes here: - * 1. old DELETED and if over rename threshold, - * new becomes RENAMED and old goes away - * 2. old SPLIT and if over rename threshold, - * new becomes RENAMED and old becomes ADDED (clear SPLIT) - * 3. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and - * old is more similar to new than it is to itself, in which - * case, new becomes RENAMED and old becomed ADDED - * 4. otherwise if over copy threshold, new becomes COPIED + similarity = (int)matches[i].similarity; + + /* + * Four possible outcomes here: */ + /* 1. 
DELETED "from" with match over rename threshold becomes + * RENAMED "from" record (and "to" record goes away) + */ if (from->status == GIT_DELTA_DELETED) { - if (to->similarity < opts.rename_threshold) { - to->similarity = 0; + if (similarity < (int)opts.rename_threshold) continue; - } - to->status = GIT_DELTA_RENAMED; - memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + to->flags |= GIT_DIFF_FLAG__TO_DELETE; + + from->status = GIT_DELTA_RENAMED; + from->similarity = (uint32_t)similarity; + memcpy(&from->new_file, &to->new_file, sizeof(to->new_file)); + validate_delta(from); - from->flags |= GIT_DIFF_FLAG__TO_DELETE; num_rewrites++; - continue; } + /* 2. SPLIT MODIFIED "from" with match over rename threshold becomes + * ADDED "from" record (with no SPLIT) and RENAMED "to" record + */ if (from->status == GIT_DELTA_MODIFIED && - (from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) - { - if (to->similarity < opts.rename_threshold) { - to->similarity = 0; + (from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) { + + if (similarity < (int)opts.rename_threshold) continue; - } to->status = GIT_DELTA_RENAMED; + to->similarity = (uint32_t)similarity; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + validate_delta(to); from->status = GIT_DELTA_ADDED; from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; + from->similarity = 0; /* reset self-similarity */ memset(&from->old_file, 0, sizeof(from->old_file)); - num_rewrites--; + from->old_file.path = from->new_file.path; + validate_delta(from); + num_rewrites--; + num_updates++; continue; } + /* 3. MODIFIED "from" with FIND_RENAMES_FROM_REWRITES with similar + * "to" and self-similarity below rename_from_rewrite_threshold + * becomes newly ADDED "from" and RENAMED "to". 
+ */ if (from->status == GIT_DELTA_MODIFIED && FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && - to->similarity > opts.rename_threshold) + similarity > (int)opts.rename_threshold) { - similarity = similarity_measure( + int self_similarity = similarity_measure( diff, &opts, cache, 2 * i, 2 * i + 1); - - if (similarity < 0) { - error = similarity; + if (self_similarity < 0) { + error = self_similarity; goto cleanup; } - if ((unsigned int)similarity < opts.rename_from_rewrite_threshold) { + if (self_similarity < (int)opts.rename_from_rewrite_threshold) { to->status = GIT_DELTA_RENAMED; + to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */ + to->similarity = (uint32_t)similarity; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + validate_delta(to); from->status = GIT_DELTA_ADDED; + from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */ + from->similarity = 0; memset(&from->old_file, 0, sizeof(from->old_file)); - from->old_file.path = to->old_file.path; + from->old_file.path = from->new_file.path; from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; + validate_delta(from); + num_updates++; continue; } } - if (to->similarity < opts.copy_threshold) { - to->similarity = 0; + /* 4. 
if "from" -> "to" over copy threshold, "to" becomes COPIED */ + if (similarity < (int)opts.copy_threshold) continue; - } /* convert "to" to a COPIED record */ to->status = GIT_DELTA_COPIED; + to->similarity = (uint32_t)similarity; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + validate_delta(to); + + validate_delta(from); + + num_updates++; } if (num_rewrites > 0) { assert(num_rewrites < diff->deltas.length); error = apply_splits_and_deletes( - diff, diff->deltas.length - num_rewrites); + diff, diff->deltas.length - num_rewrites, + FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES)); } + if (num_rewrites > 0 || num_updates > 0) + git_vector_sort(&diff->deltas); + cleanup: git__free(matches); diff --git a/src/util.h b/src/util.h index 6f876d012..5ae87ac10 100644 --- a/src/util.h +++ b/src/util.h @@ -109,6 +109,13 @@ GIT_INLINE(int) git__is_sizet(git_off_t p) return p == (git_off_t)r; } +/** @return true if p fits into the range of a uint32_t */ +GIT_INLINE(int) git__is_uint32(size_t p) +{ + uint32_t r = (uint32_t)p; + return p == (size_t)r; +} + /* 32-bit cross-platform rotl */ #ifdef _MSC_VER /* use built-in method in MSVC */ # define git__rotl(v, s) (uint32_t)_rotl(v, s) diff --git a/tests-clar/diff/rename.c b/tests-clar/diff/rename.c index 1349d4013..01f65abfd 100644 --- a/tests-clar/diff/rename.c +++ b/tests-clar/diff/rename.c @@ -377,7 +377,8 @@ void test_diff_rename__handles_small_files(void) */ cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts)); - opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES | GIT_DIFF_FIND_AND_BREAK_REWRITES; + opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES | + GIT_DIFF_FIND_AND_BREAK_REWRITES; cl_git_pass(git_diff_find_similar(diff, &opts)); git_diff_list_free(diff);