From db106d01f093b3e61170e3738d6651a2866cb76e Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Tue, 30 Oct 2012 09:40:50 -0700 Subject: [PATCH] Move rename detection into new file This improves the naming for the rename related functionality moving it to be called `git_diff_find_similar()` and renaming all the associated constants, etc. to make more sense. I also moved the new code (plus the existing `git_diff_merge`) into a new file `diff_tform.c` where I can put new functions related to manipulating git diff lists. This also updates the implementation significantly from the last revision fixing some ordering issues (where break-rewrite needs to be handled prior to copy and rename detection) and improving config option handling. --- include/git2/diff.h | 48 ++-- src/diff.c | 342 ++-------------------------- src/diff.h | 2 + src/diff_tform.c | 466 +++++++++++++++++++++++++++++++++++++++ src/vector.c | 30 +++ src/vector.h | 3 + tests-clar/diff/rename.c | 10 +- 7 files changed, 559 insertions(+), 342 deletions(-) create mode 100644 src/diff_tform.c diff --git a/include/git2/diff.h b/include/git2/diff.h index f9dbb67e0..439215575 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -263,31 +263,41 @@ typedef struct git_diff_patch git_diff_patch; * Flags to control the behavior of diff rename/copy detection. */ typedef enum { - /** should we look for renames */ - GIT_DIFF_DETECT_RENAMES = (1 << 0), - /** should we look for copies */ - GIT_DIFF_DETECT_COPIES = (1 << 1), - /** should we consider unmodified files as possible copy sources */ - GIT_DIFF_DETECT_COPIES_FROM_UNMODIFIED = (1 << 2), - /** should we split large rewrites into delete / add pairs */ - GIT_DIFF_DETECT_BREAK_REWRITES = (1 << 3), -} git_diff_detect_t; + /** look for renames? (`--find-renames`) */ + GIT_DIFF_FIND_RENAMES = (1 << 0), + /** consider old size of modified for renames? (`--break-rewrites=N`) */ + GIT_DIFF_FIND_RENAMES_FROM_REWRITES = (1 << 1), + + /** look for copies? 
(a la `--find-copies`) */ + GIT_DIFF_FIND_COPIES = (1 << 2), + /** consider unmodified as copy sources? (`--find-copies-harder`) */ + GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED = (1 << 3), + + /** split large rewrites into delete/add pairs (`--break-rewrites=/M`) */ + GIT_DIFF_FIND_AND_BREAK_REWRITES = (1 << 4), +} git_diff_find_t; /** * Control behavior of rename and copy detection */ typedef struct { - /** Combination of git_diff_detect_t values */ + /** Combination of git_diff_find_t values (default FIND_RENAMES) */ unsigned int flags; - /** Threshold on similarity index to consider a file renamed. */ + + /** Similarity to consider a file renamed (default 50) */ unsigned int rename_threshold; - /** Threshold on similarity index to consider a file a copy. */ + /** Similarity of modified to be eligible rename source (default 50) */ + unsigned int rename_from_rewrite_threshold; + /** Similarity to consider a file a copy (default 50) */ unsigned int copy_threshold; - /** Threshold on change % to split modify into delete/add pair. */ + /** Similarity to split modify into delete/add pair (default 60) */ unsigned int break_rewrite_threshold; - /** Maximum rename/copy targets to check (diff.renameLimit) */ + + /** Maximum similarity sources to examine (a la diff's `-l` option or + * the `diff.renameLimit` config) (default 200) + */ unsigned int target_limit; -} git_diff_detect_options; +} git_diff_find_options; /** @name Diff List Generator Functions @@ -405,18 +415,20 @@ GIT_EXTERN(int) git_diff_merge( const git_diff_list *from); /** - * Update a diff list with file renames, copies, etc. + * Transform a diff list marking file renames, copies, etc. * * This modifies a diff list in place, replacing old entries that look * like renames or copies with new entries reflecting those changes. + * This also will, if requested, break modified files into add/remove + * pairs if the amount of change is above a threshold. 
* * @param diff Diff list to run detection algorithms on * @param options Control how detection should be run, NULL for defaults * @return 0 on success, -1 on failure */ -GIT_EXTERN(int) git_diff_detect( +GIT_EXTERN(int) git_diff_find_similar( git_diff_list *diff, - git_diff_detect_options *options); + git_diff_find_options *options); /**@}*/ diff --git a/src/diff.c b/src/diff.c index e2649ff3b..55f6ee7d5 100644 --- a/src/diff.c +++ b/src/diff.c @@ -110,85 +110,6 @@ static git_diff_delta *diff_delta__alloc( return delta; } -static git_diff_delta *diff_delta__dup( - const git_diff_delta *d, git_pool *pool) -{ - git_diff_delta *delta = git__malloc(sizeof(git_diff_delta)); - if (!delta) - return NULL; - - memcpy(delta, d, sizeof(git_diff_delta)); - - delta->old_file.path = git_pool_strdup(pool, d->old_file.path); - if (delta->old_file.path == NULL) - goto fail; - - if (d->new_file.path != d->old_file.path) { - delta->new_file.path = git_pool_strdup(pool, d->new_file.path); - if (delta->new_file.path == NULL) - goto fail; - } else { - delta->new_file.path = delta->old_file.path; - } - - return delta; - -fail: - git__free(delta); - return NULL; -} - -static git_diff_delta *diff_delta__merge_like_cgit( - const git_diff_delta *a, const git_diff_delta *b, git_pool *pool) -{ - git_diff_delta *dup; - - /* Emulate C git for merging two diffs (a la 'git diff '). - * - * When C git does a diff between the work dir and a tree, it actually - * diffs with the index but uses the workdir contents. This emulates - * those choices so we can emulate the type of diff. 
- * - * We have three file descriptions here, let's call them: - * f1 = a->old_file - * f2 = a->new_file AND b->old_file - * f3 = b->new_file - */ - - /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */ - if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED) - return diff_delta__dup(a, pool); - - /* otherwise, base this diff on the 'b' diff */ - if ((dup = diff_delta__dup(b, pool)) == NULL) - return NULL; - - /* If 'a' status is uninteresting, then we're done */ - if (a->status == GIT_DELTA_UNMODIFIED) - return dup; - - assert(a->status != GIT_DELTA_UNMODIFIED); - assert(b->status != GIT_DELTA_UNMODIFIED); - - /* A cgit exception is that the diff of a file that is only in the - * index (i.e. not in HEAD nor workdir) is given as empty. - */ - if (dup->status == GIT_DELTA_DELETED) { - if (a->status == GIT_DELTA_ADDED) - dup->status = GIT_DELTA_UNMODIFIED; - /* else don't overwrite DELETE status */ - } else { - dup->status = a->status; - } - - git_oid_cpy(&dup->old_file.oid, &a->old_file.oid); - dup->old_file.mode = a->old_file.mode; - dup->old_file.size = a->old_file.size; - dup->old_file.flags = a->old_file.flags; - - return dup; -} - static int diff_delta__from_one( git_diff_list *diff, git_delta_t status, @@ -332,13 +253,34 @@ static char *diff_strdup_prefix(git_pool *pool, const char *prefix) return git_pool_strndup(pool, prefix, len + 1); } -static int diff_delta__cmp(const void *a, const void *b) +int git_diff_delta__cmp(const void *a, const void *b) { const git_diff_delta *da = a, *db = b; int val = strcmp(da->old_file.path, db->old_file.path); return val ? val : ((int)da->status - (int)db->status); } +bool git_diff_delta__should_skip( + const git_diff_options *opts, const git_diff_delta *delta) +{ + uint32_t flags = opts ? 
opts->flags : 0; + + if (delta->status == GIT_DELTA_UNMODIFIED && + (flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) + return true; + + if (delta->status == GIT_DELTA_IGNORED && + (flags & GIT_DIFF_INCLUDE_IGNORED) == 0) + return true; + + if (delta->status == GIT_DELTA_UNTRACKED && + (flags & GIT_DIFF_INCLUDE_UNTRACKED) == 0) + return true; + + return false; +} + + static int config_bool(git_config *cfg, const char *name, int defvalue) { int val = defvalue; @@ -361,7 +303,7 @@ static git_diff_list *git_diff_list_alloc( GIT_REFCOUNT_INC(diff); diff->repo = repo; - if (git_vector_init(&diff->deltas, 0, diff_delta__cmp) < 0 || + if (git_vector_init(&diff->deltas, 0, git_diff_delta__cmp) < 0 || git_pool_init(&diff->pool, 1, 0) < 0) goto fail; @@ -991,241 +933,3 @@ on_error: git_iterator_free(a); return error; } - - -bool git_diff_delta__should_skip( - const git_diff_options *opts, const git_diff_delta *delta) -{ - uint32_t flags = opts ? opts->flags : 0; - - if (delta->status == GIT_DELTA_UNMODIFIED && - (flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) - return true; - - if (delta->status == GIT_DELTA_IGNORED && - (flags & GIT_DIFF_INCLUDE_IGNORED) == 0) - return true; - - if (delta->status == GIT_DELTA_UNTRACKED && - (flags & GIT_DIFF_INCLUDE_UNTRACKED) == 0) - return true; - - return false; -} - - -int git_diff_merge( - git_diff_list *onto, - const git_diff_list *from) -{ - int error = 0; - git_pool onto_pool; - git_vector onto_new; - git_diff_delta *delta; - bool ignore_case = false; - unsigned int i, j; - - assert(onto && from); - - if (!from->deltas.length) - return 0; - - if (git_vector_init(&onto_new, onto->deltas.length, diff_delta__cmp) < 0 || - git_pool_init(&onto_pool, 1, 0) < 0) - return -1; - - if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 || - (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0) - { - ignore_case = true; - - /* This function currently only supports merging diff lists that - * are sorted identically. 
*/ - assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 && - (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0); - } - - for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) { - git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i); - const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j); - int cmp = !f ? -1 : !o ? 1 : STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path); - - if (cmp < 0) { - delta = diff_delta__dup(o, &onto_pool); - i++; - } else if (cmp > 0) { - delta = diff_delta__dup(f, &onto_pool); - j++; - } else { - delta = diff_delta__merge_like_cgit(o, f, &onto_pool); - i++; - j++; - } - - /* the ignore rules for the target may not match the source - * or the result of a merged delta could be skippable... - */ - if (git_diff_delta__should_skip(&onto->opts, delta)) { - git__free(delta); - continue; - } - - if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0) - break; - } - - if (!error) { - git_vector_swap(&onto->deltas, &onto_new); - git_pool_swap(&onto->pool, &onto_pool); - onto->new_src = from->new_src; - - /* prefix strings also come from old pool, so recreate those.*/ - onto->opts.old_prefix = - git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix); - onto->opts.new_prefix = - git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix); - } - - git_vector_foreach(&onto_new, i, delta) - git__free(delta); - git_vector_free(&onto_new); - git_pool_clear(&onto_pool); - - return error; -} - -#define DEFAULT_THRESHOLD 50 -#define DEFAULT_TARGET_LIMIT 200 - -int git_diff_detect( - git_diff_list *diff, - git_diff_detect_options *opts) -{ - int error = 0; - unsigned int i, j; - git_diff_delta *from, *to; - bool check_unmodified = opts && - (opts->flags & GIT_DIFF_DETECT_COPIES_FROM_UNMODIFIED) != 0; - int max_targets = (opts && opts->target_limit > 0) ? - opts->target_limit : DEFAULT_TARGET_LIMIT; - unsigned int rename_threshold = (opts && opts->rename_threshold > 0) ? 
- opts->rename_threshold : DEFAULT_THRESHOLD; - unsigned int copy_threshold = (opts && opts->copy_threshold > 0) ? - opts->copy_threshold : DEFAULT_THRESHOLD; - int num_deletes = 0, num_splits = 0; - - /* TODO: update opts from config diff.renameLimit / diff.renames */ - - git_vector_foreach(&diff->deltas, i, from) { - int tried_targets = 0; - - git_vector_foreach(&diff->deltas, j, to) { - unsigned int similarity = 0; - - if (i == j) - continue; - - switch (to->status) { - case GIT_DELTA_ADDED: - case GIT_DELTA_UNTRACKED: - case GIT_DELTA_RENAMED: - case GIT_DELTA_COPIED: - break; - default: - /* only those status values should be checked */ - continue; - } - - /* don't check UNMODIFIED files as source unless given option */ - if (from->status == GIT_DELTA_UNMODIFIED && !check_unmodified) - continue; - - /* cap on maximum files we'll examine */ - if (++tried_targets > max_targets) - break; - - /* calculate similarity and see if this pair beats the - * similarity score of the current best pair. 
- */ - if (git_oid_cmp(&from->old_file.oid, &to->new_file.oid) == 0) - similarity = 100; - /* TODO: insert actual similarity algo here */ - - if (similarity <= to->similarity) - continue; - - if (from->status == GIT_DELTA_DELETED) { - if (similarity < rename_threshold) - continue; - - /* merge "from" & "to" to a RENAMED record */ - to->status = GIT_DELTA_RENAMED; - memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); - - from->status = GIT_DELTA__TO_DELETE; - num_deletes++; - } else { - if (similarity < copy_threshold) - continue; - - /* convert "to" to a COPIED record */ - to->status = GIT_DELTA_COPIED; - memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); - } - } - - if (from->status == GIT_DELTA_MODIFIED && - opts && (opts->flags & GIT_DIFF_DETECT_BREAK_REWRITES) != 0) - { - /* TODO: calculate similarity and maybe mark for split */ - - /* from->status = GIT_DELTA__TO_SPLIT; */ - /* num_splits++; */ - } - } - - if (num_deletes > 0 || num_splits > 0) { - git_vector onto = GIT_VECTOR_INIT; - size_t new_size = diff->deltas.length + num_splits - num_deletes; - - if (git_vector_init(&onto, new_size, diff_delta__cmp) < 0) - return -1; - - /* build new delta list without TO_DELETE and splitting TO_SPLIT */ - git_vector_foreach(&diff->deltas, i, from) { - if (from->status == GIT_DELTA__TO_DELETE) { - git__free(from); - continue; - } - - if (from->status == GIT_DELTA__TO_SPLIT) { - git_diff_delta *deleted = diff_delta__dup(from, &diff->pool); - if (!deleted) - return -1; - - deleted->status = GIT_DELTA_DELETED; - memset(&deleted->new_file, 0, sizeof(deleted->new_file)); - deleted->new_file.path = deleted->old_file.path; - deleted->new_file.flags |= GIT_DIFF_FILE_VALID_OID; - - git_vector_insert(&onto, deleted); - - from->status = GIT_DELTA_ADDED; - memset(&from->old_file, 0, sizeof(from->old_file)); - from->old_file.path = from->new_file.path; - from->old_file.flags |= GIT_DIFF_FILE_VALID_OID; - } - - git_vector_insert(&onto, from); - } - - /* swap new 
delta list into place */ - - git_vector_sort(&onto); - git_vector_swap(&diff->deltas, &onto); - git_vector_free(&onto); - } - - return error; -} - diff --git a/src/diff.h b/src/diff.h index 61723bc9e..ed66439bf 100644 --- a/src/diff.h +++ b/src/diff.h @@ -48,6 +48,8 @@ extern void git_diff__cleanup_modes( extern void git_diff_list_addref(git_diff_list *diff); +extern int git_diff_delta__cmp(const void *a, const void *b); + extern bool git_diff_delta__should_skip( const git_diff_options *opts, const git_diff_delta *delta); diff --git a/src/diff_tform.c b/src/diff_tform.c new file mode 100644 index 000000000..987d4b8e6 --- /dev/null +++ b/src/diff_tform.c @@ -0,0 +1,466 @@ +/* + * Copyright (C) 2012 the libgit2 contributors + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#include "common.h" +#include "diff.h" +#include "git2/config.h" + +static git_diff_delta *diff_delta__dup( + const git_diff_delta *d, git_pool *pool) +{ + git_diff_delta *delta = git__malloc(sizeof(git_diff_delta)); + if (!delta) + return NULL; + + memcpy(delta, d, sizeof(git_diff_delta)); + + delta->old_file.path = git_pool_strdup(pool, d->old_file.path); + if (delta->old_file.path == NULL) + goto fail; + + if (d->new_file.path != d->old_file.path) { + delta->new_file.path = git_pool_strdup(pool, d->new_file.path); + if (delta->new_file.path == NULL) + goto fail; + } else { + delta->new_file.path = delta->old_file.path; + } + + return delta; + +fail: + git__free(delta); + return NULL; +} + +static git_diff_delta *diff_delta__merge_like_cgit( + const git_diff_delta *a, const git_diff_delta *b, git_pool *pool) +{ + git_diff_delta *dup; + + /* Emulate C git for merging two diffs (a la 'git diff '). + * + * When C git does a diff between the work dir and a tree, it actually + * diffs with the index but uses the workdir contents. This emulates + * those choices so we can emulate the type of diff. 
+ * + * We have three file descriptions here, let's call them: + * f1 = a->old_file + * f2 = a->new_file AND b->old_file + * f3 = b->new_file + */ + + /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */ + if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED) + return diff_delta__dup(a, pool); + + /* otherwise, base this diff on the 'b' diff */ + if ((dup = diff_delta__dup(b, pool)) == NULL) + return NULL; + + /* If 'a' status is uninteresting, then we're done */ + if (a->status == GIT_DELTA_UNMODIFIED) + return dup; + + assert(a->status != GIT_DELTA_UNMODIFIED); + assert(b->status != GIT_DELTA_UNMODIFIED); + + /* A cgit exception is that the diff of a file that is only in the + * index (i.e. not in HEAD nor workdir) is given as empty. + */ + if (dup->status == GIT_DELTA_DELETED) { + if (a->status == GIT_DELTA_ADDED) + dup->status = GIT_DELTA_UNMODIFIED; + /* else don't overwrite DELETE status */ + } else { + dup->status = a->status; + } + + git_oid_cpy(&dup->old_file.oid, &a->old_file.oid); + dup->old_file.mode = a->old_file.mode; + dup->old_file.size = a->old_file.size; + dup->old_file.flags = a->old_file.flags; + + return dup; +} + +int git_diff_merge( + git_diff_list *onto, + const git_diff_list *from) +{ + int error = 0; + git_pool onto_pool; + git_vector onto_new; + git_diff_delta *delta; + bool ignore_case = false; + unsigned int i, j; + + assert(onto && from); + + if (!from->deltas.length) + return 0; + + if (git_vector_init( + &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 || + git_pool_init(&onto_pool, 1, 0) < 0) + return -1; + + if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 || + (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0) + { + ignore_case = true; + + /* This function currently only supports merging diff lists that + * are sorted identically. 
*/ + assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 && + (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0); + } + + for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) { + git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i); + const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j); + int cmp = !f ? -1 : !o ? 1 : STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path); + + if (cmp < 0) { + delta = diff_delta__dup(o, &onto_pool); + i++; + } else if (cmp > 0) { + delta = diff_delta__dup(f, &onto_pool); + j++; + } else { + delta = diff_delta__merge_like_cgit(o, f, &onto_pool); + i++; + j++; + } + + /* the ignore rules for the target may not match the source + * or the result of a merged delta could be skippable... + */ + if (git_diff_delta__should_skip(&onto->opts, delta)) { + git__free(delta); + continue; + } + + if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0) + break; + } + + if (!error) { + git_vector_swap(&onto->deltas, &onto_new); + git_pool_swap(&onto->pool, &onto_pool); + onto->new_src = from->new_src; + + /* prefix strings also come from old pool, so recreate those.*/ + onto->opts.old_prefix = + git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix); + onto->opts.new_prefix = + git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix); + } + + git_vector_foreach(&onto_new, i, delta) + git__free(delta); + git_vector_free(&onto_new); + git_pool_clear(&onto_pool); + + return error; +} + +#define DEFAULT_THRESHOLD 50 +#define DEFAULT_BREAK_REWRITE_THRESHOLD 60 +#define DEFAULT_TARGET_LIMIT 200 + +static int normalize_find_opts( + git_diff_list *diff, + git_diff_find_options *opts, + git_diff_find_options *given) +{ + git_config *cfg = NULL; + const char *val; + + if (diff->repo != NULL && + git_repository_config__weakptr(&cfg, diff->repo) < 0) + return -1; + + if (given != NULL) + memcpy(opts, given, sizeof(*opts)); + else { + memset(opts, 0, sizeof(*opts)); + + opts->flags = 
GIT_DIFF_FIND_RENAMES; + + if (git_config_get_string(&val, cfg, "diff.renames") < 0) + giterr_clear(); + else if (val && + (!strcasecmp(val, "copies") || !strcasecmp(val, "copy"))) + opts->flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES; + } + + /* some flags imply others */ + + if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES) + opts->flags |= GIT_DIFF_FIND_RENAMES; + + if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED) + opts->flags |= GIT_DIFF_FIND_COPIES; + +#define USE_DEFAULT(X) ((X) == 0 || (X) > 100) + + if (USE_DEFAULT(opts->rename_threshold)) + opts->rename_threshold = DEFAULT_THRESHOLD; + + if (USE_DEFAULT(opts->rename_from_rewrite_threshold)) + opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD; + + if (USE_DEFAULT(opts->copy_threshold)) + opts->copy_threshold = DEFAULT_THRESHOLD; + + if (USE_DEFAULT(opts->break_rewrite_threshold)) + opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD; + +#undef USE_DEFAULT + + if (!opts->target_limit) { + int32_t limit = 0; + + opts->target_limit = DEFAULT_TARGET_LIMIT; + + if (git_config_get_int32(&limit, cfg, "diff.renameLimit") < 0) + giterr_clear(); + else if (limit > 0) + opts->target_limit = limit; + } + + return 0; +} + +static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) +{ + git_vector onto = GIT_VECTOR_INIT; + size_t i; + git_diff_delta *delta; + + if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) + return -1; + + /* build new delta list without TO_DELETE and splitting TO_SPLIT */ + git_vector_foreach(&diff->deltas, i, delta) { + if (delta->status == GIT_DELTA__TO_DELETE) { + git__free(delta); + continue; + } + + if (delta->status == GIT_DELTA__TO_SPLIT) { + git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool); + if (!deleted) + return -1; + + deleted->status = GIT_DELTA_DELETED; + memset(&deleted->new_file, 0, sizeof(deleted->new_file)); + deleted->new_file.path = deleted->old_file.path; + deleted->new_file.flags |= 
GIT_DIFF_FILE_VALID_OID; + + git_vector_insert(&onto, deleted); + + delta->status = GIT_DELTA_ADDED; + memset(&delta->old_file, 0, sizeof(delta->old_file)); + delta->old_file.path = delta->new_file.path; + delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; + } + + git_vector_insert(&onto, delta); + } + + /* swap new delta list into place */ + git_vector_sort(&onto); + git_vector_swap(&diff->deltas, &onto); + git_vector_free(&onto); + + return 0; +} + +static unsigned int calc_similarity( + void *cache, git_diff_file *old_file, git_diff_file *new_file) +{ + GIT_UNUSED(cache); + + if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0) + return 100; + + /* TODO: insert actual similarity algo here */ + + return 0; +} + +#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) + +int git_diff_find_similar( + git_diff_list *diff, + git_diff_find_options *given_opts) +{ + unsigned int i, j, similarity; + git_diff_delta *from, *to; + git_diff_find_options opts; + unsigned int tried_targets, num_changes = 0; + git_vector matches = GIT_VECTOR_INIT; + + if (normalize_find_opts(diff, &opts, given_opts) < 0) + return -1; + + /* first do splits if requested */ + + if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) { + git_vector_foreach(&diff->deltas, i, from) { + if (from->status != GIT_DELTA_MODIFIED) + continue; + + /* Right now, this doesn't work right because the similarity + * algorithm isn't actually implemented... 
+ */ + similarity = 100; + /* calc_similarity(NULL, &from->old_file, from->new_file); */ + + if (similarity < opts.break_rewrite_threshold) { + from->status = GIT_DELTA__TO_SPLIT; + num_changes++; + } + } + + /* apply splits as needed */ + if (num_changes > 0 && + apply_splits_and_deletes( + diff, diff->deltas.length + num_changes) < 0) + return -1; + } + + /* next find the most similar delta for each rename / copy candidate */ + + if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0) + return -1; + + git_vector_foreach(&diff->deltas, i, from) { + tried_targets = 0; + + git_vector_foreach(&diff->deltas, j, to) { + if (i == j) + continue; + + switch (to->status) { + case GIT_DELTA_ADDED: + case GIT_DELTA_UNTRACKED: + case GIT_DELTA_RENAMED: + case GIT_DELTA_COPIED: + break; + default: + /* only the above status values should be checked */ + continue; + } + + /* skip all but DELETED files unless copy detection is on */ + if (from->status != GIT_DELTA_DELETED && + !FLAG_SET(opts, GIT_DIFF_FIND_COPIES)) + continue; + + /* don't check UNMODIFIED files as source unless given option */ + if (from->status == GIT_DELTA_UNMODIFIED && + !FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) + continue; + + /* cap on maximum files we'll examine */ + if (++tried_targets > opts.target_limit) + break; + + /* calculate similarity and see if this pair beats the + * similarity score of the current best pair. + */ + similarity = calc_similarity(NULL, &from->old_file, &to->new_file); + + if (to->similarity < similarity) { + to->similarity = similarity; + if (git_vector_set(NULL, &matches, j, from) < 0) + return -1; + } + } + } + + /* next rewrite the diffs with renames / copies */ + + num_changes = 0; + + git_vector_foreach(&diff->deltas, j, to) { + from = GIT_VECTOR_GET(&matches, j); + if (!from) { + assert(to->similarity == 0); + continue; + } + + /* three possible outcomes here: + * 1. 
old DELETED and if over rename threshold, + * new becomes RENAMED and old goes away + * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and + * old is more similar to new than it is to itself, in which + * case, new becomes RENAMED and old becomes ADDED + * 3. otherwise if over copy threshold, new becomes COPIED + */ + + if (from->status == GIT_DELTA_DELETED) { + if (to->similarity < opts.rename_threshold) { + to->similarity = 0; + continue; + } + + to->status = GIT_DELTA_RENAMED; + memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + + from->status = GIT_DELTA__TO_DELETE; + num_changes++; + + continue; + } + + if (from->status == GIT_DELTA_MODIFIED && + FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && + to->similarity > opts.rename_threshold) + { + similarity = 100; + /* calc_similarity(NULL, &from->old_file, from->new_file); */ + + if (similarity < opts.rename_from_rewrite_threshold) { + to->status = GIT_DELTA_RENAMED; + memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + + from->status = GIT_DELTA_ADDED; + memset(&from->old_file, 0, sizeof(from->old_file)); + from->old_file.path = to->old_file.path; + from->old_file.flags |= GIT_DIFF_FILE_VALID_OID; + + continue; + } + } + + if (to->similarity < opts.copy_threshold) { + to->similarity = 0; + continue; + } + + /* convert "to" to a COPIED record */ + to->status = GIT_DELTA_COPIED; + memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + } + + git_vector_free(&matches); + + if (num_changes > 0) { + assert(num_changes < diff->deltas.length); + + if (apply_splits_and_deletes( + diff, diff->deltas.length - num_changes) < 0) + return -1; + } + + return 0; +} + +#undef FLAG_SET diff --git a/src/vector.c b/src/vector.c index c6a644cc3..e56b97849 100644 --- a/src/vector.c +++ b/src/vector.c @@ -241,3 +241,33 @@ void git_vector_swap(git_vector *a, git_vector *b) memcpy(a, b, sizeof(t)); memcpy(b, &t, sizeof(t)); } + +int git_vector_resize_to(git_vector *v, size_t new_length) 
+{ + if (new_length <= v->length) + return 0; + + while (new_length >= v->_alloc_size) + if (resize_vector(v) < 0) + return -1; + + memset(&v->contents[v->length], 0, + sizeof(void *) * (new_length - v->length)); + + v->length = new_length; + + return 0; +} + +int git_vector_set(void **old, git_vector *v, size_t position, void *value) +{ + if (git_vector_resize_to(v, position + 1) < 0) + return -1; + + if (old != NULL) + *old = v->contents[position]; + + v->contents[position] = value; + + return 0; +} diff --git a/src/vector.h b/src/vector.h index 49ba754f0..8886371e8 100644 --- a/src/vector.h +++ b/src/vector.h @@ -76,4 +76,7 @@ int git_vector_remove(git_vector *v, unsigned int idx); void git_vector_pop(git_vector *v); void git_vector_uniq(git_vector *v); +int git_vector_resize_to(git_vector *v, size_t new_length); +int git_vector_set(void **old, git_vector *v, size_t position, void *value); + #endif diff --git a/tests-clar/diff/rename.c b/tests-clar/diff/rename.c index 8a50fd5ea..0ee1db842 100644 --- a/tests-clar/diff/rename.c +++ b/tests-clar/diff/rename.c @@ -34,14 +34,14 @@ void test_diff_rename__match_oid(void) git_tree *old_tree, *new_tree; git_diff_list *diff; git_diff_options diffopts = {0}; - git_diff_detect_options opts; + git_diff_find_options opts; diff_expects exp; old_tree = resolve_commit_oid_to_tree(g_repo, old_sha); new_tree = resolve_commit_oid_to_tree(g_repo, new_sha); /* Must pass GIT_DIFF_INCLUDE_UNMODIFIED if you expect to emulate - * --find-copies-harder during rename detection... + * --find-copies-harder during rename transformation... 
*/ memset(&diffopts, 0, sizeof(diffopts)); diffopts.flags |= GIT_DIFF_INCLUDE_UNMODIFIED; @@ -65,7 +65,7 @@ void test_diff_rename__match_oid(void) /* git diff 31e47d8c1fa36d7f8d537b96158e3f024de0a9f2 \ * 2bc7f351d20b53f1c72c16c4b036e491c478c49a */ - cl_git_pass(git_diff_detect(diff, NULL)); + cl_git_pass(git_diff_find_similar(diff, NULL)); memset(&exp, 0, sizeof(exp)); cl_git_pass(git_diff_foreach( @@ -86,8 +86,8 @@ void test_diff_rename__match_oid(void) * 2bc7f351d20b53f1c72c16c4b036e491c478c49a */ memset(&opts, 0, sizeof(opts)); - opts.flags = GIT_DIFF_DETECT_COPIES_FROM_UNMODIFIED; - cl_git_pass(git_diff_detect(diff, &opts)); + opts.flags = GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED; + cl_git_pass(git_diff_find_similar(diff, &opts)); memset(&exp, 0, sizeof(exp)); cl_git_pass(git_diff_foreach(