diff --git a/include/git2/diff.h b/include/git2/diff.h index 7f165041a..379f4694e 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -379,7 +379,7 @@ typedef struct git_diff_patch git_diff_patch; typedef enum { /** look for renames? (`--find-renames`) */ GIT_DIFF_FIND_RENAMES = (1 << 0), - /** consider old size of modified for renames? (`--break-rewrites=N`) */ + /** consider old side of modified for renames? (`--break-rewrites=N`) */ GIT_DIFF_FIND_RENAMES_FROM_REWRITES = (1 << 1), /** look for copies? (a la `--find-copies`) */ @@ -897,11 +897,12 @@ GIT_EXTERN(int) git_diff_patch_to_str( * * NULL is allowed for either `old_blob` or `new_blob` and will be treated * as an empty blob, with the `oid` set to NULL in the `git_diff_file` data. + * Passing NULL for both blobs is a noop; no callbacks will be made at all. * - * We do run a binary content check on the two blobs and if either of the - * blobs looks like binary data, the `git_diff_delta` binary attribute will - * be set to 1 and no call to the hunk_cb nor line_cb will be made (unless - * you pass `GIT_DIFF_FORCE_TEXT` of course). + * We do run a binary content check on the blob content and if either blob + * looks like binary data, the `git_diff_delta` binary attribute will be set + * to 1 and no call to the hunk_cb nor line_cb will be made (unless you pass + * `GIT_DIFF_FORCE_TEXT` of course). * * @return 0 on success, GIT_EUSER on non-zero callback, or error code */ @@ -921,6 +922,11 @@ GIT_EXTERN(int) git_diff_blobs( * so the `git_diff_file` parameters to the callbacks will be faked a la the * rules for `git_diff_blobs()`. * + * Passing NULL for `old_blob` will be treated as an empty blob (i.e. the + * `file_cb` will be invoked with GIT_DELTA_ADDED and the diff will be the + * entire content of the buffer added). Passing NULL to the buffer will do + * the reverse, with GIT_DELTA_REMOVED and blob content removed. + * * @return 0 on success, GIT_EUSER on non-zero callback, or error code */ GIT_EXTERN(int) git_diff_blob_to_buffer( diff --git a/src/diff_output.c b/src/diff_output.c index 5e9d5fe76..13434beb9 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -456,7 +456,7 @@ static void release_content(git_diff_file *file, git_map *map, git_blob *blob) } -static void diff_context_init( +static int diff_context_init( diff_context *ctxt, git_diff_list *diff, git_repository *repo, @@ -468,6 +468,12 @@ static void diff_context_init( { memset(ctxt, 0, sizeof(diff_context)); + if (!repo && diff) + repo = diff->repo; + + if (!opts && diff) + opts = &diff->opts; + ctxt->repo = repo; ctxt->diff = diff; ctxt->opts = opts; @@ -478,6 +484,8 @@ static void diff_context_init( ctxt->error = 0; setup_xdiff_options(ctxt->opts, &ctxt->xdiff_config, &ctxt->xdiff_params); + + return 0; } static int diff_delta_file_callback( @@ -922,6 +930,15 @@ static int diff_patch_line_cb( return 0; } +static int diff_required(git_diff_list *diff, const char *action) +{ + if (!diff) { + giterr_set(GITERR_INVALID, "Must provide valid diff to %s", action); + return -1; + } + + return 0; +} int git_diff_foreach( git_diff_list *diff, @@ -935,9 +952,12 @@ int git_diff_foreach( size_t idx; git_diff_patch patch; - diff_context_init( - &ctxt, diff, diff->repo, &diff->opts, - file_cb, hunk_cb, data_cb, payload); + if (diff_required(diff, "git_diff_foreach") < 0) + return -1; + + if (diff_context_init( + &ctxt, diff, NULL, NULL, file_cb, hunk_cb, data_cb, payload) < 0) + return -1; diff_patch_init(&ctxt, &patch); @@ -1306,8 +1326,10 @@ static int diff_single_init( memset(data, 0, sizeof(*data)); - diff_context_init( - &data->ctxt, NULL, repo, opts, file_cb, hunk_cb, data_cb, payload); + if (diff_context_init( + &data->ctxt, NULL, repo, opts, + file_cb, hunk_cb, data_cb, payload) < 0) + return -1; diff_patch_init(&data->ctxt, &data->patch); @@ -1374,6 +1396,9 @@ int git_diff_blobs( new_blob ? git_object_owner((const git_object *)new_blob) : old_blob ? git_object_owner((const git_object *)old_blob) : NULL; + if (!repo) /* Hmm, given two NULL blobs, silently do no callbacks? */ + return 0; + if ((error = diff_single_init( &d, repo, options, file_cb, hunk_cb, data_cb, payload)) < 0) return error; @@ -1405,6 +1430,9 @@ int git_diff_blob_to_buffer( git_repository *repo = old_blob ? git_object_owner((const git_object *)old_blob) : NULL; + if (!repo && !buf) /* Hmm, given NULLs, silently do no callbacks? */ + return 0; + if ((error = diff_single_init( &d, repo, options, file_cb, hunk_cb, data_cb, payload)) < 0) return error; @@ -1453,11 +1481,19 @@ int git_diff_get_patch( if (patch_ptr) *patch_ptr = NULL; + if (delta_ptr) + *delta_ptr = NULL; + + if (diff_required(diff, "git_diff_get_patch") < 0) + return -1; + + if (diff_context_init( + &ctxt, diff, NULL, NULL, + NULL, diff_patch_hunk_cb, diff_patch_line_cb, NULL) < 0) + return -1; delta = git_vector_get(&diff->deltas, idx); if (!delta) { - if (delta_ptr) - *delta_ptr = NULL; giterr_set(GITERR_INVALID, "Index out of range for delta in diff"); return GIT_ENOTFOUND; } @@ -1470,10 +1506,6 @@ int git_diff_get_patch( (diff->opts.flags & GIT_DIFF_SKIP_BINARY_CHECK) != 0)) return 0; - diff_context_init( - &ctxt, diff, diff->repo, &diff->opts, - NULL, diff_patch_hunk_cb, diff_patch_line_cb, NULL); - if (git_diff_delta__should_skip(ctxt.opts, delta)) return 0; diff --git a/src/diff_tform.c b/src/diff_tform.c index e051732c5..48332d3e5 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -7,6 +7,7 @@ #include "common.h" #include "diff.h" #include "git2/config.h" +#include "git2/blob.h" #include "hashsig.h" static git_diff_delta *diff_delta__dup( @@ -362,24 +363,86 @@ on_error: return -1; } -typedef struct { - /* array of delta index * 2 + (old_file/new_file) -> file hashes */ - git_hashsig *sigs; -} diff_similarity_cache; - -static unsigned int calc_similarity( - void *ref, git_diff_file *old_file, git_diff_file *new_file) +GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx) { - diff_similarity_cache *cache = ref; + git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2); + return (idx & 1) ? &delta->new_file : &delta->old_file; +} - GIT_UNUSED(cache); +static int similarity_calc( + git_diff_list *diff, + git_diff_find_options *opts, + size_t file_idx, + void **cache) +{ + int error = 0; + git_diff_file *file = similarity_get_file(diff, file_idx); + git_iterator_type_t src = (file_idx & 1) ? diff->old_src : diff->new_src; - if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0) + if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */ + git_buf path = GIT_BUF_INIT; + + /* TODO: apply wd-to-odb filters to file data if necessary */ + + if (!(error = git_buf_joinpath( + &path, git_repository_workdir(diff->repo), file->path))) + error = opts->metric->file_signature( + &cache[file_idx], file, path.ptr, opts->metric->payload); + + git_buf_free(&path); + } else { /* compute hashsig from blob buffer */ + git_blob *blob = NULL; + + /* TODO: add max size threshold a la diff? */ + + if ((error = git_blob_lookup(&blob, diff->repo, &file->oid)) < 0) + return error; + + error = opts->metric->buffer_signature( + &cache[file_idx], file, git_blob_rawcontent(blob), + git_blob_rawsize(blob), opts->metric->payload); + + git_blob_free(blob); + } + + return error; +} + +static int similarity_measure( + git_diff_list *diff, + git_diff_find_options *opts, + void **cache, + size_t a_idx, + size_t b_idx) +{ + int score = 0; + git_diff_file *a_file = similarity_get_file(diff, a_idx); + git_diff_file *b_file = similarity_get_file(diff, b_idx); + + if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode)) + return 0; + + if (git_oid_cmp(&a_file->oid, &b_file->oid) == 0) return 100; - /* TODO: insert actual similarity algo here */ + /* update signature cache if needed */ + if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0) + return -1; + if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0) + return -1; - return 0; + /* compare signatures */ + if (opts->metric->similarity( + &score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0) + return -1; + + /* clip score */ + if (score < 0) + score = 0; + else if (score > 100) + score = 100; + + return score; } #define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) @@ -388,67 +451,85 @@ int git_diff_find_similar( git_diff_list *diff, git_diff_find_options *given_opts) { - unsigned int i, j, similarity; + size_t i, j, cache_size, *matches; + int error = 0, similarity; git_diff_delta *from, *to; git_diff_find_options opts; - unsigned int tried_targets, num_changes = 0; - git_vector matches = GIT_VECTOR_INIT; + size_t tried_targets, num_rewrites = 0; + void **cache; - if (normalize_find_opts(diff, &opts, given_opts) < 0) - return -1; + if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) + return error; - /* first do splits if requested */ + /* TODO: maybe abort if deltas.length > target_limit ??? */ + + cache_size = diff->deltas.length * 2; /* must store b/c length may change */ + cache = git__calloc(cache_size, sizeof(void *)); + GITERR_CHECK_ALLOC(cache); + + matches = git__calloc(diff->deltas.length, sizeof(size_t)); + GITERR_CHECK_ALLOC(matches); + + /* first break MODIFIED records that are too different (if requested) */ if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) { git_vector_foreach(&diff->deltas, i, from) { if (from->status != GIT_DELTA_MODIFIED) continue; - /* Right now, this doesn't work right because the similarity - * algorithm isn't actually implemented... - */ - similarity = 100; - /* calc_similarity(NULL, &from->old_file, from->new_file); */ + similarity = similarity_measure( + diff, &opts, cache, 2 * i, 2 * i + 1); - if (similarity < opts.break_rewrite_threshold) { + if (similarity < 0) { + error = similarity; + goto cleanup; + } + + if ((unsigned int)similarity < opts.break_rewrite_threshold) { from->flags |= GIT_DIFF_FLAG__TO_SPLIT; - num_changes++; + num_rewrites++; } } - - /* apply splits as needed */ - if (num_changes > 0 && - apply_splits_and_deletes( - diff, diff->deltas.length + num_changes) < 0) - return -1; } /* next find the most similar delta for each rename / copy candidate */ - if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0) - return -1; - git_vector_foreach(&diff->deltas, i, from) { tried_targets = 0; + /* skip things that aren't blobs */ + if (GIT_MODE_TYPE(from->old_file.mode) != + GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) + continue; + git_vector_foreach(&diff->deltas, j, to) { if (i == j) continue; + /* skip things that aren't blobs */ + if (GIT_MODE_TYPE(to->new_file.mode) != + GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) + continue; + switch (to->status) { case GIT_DELTA_ADDED: case GIT_DELTA_UNTRACKED: case GIT_DELTA_RENAMED: case GIT_DELTA_COPIED: break; + case GIT_DELTA_MODIFIED: + if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) + continue; + break; default: /* only the above status values should be checked */ continue; } /* skip all but DELETED files unless copy detection is on */ - if (from->status != GIT_DELTA_DELETED && - !FLAG_SET(opts, GIT_DIFF_FIND_COPIES)) + if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES) && + from->status != GIT_DELTA_DELETED && + (from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) continue; /* don't check UNMODIFIED files as source unless given option */ @@ -463,34 +544,44 @@ int git_diff_find_similar( /* calculate similarity and see if this pair beats the * similarity score of the current best pair. */ - similarity = calc_similarity(NULL, &from->old_file, &to->new_file); + similarity = similarity_measure( + diff, &opts, cache, 2 * i, 2 * j + 1); - if (to->similarity < similarity) { - to->similarity = similarity; - if (git_vector_set(NULL, &matches, j, from) < 0) - return -1; + if (similarity < 0) { + error = similarity; + goto cleanup; + } + + if (to->similarity < (unsigned int)similarity) { + to->similarity = (unsigned int)similarity; + matches[j] = i + 1; } } } /* next rewrite the diffs with renames / copies */ - num_changes = 0; + num_rewrites = 0; git_vector_foreach(&diff->deltas, j, to) { - from = GIT_VECTOR_GET(&matches, j); - if (!from) { + if (!matches[j]) { assert(to->similarity == 0); continue; } - /* three possible outcomes here: + i = matches[j] - 1; + from = GIT_VECTOR_GET(&diff->deltas, i); + assert(from); + + /* four possible outcomes here: * 1. old DELETED and if over rename threshold, * new becomes RENAMED and old goes away - * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and + * 2. old SPLIT and if over rename threshold, + * new becomes RENAMED and old becomes ADDED (clear SPLIT) + * 3. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and * old is more similar to new than it is to itself, in which * case, new becomes RENAMED and old becomed ADDED - * 3. otherwise if over copy threshold, new becomes COPIED + * 4. otherwise if over copy threshold, new becomes COPIED */ if (from->status == GIT_DELTA_DELETED) { @@ -503,7 +594,26 @@ int git_diff_find_similar( memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); from->flags |= GIT_DIFF_FLAG__TO_DELETE; - num_changes++; + num_rewrites++; + + continue; + } + + if (from->status == GIT_DELTA_MODIFIED && + (from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) + { + if (to->similarity < opts.rename_threshold) { + to->similarity = 0; + continue; + } + + to->status = GIT_DELTA_RENAMED; + memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + + from->status = GIT_DELTA_ADDED; + from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; + memset(&from->old_file, 0, sizeof(from->old_file)); + num_rewrites--; continue; } @@ -512,10 +622,15 @@ int git_diff_find_similar( FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && to->similarity > opts.rename_threshold) { - similarity = 100; - /* calc_similarity(NULL, &from->old_file, from->new_file); */ + similarity = similarity_measure( + diff, &opts, cache, 2 * i, 2 * i + 1); - if (similarity < opts.rename_from_rewrite_threshold) { + if (similarity < 0) { + error = similarity; + goto cleanup; + } + + if ((unsigned int)similarity < opts.rename_from_rewrite_threshold) { to->status = GIT_DELTA_RENAMED; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); @@ -538,17 +653,23 @@ int git_diff_find_similar( memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); } - git_vector_free(&matches); + if (num_rewrites > 0) { + assert(num_rewrites < diff->deltas.length); - if (num_changes > 0) { - assert(num_changes < diff->deltas.length); - - if (apply_splits_and_deletes( - diff, diff->deltas.length - num_changes) < 0) - return -1; + error = apply_splits_and_deletes( + diff, diff->deltas.length - num_rewrites); } - return 0; +cleanup: + git__free(matches); + + for (i = 0; i < cache_size; ++i) { + if (cache[i] != NULL) + opts.metric->free_signature(cache[i], opts.metric->payload); + } + git__free(cache); + + return error; } #undef FLAG_SET diff --git a/tests-clar/diff/blob.c b/tests-clar/diff/blob.c index a6950b871..2ac8dbc51 100644 --- a/tests-clar/diff/blob.c +++ b/tests-clar/diff/blob.c @@ -196,7 +196,7 @@ void test_diff_blob__can_compare_identical_blobs(void) NULL, NULL, &opts, diff_file_cb, diff_hunk_cb, diff_line_cb, &expected)); cl_assert_equal_i(0, expected.files_binary); - assert_identical_blobs_comparison(&expected); + cl_assert_equal_i(0, expected.files); /* NULLs mean no callbacks, period */ memset(&expected, 0, sizeof(expected)); cl_git_pass(git_diff_blobs( @@ -399,7 +399,7 @@ void test_diff_blob__can_compare_blob_to_buffer(void) assert_identical_blobs_comparison(&expected); - /* diff from NULL blob to content of b */ + /* diff from NULL blob to content of a */ memset(&expected, 0, sizeof(expected)); cl_git_pass(git_diff_blob_to_buffer( NULL, a_content, strlen(a_content),