mirror of
https://git.proxmox.com/git/libgit2
synced 2025-05-18 23:10:39 +00:00

When diffs are generated, the value for the 'nfiles' field of 'git_diff_delta' will be consistent with the value in the 'status' field. Merging diffs can modify the 'status' field of some deltas and the 'nfiles' field needs to be updated accordingly.
1130 lines
30 KiB
C
1130 lines
30 KiB
C
/*
|
|
* Copyright (C) the libgit2 contributors. All rights reserved.
|
|
*
|
|
* This file is part of libgit2, distributed under the GNU GPL v2 with
|
|
* a Linking Exception. For full terms see the included COPYING file.
|
|
*/
|
|
#include "common.h"
|
|
|
|
#include "git2/config.h"
|
|
#include "git2/blob.h"
|
|
#include "git2/sys/hashsig.h"
|
|
|
|
#include "diff.h"
|
|
#include "path.h"
|
|
#include "fileops.h"
|
|
#include "config.h"
|
|
|
|
/*
 * Make a heap-allocated copy of `d` whose path strings are duplicated
 * into `pool`.
 *
 * Internal-only flag bits are cleared on the copy.  When the source's
 * new_file.path is NULL, or is the very same pointer as its old_file.path,
 * the copy's new_file.path aliases the copy's old_file.path instead of
 * getting its own string.
 *
 * Returns the new delta, or NULL if any allocation fails.  The delta
 * struct itself is released with git__free(); the path strings come from
 * git_pool_strdup() and are not freed individually here.
 */
static git_diff_delta *diff_delta__dup(
	const git_diff_delta *d, git_pool *pool)
{
	git_diff_delta *copy = git__malloc(sizeof(*copy));

	if (copy == NULL)
		return NULL;

	memcpy(copy, d, sizeof(*copy));
	GIT_DIFF_FLAG__CLEAR_INTERNAL(copy->flags);

	if (d->old_file.path != NULL) {
		copy->old_file.path = git_pool_strdup(pool, d->old_file.path);

		if (copy->old_file.path == NULL) {
			git__free(copy);
			return NULL;
		}
	}

	/* a missing or shared new path simply aliases the old path */
	if (d->new_file.path == NULL || d->new_file.path == d->old_file.path) {
		copy->new_file.path = copy->old_file.path;
		return copy;
	}

	copy->new_file.path = git_pool_strdup(pool, d->new_file.path);

	if (copy->new_file.path == NULL) {
		git__free(copy);
		return NULL;
	}

	return copy;
}
|
|
|
|
static git_diff_delta *diff_delta__merge_like_cgit(
|
|
const git_diff_delta *a,
|
|
const git_diff_delta *b,
|
|
git_pool *pool)
|
|
{
|
|
git_diff_delta *dup;
|
|
|
|
/* Emulate C git for merging two diffs (a la 'git diff <sha>').
|
|
*
|
|
* When C git does a diff between the work dir and a tree, it actually
|
|
* diffs with the index but uses the workdir contents. This emulates
|
|
* those choices so we can emulate the type of diff.
|
|
*
|
|
* We have three file descriptions here, let's call them:
|
|
* f1 = a->old_file
|
|
* f2 = a->new_file AND b->old_file
|
|
* f3 = b->new_file
|
|
*/
|
|
|
|
/* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
|
|
if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
|
|
return diff_delta__dup(a, pool);
|
|
|
|
/* otherwise, base this diff on the 'b' diff */
|
|
if ((dup = diff_delta__dup(b, pool)) == NULL)
|
|
return NULL;
|
|
|
|
/* If 'a' status is uninteresting, then we're done */
|
|
if (a->status == GIT_DELTA_UNMODIFIED)
|
|
return dup;
|
|
|
|
assert(a->status != GIT_DELTA_UNMODIFIED);
|
|
assert(b->status != GIT_DELTA_UNMODIFIED);
|
|
|
|
/* A cgit exception is that the diff of a file that is only in the
|
|
* index (i.e. not in HEAD nor workdir) is given as empty.
|
|
*/
|
|
if (dup->status == GIT_DELTA_DELETED) {
|
|
if (a->status == GIT_DELTA_ADDED) {
|
|
dup->status = GIT_DELTA_UNMODIFIED;
|
|
dup->nfiles = 2;
|
|
}
|
|
/* else don't overwrite DELETE status */
|
|
} else {
|
|
dup->status = a->status;
|
|
dup->nfiles = a->nfiles;
|
|
}
|
|
|
|
git_oid_cpy(&dup->old_file.id, &a->old_file.id);
|
|
dup->old_file.mode = a->old_file.mode;
|
|
dup->old_file.size = a->old_file.size;
|
|
dup->old_file.flags = a->old_file.flags;
|
|
|
|
return dup;
|
|
}
|
|
|
|
static git_diff_delta *diff_delta__merge_like_cgit_reversed(
|
|
const git_diff_delta *a,
|
|
const git_diff_delta *b,
|
|
git_pool *pool)
|
|
{
|
|
git_diff_delta *dup;
|
|
|
|
/* reversed version of above logic */
|
|
|
|
if (a->status == GIT_DELTA_UNMODIFIED)
|
|
return diff_delta__dup(b, pool);
|
|
|
|
if ((dup = diff_delta__dup(a, pool)) == NULL)
|
|
return NULL;
|
|
|
|
if (b->status == GIT_DELTA_UNMODIFIED || b->status == GIT_DELTA_UNTRACKED || b->status == GIT_DELTA_UNREADABLE)
|
|
return dup;
|
|
|
|
if (dup->status == GIT_DELTA_DELETED) {
|
|
if (b->status == GIT_DELTA_ADDED) {
|
|
dup->status = GIT_DELTA_UNMODIFIED;
|
|
dup->nfiles = 2;
|
|
}
|
|
} else {
|
|
dup->status = b->status;
|
|
dup->nfiles = b->nfiles;
|
|
}
|
|
|
|
git_oid_cpy(&dup->old_file.id, &b->old_file.id);
|
|
dup->old_file.mode = b->old_file.mode;
|
|
dup->old_file.size = b->old_file.size;
|
|
dup->old_file.flags = b->old_file.flags;
|
|
|
|
return dup;
|
|
}
|
|
|
|
/*
 * Merge the deltas of `from` into `onto`.
 *
 * Both delta lists are walked in parallel, compared by old_file.path
 * (case-insensitively when GIT_DIFF_IGNORE_CASE is set).  Entries present
 * on only one side are copied; entries present on both sides are combined
 * with the cgit-style merge helpers.  The result is built in a fresh
 * vector and pool, which are swapped into `onto` only if everything
 * succeeded, so `onto` is left unchanged on failure.
 *
 * Both diffs must agree on GIT_DIFF_IGNORE_CASE and GIT_DIFF_REVERSE.
 *
 * Returns 0 on success (including when `from` has no deltas), or a
 * negative value on error.
 */
int git_diff_merge(git_diff *onto, const git_diff *from)
{
	int error = 0;
	git_pool onto_pool;
	git_vector onto_new;
	git_diff_delta *delta;
	bool ignore_case, reversed;
	unsigned int i, j;

	assert(onto && from);

	if (!from->deltas.length)
		return 0;

	ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
	reversed = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);

	if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
		reversed != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
		giterr_set(GITERR_INVALID,
			"Attempt to merge diffs created with conflicting options");
		return -1;
	}

	if (git_vector_init(
			&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0)
		return -1;

	if (git_pool_init(&onto_pool, 1, 0) < 0) {
		/* don't leak the vector that was just set up */
		git_vector_free(&onto_new);
		return -1;
	}

	for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
		/* GIT_VECTOR_GET yields NULL once an index runs off its list */
		git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
		const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
		int cmp = !f ? -1 : !o ? 1 :
			STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);

		if (cmp < 0) {
			delta = diff_delta__dup(o, &onto_pool);
			i++;
		} else if (cmp > 0) {
			delta = diff_delta__dup(f, &onto_pool);
			j++;
		} else {
			delta = reversed ?
				diff_delta__merge_like_cgit_reversed(o, f, &onto_pool) :
				diff_delta__merge_like_cgit(o, f, &onto_pool);
			i++;
			j++;
		}

		/* an allocation failure must be caught before the delta is
		 * handed to anything that inspects it
		 */
		if (!delta) {
			error = -1;
			break;
		}

		/* the ignore rules for the target may not match the source
		 * or the result of a merged delta could be skippable...
		 */
		if (git_diff_delta__should_skip(&onto->opts, delta)) {
			git__free(delta);
			continue;
		}

		if ((error = git_vector_insert(&onto_new, delta)) < 0) {
			/* not owned by the vector, so it has to be released here */
			git__free(delta);
			break;
		}
	}

	if (!error) {
		git_vector_swap(&onto->deltas, &onto_new);
		git_pool_swap(&onto->pool, &onto_pool);

		if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
			onto->old_src = from->old_src;
		else
			onto->new_src = from->new_src;

		/* prefix strings also come from old pool, so recreate those.*/
		onto->opts.old_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
		onto->opts.new_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
	}

	/* after a successful swap these hold the OLD deltas and pool;
	 * on failure they hold the partially built new ones
	 */
	git_vector_free_deep(&onto_new);
	git_pool_clear(&onto_pool);

	return error;
}
|
|
|
|
int git_diff_find_similar__hashsig_for_file(
|
|
void **out, const git_diff_file *f, const char *path, void *p)
|
|
{
|
|
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
|
|
|
|
GIT_UNUSED(f);
|
|
return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
|
|
}
|
|
|
|
int git_diff_find_similar__hashsig_for_buf(
|
|
void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
|
|
{
|
|
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
|
|
|
|
GIT_UNUSED(f);
|
|
return git_hashsig_create((git_hashsig **)out, buf, len, opt);
|
|
}
|
|
|
|
void git_diff_find_similar__hashsig_free(void *sig, void *payload)
|
|
{
|
|
GIT_UNUSED(payload);
|
|
git_hashsig_free(sig);
|
|
}
|
|
|
|
/*
 * Default `similarity` callback of the similarity metric: compare two
 * hash signatures.
 *
 * On success stores the (non-negative) result of git_hashsig_compare()
 * in `*score` and returns 0.  If the comparison reports a negative value
 * that value is returned as the error and `*score` is left untouched.
 * `payload` is unused.
 */
int git_diff_find_similar__calc_similarity(
	int *score, void *siga, void *sigb, void *payload)
{
	int compared;

	GIT_UNUSED(payload);

	compared = git_hashsig_compare(siga, sigb);

	if (compared >= 0) {
		*score = compared;
		return 0;
	}

	return compared;
}
|
|
|
|
/* Fallback used by normalize_find_opts() for the rename, rename-from-rewrite
 * and copy thresholds when the caller's value is 0 or greater than 100.
 */
#define DEFAULT_THRESHOLD 50
|
|
/* Fallback used by normalize_find_opts() for break_rewrite_threshold when
 * the caller's value is 0 or greater than 100.
 */
#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
|
|
/* Fallback for rename_limit when neither the caller nor the
 * "diff.renamelimit" config entry supplies a usable value.
 */
#define DEFAULT_RENAME_LIMIT 200
|
|
|
|
static int normalize_find_opts(
|
|
git_diff *diff,
|
|
git_diff_find_options *opts,
|
|
const git_diff_find_options *given)
|
|
{
|
|
git_config *cfg = NULL;
|
|
git_hashsig_option_t hashsig_opts;
|
|
|
|
GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
|
|
|
|
if (diff->repo != NULL &&
|
|
git_repository_config__weakptr(&cfg, diff->repo) < 0)
|
|
return -1;
|
|
|
|
if (given)
|
|
memcpy(opts, given, sizeof(*opts));
|
|
|
|
if (!given ||
|
|
(given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
|
|
{
|
|
char *rule =
|
|
git_config__get_string_force(cfg, "diff.renames", "true");
|
|
int boolval;
|
|
|
|
if (!git__parse_bool(&boolval, rule) && !boolval)
|
|
/* don't set FIND_RENAMES if bool value is false */;
|
|
else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
|
|
opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
|
|
else
|
|
opts->flags |= GIT_DIFF_FIND_RENAMES;
|
|
|
|
git__free(rule);
|
|
}
|
|
|
|
/* some flags imply others */
|
|
|
|
if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
|
|
/* if we are only looking for exact matches, then don't turn
|
|
* MODIFIED items into ADD/DELETE pairs because it's too picky
|
|
*/
|
|
opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
|
|
|
|
/* similarly, don't look for self-rewrites to split */
|
|
opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
|
|
}
|
|
|
|
if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
|
|
opts->flags |= GIT_DIFF_FIND_RENAMES;
|
|
|
|
if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
|
|
opts->flags |= GIT_DIFF_FIND_COPIES;
|
|
|
|
if (opts->flags & GIT_DIFF_BREAK_REWRITES)
|
|
opts->flags |= GIT_DIFF_FIND_REWRITES;
|
|
|
|
#define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
|
|
|
|
if (USE_DEFAULT(opts->rename_threshold))
|
|
opts->rename_threshold = DEFAULT_THRESHOLD;
|
|
|
|
if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
|
|
opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
|
|
|
|
if (USE_DEFAULT(opts->copy_threshold))
|
|
opts->copy_threshold = DEFAULT_THRESHOLD;
|
|
|
|
if (USE_DEFAULT(opts->break_rewrite_threshold))
|
|
opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
|
|
|
|
#undef USE_DEFAULT
|
|
|
|
if (!opts->rename_limit) {
|
|
opts->rename_limit = git_config__get_int_force(
|
|
cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
|
|
|
|
if (opts->rename_limit <= 0)
|
|
opts->rename_limit = DEFAULT_RENAME_LIMIT;
|
|
}
|
|
|
|
/* assign the internal metric with whitespace flag as payload */
|
|
if (!opts->metric) {
|
|
opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
|
|
GITERR_CHECK_ALLOC(opts->metric);
|
|
|
|
opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
|
|
opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
|
|
opts->metric->free_signature = git_diff_find_similar__hashsig_free;
|
|
opts->metric->similarity = git_diff_find_similar__calc_similarity;
|
|
|
|
if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
|
|
hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
|
|
else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
|
|
hashsig_opts = GIT_HASHSIG_NORMAL;
|
|
else
|
|
hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
|
|
hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
|
|
opts->metric->payload = (void *)hashsig_opts;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int insert_delete_side_of_split(
|
|
git_diff *diff, git_vector *onto, const git_diff_delta *delta)
|
|
{
|
|
/* make new record for DELETED side of split */
|
|
git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool);
|
|
GITERR_CHECK_ALLOC(deleted);
|
|
|
|
deleted->status = GIT_DELTA_DELETED;
|
|
deleted->nfiles = 1;
|
|
memset(&deleted->new_file, 0, sizeof(deleted->new_file));
|
|
deleted->new_file.path = deleted->old_file.path;
|
|
deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
|
|
|
|
return git_vector_insert(onto, deleted);
|
|
}
|
|
|
|
static int apply_splits_and_deletes(
|
|
git_diff *diff, size_t expected_size, bool actually_split)
|
|
{
|
|
git_vector onto = GIT_VECTOR_INIT;
|
|
size_t i;
|
|
git_diff_delta *delta;
|
|
|
|
if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
|
|
return -1;
|
|
|
|
/* build new delta list without TO_DELETE and splitting TO_SPLIT */
|
|
git_vector_foreach(&diff->deltas, i, delta) {
|
|
if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
|
|
continue;
|
|
|
|
if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
|
|
delta->similarity = 0;
|
|
|
|
if (insert_delete_side_of_split(diff, &onto, delta) < 0)
|
|
goto on_error;
|
|
|
|
if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
|
|
delta->status = GIT_DELTA_UNTRACKED;
|
|
else
|
|
delta->status = GIT_DELTA_ADDED;
|
|
delta->nfiles = 1;
|
|
memset(&delta->old_file, 0, sizeof(delta->old_file));
|
|
delta->old_file.path = delta->new_file.path;
|
|
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
|
|
}
|
|
|
|
/* clean up delta before inserting into new list */
|
|
GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
|
|
|
|
if (delta->status != GIT_DELTA_COPIED &&
|
|
delta->status != GIT_DELTA_RENAMED &&
|
|
(delta->status != GIT_DELTA_MODIFIED || actually_split))
|
|
delta->similarity = 0;
|
|
|
|
/* insert into new list */
|
|
if (git_vector_insert(&onto, delta) < 0)
|
|
goto on_error;
|
|
}
|
|
|
|
/* cannot return an error past this point */
|
|
|
|
/* free deltas from old list that didn't make it to the new one */
|
|
git_vector_foreach(&diff->deltas, i, delta) {
|
|
if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
|
|
git__free(delta);
|
|
}
|
|
|
|
/* swap new delta list into place */
|
|
git_vector_swap(&diff->deltas, &onto);
|
|
git_vector_free(&onto);
|
|
git_vector_sort(&diff->deltas);
|
|
|
|
return 0;
|
|
|
|
on_error:
|
|
git_vector_free_deep(&onto);
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
 * Map a "file index" to a file record: index 2*d is the old file of
 * delta d, index 2*d + 1 is its new file.
 */
GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
{
	size_t delta_idx = idx / 2;
	git_diff_delta *delta = git_vector_get(&diff->deltas, delta_idx);

	if ((idx & 1) != 0)
		return &delta->new_file;

	return &delta->old_file;
}
|
|
|
|
typedef struct {
|
|
size_t idx;
|
|
git_iterator_type_t src;
|
|
git_repository *repo;
|
|
git_diff_file *file;
|
|
git_buf data;
|
|
git_odb_object *odb_obj;
|
|
git_blob *blob;
|
|
} similarity_info;
|
|
|
|
static int similarity_init(
|
|
similarity_info *info, git_diff *diff, size_t file_idx)
|
|
{
|
|
info->idx = file_idx;
|
|
info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
|
|
info->repo = diff->repo;
|
|
info->file = similarity_get_file(diff, file_idx);
|
|
info->odb_obj = NULL;
|
|
info->blob = NULL;
|
|
git_buf_init(&info->data, 0);
|
|
|
|
if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR)
|
|
return 0;
|
|
|
|
return git_diff_file__resolve_zero_size(
|
|
info->file, &info->odb_obj, info->repo);
|
|
}
|
|
|
|
static int similarity_sig(
|
|
similarity_info *info,
|
|
const git_diff_find_options *opts,
|
|
void **cache)
|
|
{
|
|
int error = 0;
|
|
git_diff_file *file = info->file;
|
|
|
|
if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
|
|
if ((error = git_buf_joinpath(
|
|
&info->data, git_repository_workdir(info->repo), file->path)) < 0)
|
|
return error;
|
|
|
|
/* if path is not a regular file, just skip this item */
|
|
if (!git_path_isfile(info->data.ptr))
|
|
return 0;
|
|
|
|
/* TODO: apply wd-to-odb filters to file data if necessary */
|
|
|
|
error = opts->metric->file_signature(
|
|
&cache[info->idx], info->file,
|
|
info->data.ptr, opts->metric->payload);
|
|
} else {
|
|
/* if we didn't initially know the size, we might have an odb_obj
|
|
* around from earlier, so convert that, otherwise load the blob now
|
|
*/
|
|
if (info->odb_obj != NULL)
|
|
error = git_object__from_odb_object(
|
|
(git_object **)&info->blob, info->repo,
|
|
info->odb_obj, GIT_OBJ_BLOB);
|
|
else
|
|
error = git_blob_lookup(&info->blob, info->repo, &file->id);
|
|
|
|
if (error < 0) {
|
|
/* if lookup fails, just skip this item in similarity calc */
|
|
giterr_clear();
|
|
} else {
|
|
size_t sz;
|
|
|
|
/* index size may not be actual blob size if filtered */
|
|
if (file->size != git_blob_rawsize(info->blob))
|
|
file->size = git_blob_rawsize(info->blob);
|
|
|
|
sz = (size_t)(git__is_sizet(file->size) ? file->size : -1);
|
|
|
|
error = opts->metric->buffer_signature(
|
|
&cache[info->idx], info->file,
|
|
git_blob_rawcontent(info->blob), sz, opts->metric->payload);
|
|
}
|
|
}
|
|
|
|
return error;
|
|
}
|
|
|
|
/*
 * Release whatever similarity_init() / similarity_sig() attached to
 * `info`: the odb object if there is one, then either the blob or, when
 * no blob was loaded, the path buffer.
 */
static void similarity_unload(similarity_info *info)
{
	if (info->odb_obj != NULL)
		git_odb_object_free(info->odb_obj);

	if (info->blob == NULL) {
		git_buf_free(&info->data);
		return;
	}

	git_blob_free(info->blob);
}
|
|
|
|
/* True when any bit of `flag_name` is set in (opts)->flags.  Both macro
 * arguments are parenthesized so that compound expressions such as
 * `A | B` are tested as a whole.
 */
#define FLAG_SET(opts,flag_name) (((opts)->flags & (flag_name)) != 0)
|
|
|
|
/* - score < 0 means files cannot be compared
|
|
* - score >= 100 means files are exact match
|
|
* - score == 0 means files are completely different
|
|
*/
|
|
static int similarity_measure(
|
|
int *score,
|
|
git_diff *diff,
|
|
const git_diff_find_options *opts,
|
|
void **cache,
|
|
size_t a_idx,
|
|
size_t b_idx)
|
|
{
|
|
git_diff_file *a_file = similarity_get_file(diff, a_idx);
|
|
git_diff_file *b_file = similarity_get_file(diff, b_idx);
|
|
bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
|
|
int error = 0;
|
|
similarity_info a_info, b_info;
|
|
|
|
*score = -1;
|
|
|
|
/* don't try to compare files of different types */
|
|
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
|
|
return 0;
|
|
|
|
/* if exact match is requested, force calculation of missing OIDs now */
|
|
if (exact_match) {
|
|
if (git_oid_iszero(&a_file->id) &&
|
|
diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
|
|
!git_diff__oid_for_file(&a_file->id,
|
|
diff, a_file->path, a_file->mode, a_file->size))
|
|
a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
|
|
|
|
if (git_oid_iszero(&b_file->id) &&
|
|
diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
|
|
!git_diff__oid_for_file(&b_file->id,
|
|
diff, b_file->path, b_file->mode, b_file->size))
|
|
b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
|
|
}
|
|
|
|
/* check OID match as a quick test */
|
|
if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
|
|
*score = 100;
|
|
return 0;
|
|
}
|
|
|
|
/* don't calculate signatures if we are doing exact match */
|
|
if (exact_match) {
|
|
*score = 0;
|
|
return 0;
|
|
}
|
|
|
|
memset(&a_info, 0, sizeof(a_info));
|
|
memset(&b_info, 0, sizeof(b_info));
|
|
|
|
/* set up similarity data (will try to update missing file sizes) */
|
|
if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
|
|
return error;
|
|
if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
|
|
goto cleanup;
|
|
|
|
/* check if file sizes are nowhere near each other */
|
|
if (a_file->size > 127 &&
|
|
b_file->size > 127 &&
|
|
(a_file->size > (b_file->size << 3) ||
|
|
b_file->size > (a_file->size << 3)))
|
|
goto cleanup;
|
|
|
|
/* update signature cache if needed */
|
|
if (!cache[a_idx]) {
|
|
if ((error = similarity_sig(&a_info, opts, cache)) < 0)
|
|
goto cleanup;
|
|
}
|
|
if (!cache[b_idx]) {
|
|
if ((error = similarity_sig(&b_info, opts, cache)) < 0)
|
|
goto cleanup;
|
|
}
|
|
|
|
/* calculate similarity provided that the metric choose to process
|
|
* both the a and b files (some may not if file is too big, etc).
|
|
*/
|
|
if (cache[a_idx] && cache[b_idx])
|
|
error = opts->metric->similarity(
|
|
score, cache[a_idx], cache[b_idx], opts->metric->payload);
|
|
|
|
cleanup:
|
|
similarity_unload(&a_info);
|
|
similarity_unload(&b_info);
|
|
|
|
return error;
|
|
}
|
|
|
|
static int calc_self_similarity(
|
|
git_diff *diff,
|
|
const git_diff_find_options *opts,
|
|
size_t delta_idx,
|
|
void **cache)
|
|
{
|
|
int error, similarity = -1;
|
|
git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
|
|
|
|
if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
|
|
return 0;
|
|
|
|
error = similarity_measure(
|
|
&similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
|
|
if (error < 0)
|
|
return error;
|
|
|
|
if (similarity >= 0) {
|
|
delta->similarity = (uint16_t)similarity;
|
|
delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static bool is_rename_target(
|
|
git_diff *diff,
|
|
const git_diff_find_options *opts,
|
|
size_t delta_idx,
|
|
void **cache)
|
|
{
|
|
git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
|
|
|
|
/* skip things that aren't plain blobs */
|
|
if (!GIT_MODE_ISBLOB(delta->new_file.mode))
|
|
return false;
|
|
|
|
/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
|
|
* targets; maybe include UNTRACKED and IGNORED if requested.
|
|
*/
|
|
switch (delta->status) {
|
|
case GIT_DELTA_UNMODIFIED:
|
|
case GIT_DELTA_DELETED:
|
|
return false;
|
|
|
|
case GIT_DELTA_MODIFIED:
|
|
if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
|
|
!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
|
|
return false;
|
|
|
|
if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
|
|
return false;
|
|
|
|
if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
|
|
delta->similarity < opts->break_rewrite_threshold) {
|
|
delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
|
|
break;
|
|
}
|
|
if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
|
|
delta->similarity < opts->rename_from_rewrite_threshold)
|
|
break;
|
|
|
|
return false;
|
|
|
|
case GIT_DELTA_UNTRACKED:
|
|
if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
|
|
return false;
|
|
break;
|
|
|
|
case GIT_DELTA_IGNORED:
|
|
return false;
|
|
|
|
default: /* all other status values should be checked */
|
|
break;
|
|
}
|
|
|
|
delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
|
|
return true;
|
|
}
|
|
|
|
static bool is_rename_source(
|
|
git_diff *diff,
|
|
const git_diff_find_options *opts,
|
|
size_t delta_idx,
|
|
void **cache)
|
|
{
|
|
git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
|
|
|
|
/* skip things that aren't blobs */
|
|
if (!GIT_MODE_ISBLOB(delta->old_file.mode))
|
|
return false;
|
|
|
|
switch (delta->status) {
|
|
case GIT_DELTA_ADDED:
|
|
case GIT_DELTA_UNTRACKED:
|
|
case GIT_DELTA_UNREADABLE:
|
|
case GIT_DELTA_IGNORED:
|
|
return false;
|
|
|
|
case GIT_DELTA_DELETED:
|
|
case GIT_DELTA_TYPECHANGE:
|
|
break;
|
|
|
|
case GIT_DELTA_UNMODIFIED:
|
|
if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
|
|
return false;
|
|
if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
|
|
delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
|
|
break;
|
|
|
|
default: /* MODIFIED, RENAMED, COPIED */
|
|
/* if we're finding copies, this could be a source */
|
|
if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
|
|
break;
|
|
|
|
/* otherwise, this is only a source if we can split it */
|
|
if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
|
|
!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
|
|
return false;
|
|
|
|
if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
|
|
return false;
|
|
|
|
if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
|
|
delta->similarity < opts->break_rewrite_threshold) {
|
|
delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
|
|
break;
|
|
}
|
|
|
|
if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
|
|
delta->similarity < opts->rename_from_rewrite_threshold)
|
|
break;
|
|
|
|
return false;
|
|
}
|
|
|
|
delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
|
|
return true;
|
|
}
|
|
|
|
/* A delta counts as "split" when it is a TYPECHANGE or is flagged TO_SPLIT */
GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
{
	if (delta->status == GIT_DELTA_TYPECHANGE)
		return true;

	return (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0;
}
|
|
|
|
/* True for the statuses ADDED, UNTRACKED, UNREADABLE and IGNORED */
GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
{
	switch (delta->status) {
	case GIT_DELTA_ADDED:
	case GIT_DELTA_UNTRACKED:
	case GIT_DELTA_UNREADABLE:
	case GIT_DELTA_IGNORED:
		return true;
	default:
		return false;
	}
}
|
|
|
|
/*
 * Turn `to` into a RENAMED delta whose old side is taken from `from`,
 * recording `similarity`.  A rename always involves two files, and a
 * pending split of `to` is cancelled.
 */
GIT_INLINE(void) delta_make_rename(
	git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
{
	memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));

	to->status = GIT_DELTA_RENAMED;
	to->nfiles = 2;
	to->similarity = similarity;
	to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
}
|
|
|
|
/* One entry of a source<->target matching table used by rename detection */
typedef struct {
	size_t idx;           /* index of the matched delta on the other side */
	uint16_t similarity;  /* score of that match; 0 means "no match" */
} diff_find_match;
|
|
|
|
int git_diff_find_similar(
|
|
git_diff *diff,
|
|
const git_diff_find_options *given_opts)
|
|
{
|
|
size_t s, t;
|
|
int error = 0, result;
|
|
uint16_t similarity;
|
|
git_diff_delta *src, *tgt;
|
|
git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
|
|
size_t num_deltas, num_srcs = 0, num_tgts = 0;
|
|
size_t tried_srcs = 0, tried_tgts = 0;
|
|
size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
|
|
size_t sigcache_size;
|
|
void **sigcache = NULL; /* cache of similarity metric file signatures */
|
|
diff_find_match *tgt2src = NULL;
|
|
diff_find_match *src2tgt = NULL;
|
|
diff_find_match *tgt2src_copy = NULL;
|
|
diff_find_match *best_match;
|
|
git_diff_file swap;
|
|
|
|
if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
|
|
return error;
|
|
|
|
num_deltas = diff->deltas.length;
|
|
|
|
/* TODO: maybe abort if deltas.length > rename_limit ??? */
|
|
if (!git__is_uint32(num_deltas))
|
|
goto cleanup;
|
|
|
|
/* No flags set; nothing to do */
|
|
if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
|
|
goto cleanup;
|
|
|
|
GITERR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2);
|
|
sigcache = git__calloc(sigcache_size, sizeof(void *));
|
|
GITERR_CHECK_ALLOC(sigcache);
|
|
|
|
/* Label rename sources and targets
|
|
*
|
|
* This will also set self-similarity scores for MODIFIED files and
|
|
* mark them for splitting if break-rewrites is enabled
|
|
*/
|
|
git_vector_foreach(&diff->deltas, t, tgt) {
|
|
if (is_rename_source(diff, &opts, t, sigcache))
|
|
++num_srcs;
|
|
|
|
if (is_rename_target(diff, &opts, t, sigcache))
|
|
++num_tgts;
|
|
|
|
if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
|
|
num_rewrites++;
|
|
}
|
|
|
|
/* if there are no candidate srcs or tgts, we're done */
|
|
if (!num_srcs || !num_tgts)
|
|
goto cleanup;
|
|
|
|
src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
|
|
GITERR_CHECK_ALLOC(src2tgt);
|
|
tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
|
|
GITERR_CHECK_ALLOC(tgt2src);
|
|
|
|
if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
|
|
tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
|
|
GITERR_CHECK_ALLOC(tgt2src_copy);
|
|
}
|
|
|
|
/*
|
|
* Find best-fit matches for rename / copy candidates
|
|
*/
|
|
|
|
find_best_matches:
|
|
tried_tgts = num_bumped = 0;
|
|
|
|
git_vector_foreach(&diff->deltas, t, tgt) {
|
|
/* skip things that are not rename targets */
|
|
if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
|
|
continue;
|
|
|
|
tried_srcs = 0;
|
|
|
|
git_vector_foreach(&diff->deltas, s, src) {
|
|
/* skip things that are not rename sources */
|
|
if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
|
|
continue;
|
|
|
|
/* calculate similarity for this pair and find best match */
|
|
if (s == t)
|
|
result = -1; /* don't measure self-similarity here */
|
|
else if ((error = similarity_measure(
|
|
&result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
|
|
goto cleanup;
|
|
|
|
if (result < 0)
|
|
continue;
|
|
similarity = (uint16_t)result;
|
|
|
|
/* is this a better rename? */
|
|
if (tgt2src[t].similarity < similarity &&
|
|
src2tgt[s].similarity < similarity)
|
|
{
|
|
/* eject old mapping */
|
|
if (src2tgt[s].similarity > 0) {
|
|
tgt2src[src2tgt[s].idx].similarity = 0;
|
|
num_bumped++;
|
|
}
|
|
if (tgt2src[t].similarity > 0) {
|
|
src2tgt[tgt2src[t].idx].similarity = 0;
|
|
num_bumped++;
|
|
}
|
|
|
|
/* write new mapping */
|
|
tgt2src[t].idx = s;
|
|
tgt2src[t].similarity = similarity;
|
|
src2tgt[s].idx = t;
|
|
src2tgt[s].similarity = similarity;
|
|
}
|
|
|
|
/* keep best absolute match for copies */
|
|
if (tgt2src_copy != NULL &&
|
|
tgt2src_copy[t].similarity < similarity)
|
|
{
|
|
tgt2src_copy[t].idx = s;
|
|
tgt2src_copy[t].similarity = similarity;
|
|
}
|
|
|
|
if (++tried_srcs >= num_srcs)
|
|
break;
|
|
|
|
/* cap on maximum targets we'll examine (per "tgt" file) */
|
|
if (tried_srcs > opts.rename_limit)
|
|
break;
|
|
}
|
|
|
|
if (++tried_tgts >= num_tgts)
|
|
break;
|
|
}
|
|
|
|
if (num_bumped > 0) /* try again if we bumped some items */
|
|
goto find_best_matches;
|
|
|
|
/*
|
|
* Rewrite the diffs with renames / copies
|
|
*/
|
|
|
|
git_vector_foreach(&diff->deltas, t, tgt) {
|
|
/* skip things that are not rename targets */
|
|
if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
|
|
continue;
|
|
|
|
/* check if this delta was the target of a similarity */
|
|
if (tgt2src[t].similarity)
|
|
best_match = &tgt2src[t];
|
|
else if (tgt2src_copy && tgt2src_copy[t].similarity)
|
|
best_match = &tgt2src_copy[t];
|
|
else
|
|
continue;
|
|
|
|
s = best_match->idx;
|
|
src = GIT_VECTOR_GET(&diff->deltas, s);
|
|
|
|
/* possible scenarios:
|
|
* 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
|
|
* 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
|
|
* 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
|
|
* 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
|
|
* 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
|
|
*/
|
|
|
|
if (src->status == GIT_DELTA_DELETED) {
|
|
|
|
if (delta_is_new_only(tgt)) {
|
|
|
|
if (best_match->similarity < opts.rename_threshold)
|
|
continue;
|
|
|
|
delta_make_rename(tgt, src, best_match->similarity);
|
|
|
|
src->flags |= GIT_DIFF_FLAG__TO_DELETE;
|
|
num_rewrites++;
|
|
} else {
|
|
assert(delta_is_split(tgt));
|
|
|
|
if (best_match->similarity < opts.rename_from_rewrite_threshold)
|
|
continue;
|
|
|
|
memcpy(&swap, &tgt->old_file, sizeof(swap));
|
|
|
|
delta_make_rename(tgt, src, best_match->similarity);
|
|
num_rewrites--;
|
|
|
|
assert(src->status == GIT_DELTA_DELETED);
|
|
memcpy(&src->old_file, &swap, sizeof(src->old_file));
|
|
memset(&src->new_file, 0, sizeof(src->new_file));
|
|
src->new_file.path = src->old_file.path;
|
|
src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
|
|
|
|
num_updates++;
|
|
|
|
if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
|
|
/* what used to be at src t is now at src s */
|
|
tgt2src[src2tgt[t].idx].idx = s;
|
|
}
|
|
}
|
|
}
|
|
|
|
else if (delta_is_split(src)) {
|
|
|
|
if (delta_is_new_only(tgt)) {
|
|
|
|
if (best_match->similarity < opts.rename_threshold)
|
|
continue;
|
|
|
|
delta_make_rename(tgt, src, best_match->similarity);
|
|
|
|
src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
|
|
GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
|
|
src->nfiles = 1;
|
|
memset(&src->old_file, 0, sizeof(src->old_file));
|
|
src->old_file.path = src->new_file.path;
|
|
src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
|
|
|
|
src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
|
|
num_rewrites--;
|
|
|
|
num_updates++;
|
|
} else {
|
|
assert(delta_is_split(src));
|
|
|
|
if (best_match->similarity < opts.rename_from_rewrite_threshold)
|
|
continue;
|
|
|
|
memcpy(&swap, &tgt->old_file, sizeof(swap));
|
|
|
|
delta_make_rename(tgt, src, best_match->similarity);
|
|
num_rewrites--;
|
|
num_updates++;
|
|
|
|
memcpy(&src->old_file, &swap, sizeof(src->old_file));
|
|
|
|
/* if we've just swapped the new element into the correct
|
|
* place, clear the SPLIT flag
|
|
*/
|
|
if (tgt2src[s].idx == t &&
|
|
tgt2src[s].similarity >
|
|
opts.rename_from_rewrite_threshold) {
|
|
src->status = GIT_DELTA_RENAMED;
|
|
src->similarity = tgt2src[s].similarity;
|
|
tgt2src[s].similarity = 0;
|
|
src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
|
|
num_rewrites--;
|
|
}
|
|
/* otherwise, if we just overwrote a source, update mapping */
|
|
else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
|
|
/* what used to be at src t is now at src s */
|
|
tgt2src[src2tgt[t].idx].idx = s;
|
|
}
|
|
|
|
num_updates++;
|
|
}
|
|
}
|
|
|
|
else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
|
|
if (tgt2src_copy[t].similarity < opts.copy_threshold)
|
|
continue;
|
|
|
|
/* always use best possible source for copy */
|
|
best_match = &tgt2src_copy[t];
|
|
src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);
|
|
|
|
if (delta_is_split(tgt)) {
|
|
error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
|
|
if (error < 0)
|
|
goto cleanup;
|
|
num_rewrites--;
|
|
}
|
|
|
|
if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
|
|
continue;
|
|
|
|
tgt->status = GIT_DELTA_COPIED;
|
|
tgt->similarity = best_match->similarity;
|
|
tgt->nfiles = 2;
|
|
memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
|
|
tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
|
|
|
|
num_updates++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Actually split and delete entries as needed
|
|
*/
|
|
|
|
if (num_rewrites > 0 || num_updates > 0)
|
|
error = apply_splits_and_deletes(
|
|
diff, diff->deltas.length - num_rewrites,
|
|
FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
|
|
!FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
|
|
|
|
cleanup:
|
|
git__free(tgt2src);
|
|
git__free(src2tgt);
|
|
git__free(tgt2src_copy);
|
|
|
|
if (sigcache) {
|
|
for (t = 0; t < num_deltas * 2; ++t) {
|
|
if (sigcache[t] != NULL)
|
|
opts.metric->free_signature(sigcache[t], opts.metric->payload);
|
|
}
|
|
git__free(sigcache);
|
|
}
|
|
|
|
if (!given_opts || !given_opts->metric)
|
|
git__free(opts.metric);
|
|
|
|
return error;
|
|
}
|
|
|
|
/* FLAG_SET is only a local helper for the code above; don't let it leak */
#undef FLAG_SET
|