mirror of
https://git.proxmox.com/git/libgit2
synced 2025-08-07 20:59:36 +00:00
More git_diff_find_similar improvements
- Add new GIT_DIFF_FIND_EXACT_MATCH_ONLY flag to do similarity matching without using the similarity metric (i.e. only compare the SHA).
- Clean up the similarity measurement code to more rigorously distinguish between files that are not similar and files that are not comparable (previously, a 0 could mean either that the files could not be compared or that they were totally different).
- When splitting a MODIFIED file into a DELETE/ADD pair, actually make a DELETED/UNTRACKED pair if the right side of the diff is from the working directory. This prevents an odd mix of ADDED and UNTRACKED files on workdir diffs.
This commit is contained in:
parent
5c8f37a397
commit
9be5be47fb
@ -441,6 +441,8 @@ typedef enum {
|
|||||||
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
|
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
|
||||||
/** measure similarity including all data */
|
/** measure similarity including all data */
|
||||||
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
|
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
|
||||||
|
/** measure similarity only by comparing SHAs (fast and cheap) */
|
||||||
|
GIT_DIFF_FIND_EXACT_MATCH_ONLY = (1 << 14),
|
||||||
} git_diff_find_t;
|
} git_diff_find_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
169
src/diff_tform.c
169
src/diff_tform.c
@ -255,6 +255,16 @@ static int normalize_find_opts(
|
|||||||
|
|
||||||
/* some flags imply others */
|
/* some flags imply others */
|
||||||
|
|
||||||
|
if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
|
||||||
|
/* if we are only looking for exact matches, then don't turn
|
||||||
|
* MODIFIED items into ADD/DELETE pairs because it's too picky
|
||||||
|
*/
|
||||||
|
opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
|
||||||
|
|
||||||
|
/* similarly, don't look for self-rewrites to split */
|
||||||
|
opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
|
||||||
|
}
|
||||||
|
|
||||||
if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
|
if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
|
||||||
opts->flags |= GIT_DIFF_FIND_RENAMES;
|
opts->flags |= GIT_DIFF_FIND_RENAMES;
|
||||||
|
|
||||||
@ -373,7 +383,10 @@ static int apply_splits_and_deletes(
|
|||||||
if (git_vector_insert(&onto, deleted) < 0)
|
if (git_vector_insert(&onto, deleted) < 0)
|
||||||
goto on_error;
|
goto on_error;
|
||||||
|
|
||||||
delta->status = GIT_DELTA_ADDED;
|
if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
|
||||||
|
delta->status = GIT_DELTA_UNTRACKED;
|
||||||
|
else
|
||||||
|
delta->status = GIT_DELTA_ADDED;
|
||||||
memset(&delta->old_file, 0, sizeof(delta->old_file));
|
memset(&delta->old_file, 0, sizeof(delta->old_file));
|
||||||
delta->old_file.path = delta->new_file.path;
|
delta->old_file.path = delta->new_file.path;
|
||||||
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
|
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
|
||||||
@ -460,22 +473,56 @@ static int similarity_calc(
|
|||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Non-zero if any bit of flag_name is set in opts.flags. Both arguments are
 * parenthesized so that compound expressions (e.g. FLAG_A | FLAG_B, or *ptr)
 * expand with the intended precedence.
 */
#define FLAG_SET(opts,flag_name) (((opts).flags & (flag_name)) != 0)
|
||||||
|
|
||||||
|
/* - score < 0 means files cannot be compared
|
||||||
|
* - score >= 100 means files are exact match
|
||||||
|
* - score == 0 means files are completely different
|
||||||
|
*/
|
||||||
static int similarity_measure(
|
static int similarity_measure(
|
||||||
|
int *score,
|
||||||
git_diff_list *diff,
|
git_diff_list *diff,
|
||||||
git_diff_find_options *opts,
|
git_diff_find_options *opts,
|
||||||
void **cache,
|
void **cache,
|
||||||
size_t a_idx,
|
size_t a_idx,
|
||||||
size_t b_idx)
|
size_t b_idx)
|
||||||
{
|
{
|
||||||
int score = 0;
|
|
||||||
git_diff_file *a_file = similarity_get_file(diff, a_idx);
|
git_diff_file *a_file = similarity_get_file(diff, a_idx);
|
||||||
git_diff_file *b_file = similarity_get_file(diff, b_idx);
|
git_diff_file *b_file = similarity_get_file(diff, b_idx);
|
||||||
|
bool exact_match = FLAG_SET(*opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
|
||||||
|
|
||||||
|
*score = -1;
|
||||||
|
|
||||||
|
/* don't try to compare files of different types */
|
||||||
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
|
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0)
|
/* if exact match is requested, force calculation of missing OIDs */
|
||||||
return 100;
|
if (exact_match) {
|
||||||
|
if (git_oid_iszero(&a_file->oid) &&
|
||||||
|
diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
|
||||||
|
!git_diff__oid_for_file(diff->repo, a_file->path,
|
||||||
|
a_file->mode, a_file->size, &a_file->oid))
|
||||||
|
a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
|
||||||
|
|
||||||
|
if (git_oid_iszero(&b_file->oid) &&
|
||||||
|
diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
|
||||||
|
!git_diff__oid_for_file(diff->repo, b_file->path,
|
||||||
|
b_file->mode, b_file->size, &b_file->oid))
|
||||||
|
b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check OID match as a quick test */
|
||||||
|
if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
|
||||||
|
*score = 100;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* don't calculate signatures if we are doing exact match */
|
||||||
|
if (exact_match) {
|
||||||
|
*score = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/* update signature cache if needed */
|
/* update signature cache if needed */
|
||||||
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
|
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
|
||||||
@ -488,20 +535,33 @@ static int similarity_measure(
|
|||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* compare signatures */
|
/* compare signatures */
|
||||||
if (opts->metric->similarity(
|
return opts->metric->similarity(
|
||||||
&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
|
score, cache[a_idx], cache[b_idx], opts->metric->payload);
|
||||||
return -1;
|
|
||||||
|
|
||||||
/* clip score */
|
|
||||||
if (score < 1)
|
|
||||||
score = 1; /* zero means uncomparable, so use 1 for least similar */
|
|
||||||
else if (score > 100)
|
|
||||||
score = 100;
|
|
||||||
|
|
||||||
return score;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
|
static void convert_to_rename_and_add(
|
||||||
|
git_diff_list *diff,
|
||||||
|
git_diff_delta *from,
|
||||||
|
git_diff_delta *to,
|
||||||
|
int similarity)
|
||||||
|
{
|
||||||
|
to->status = GIT_DELTA_RENAMED;
|
||||||
|
to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
|
||||||
|
to->similarity = (uint32_t)similarity;
|
||||||
|
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
|
||||||
|
validate_delta(to);
|
||||||
|
|
||||||
|
if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
|
||||||
|
from->status = GIT_DELTA_UNTRACKED;
|
||||||
|
else
|
||||||
|
from->status = GIT_DELTA_ADDED;
|
||||||
|
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
|
||||||
|
from->similarity = 0;
|
||||||
|
memset(&from->old_file, 0, sizeof(from->old_file));
|
||||||
|
from->old_file.path = from->new_file.path;
|
||||||
|
from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
|
||||||
|
validate_delta(from);
|
||||||
|
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t idx;
|
uint32_t idx;
|
||||||
@ -542,21 +602,17 @@ int git_diff_find_similar(
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* skip things that aren't plain blobs */
|
/* skip things that aren't plain blobs */
|
||||||
if (GIT_MODE_TYPE(from->old_file.mode) !=
|
if (!GIT_MODE_ISBLOB(from->old_file.mode))
|
||||||
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* measure similarity from old_file to new_file */
|
/* measure similarity from old_file to new_file */
|
||||||
similarity = similarity_measure(
|
if ((error = similarity_measure(
|
||||||
diff, &opts, cache, 2 * i, 2 * i + 1);
|
&similarity, diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
|
||||||
|
|
||||||
if (similarity < 0) {
|
|
||||||
error = similarity;
|
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
|
||||||
|
|
||||||
if (similarity > 0 &&
|
if (similarity < 0)
|
||||||
similarity < (int)opts.break_rewrite_threshold) {
|
continue;
|
||||||
|
if (similarity < (int)opts.break_rewrite_threshold) {
|
||||||
from->similarity = (uint32_t)similarity;
|
from->similarity = (uint32_t)similarity;
|
||||||
from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
|
from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
|
||||||
num_rewrites++;
|
num_rewrites++;
|
||||||
@ -573,8 +629,7 @@ int git_diff_find_similar(
|
|||||||
matches[i].similarity = 0;
|
matches[i].similarity = 0;
|
||||||
|
|
||||||
/* skip things that aren't plain blobs */
|
/* skip things that aren't plain blobs */
|
||||||
if (GIT_MODE_TYPE(from->old_file.mode) !=
|
if (!GIT_MODE_ISBLOB(from->old_file.mode))
|
||||||
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* don't check UNMODIFIED files as source unless given option */
|
/* don't check UNMODIFIED files as source unless given option */
|
||||||
@ -599,8 +654,7 @@ int git_diff_find_similar(
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* skip things that aren't blobs */
|
/* skip things that aren't blobs */
|
||||||
if (GIT_MODE_TYPE(to->new_file.mode) !=
|
if (!GIT_MODE_ISBLOB(to->new_file.mode))
|
||||||
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
|
/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
|
||||||
@ -630,14 +684,13 @@ int git_diff_find_similar(
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
/* calculate similarity for this pair and find best match */
|
/* calculate similarity for this pair and find best match */
|
||||||
similarity = similarity_measure(
|
if ((error = similarity_measure(
|
||||||
diff, &opts, cache, 2 * i, 2 * j + 1);
|
&similarity, diff, &opts, cache, 2 * i, 2 * j + 1)) < 0)
|
||||||
|
|
||||||
if (similarity < 0) {
|
|
||||||
error = similarity;
|
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
if (similarity < 0) {
|
||||||
|
--tried_targets;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matches[i].similarity < (uint32_t)similarity) {
|
if (matches[i].similarity < (uint32_t)similarity) {
|
||||||
matches[i].similarity = (uint32_t)similarity;
|
matches[i].similarity = (uint32_t)similarity;
|
||||||
matches[i].idx = j;
|
matches[i].idx = j;
|
||||||
@ -687,18 +740,7 @@ int git_diff_find_similar(
|
|||||||
if (similarity < (int)opts.rename_threshold)
|
if (similarity < (int)opts.rename_threshold)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
to->status = GIT_DELTA_RENAMED;
|
convert_to_rename_and_add(diff, from, to, similarity);
|
||||||
to->similarity = (uint32_t)similarity;
|
|
||||||
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
|
|
||||||
validate_delta(to);
|
|
||||||
|
|
||||||
from->status = GIT_DELTA_ADDED;
|
|
||||||
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
|
|
||||||
from->similarity = 0; /* reset self-similarity */
|
|
||||||
memset(&from->old_file, 0, sizeof(from->old_file));
|
|
||||||
from->old_file.path = from->new_file.path;
|
|
||||||
validate_delta(from);
|
|
||||||
|
|
||||||
num_rewrites--;
|
num_rewrites--;
|
||||||
num_updates++;
|
num_updates++;
|
||||||
continue;
|
continue;
|
||||||
@ -712,28 +754,16 @@ int git_diff_find_similar(
|
|||||||
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
|
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
|
||||||
similarity > (int)opts.rename_threshold)
|
similarity > (int)opts.rename_threshold)
|
||||||
{
|
{
|
||||||
int self_similarity = similarity_measure(
|
int self_similarity;
|
||||||
diff, &opts, cache, 2 * i, 2 * i + 1);
|
|
||||||
if (self_similarity < 0) {
|
if ((error = similarity_measure(&self_similarity,
|
||||||
error = self_similarity;
|
diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
|
||||||
|
|
||||||
if (self_similarity < (int)opts.rename_from_rewrite_threshold) {
|
if (self_similarity >= 0 &&
|
||||||
to->status = GIT_DELTA_RENAMED;
|
self_similarity < (int)opts.rename_from_rewrite_threshold) {
|
||||||
to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
|
|
||||||
to->similarity = (uint32_t)similarity;
|
|
||||||
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
|
|
||||||
validate_delta(to);
|
|
||||||
|
|
||||||
from->status = GIT_DELTA_ADDED;
|
|
||||||
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
|
|
||||||
from->similarity = 0;
|
|
||||||
memset(&from->old_file, 0, sizeof(from->old_file));
|
|
||||||
from->old_file.path = from->new_file.path;
|
|
||||||
from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
|
|
||||||
validate_delta(from);
|
|
||||||
|
|
||||||
|
convert_to_rename_and_add(diff, from, to, similarity);
|
||||||
num_updates++;
|
num_updates++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -754,13 +784,10 @@ int git_diff_find_similar(
|
|||||||
num_updates++;
|
num_updates++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (num_rewrites > 0) {
|
if (num_rewrites > 0)
|
||||||
assert(num_rewrites < diff->deltas.length);
|
|
||||||
|
|
||||||
error = apply_splits_and_deletes(
|
error = apply_splits_and_deletes(
|
||||||
diff, diff->deltas.length - num_rewrites,
|
diff, diff->deltas.length - num_rewrites,
|
||||||
FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
|
FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
|
||||||
}
|
|
||||||
|
|
||||||
if (num_rewrites > 0 || num_updates > 0)
|
if (num_rewrites > 0 || num_updates > 0)
|
||||||
git_vector_sort(&diff->deltas);
|
git_vector_sort(&diff->deltas);
|
||||||
|
@ -223,6 +223,7 @@ extern git_off_t git_futils_filesize(git_file fd);
|
|||||||
#define GIT_MODE_PERMS_MASK 0777
|
#define GIT_MODE_PERMS_MASK 0777
|
||||||
#define GIT_CANONICAL_PERMS(MODE) (((MODE) & 0100) ? 0755 : 0644)
|
#define GIT_CANONICAL_PERMS(MODE) (((MODE) & 0100) ? 0755 : 0644)
|
||||||
#define GIT_MODE_TYPE(MODE) ((MODE) & ~GIT_MODE_PERMS_MASK)
|
#define GIT_MODE_TYPE(MODE) ((MODE) & ~GIT_MODE_PERMS_MASK)
|
||||||
|
/* True when MODE has the same type bits as GIT_FILEMODE_BLOB; the permission
 * bits are masked off by GIT_MODE_TYPE before the comparison.
 */
#define GIT_MODE_ISBLOB(MODE) (GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a mode_t from the OS to a legal git mode_t value.
|
* Convert a mode_t from the OS to a legal git mode_t value.
|
||||||
|
Loading…
Reference in New Issue
Block a user