mirror of
https://git.proxmox.com/git/libgit2
synced 2025-05-03 00:43:41 +00:00
Refine pluggable similarity API
This plugs in the three basic similarity strategies for handling whitespace via internal use of the pluggable API. In so doing, I realized that the use of git_buf in the hashsig API was not needed and actually just made it harder to use, so I tweaked that API as well. Note that the similarity metric is still not hooked up in the find_similarity code - this is just setting out the function that will be used.
This commit is contained in:
parent
a235e9d355
commit
9bc8be3d7e
@ -387,20 +387,46 @@ typedef enum {
|
||||
|
||||
/** split large rewrites into delete/add pairs (`--break-rewrites=/M`) */
|
||||
GIT_DIFF_FIND_AND_BREAK_REWRITES = (1 << 4),
|
||||
|
||||
/** measure similarity ignoring leading whitespace (default) */
|
||||
GIT_DIFF_FIND_IGNORE_LEADING_WHITESPACE = 0,
|
||||
/** measure similarity ignoring all whitespace */
|
||||
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 6),
|
||||
/** measure similarity including all data */
|
||||
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 7),
|
||||
} git_diff_find_t;
|
||||
|
||||
/**
|
||||
* Pluggable similarity metric
|
||||
*/
|
||||
typedef struct {
|
||||
int (*calc_signature)(void **out, const git_diff_file *file, void *payload);
|
||||
int (*file_signature)(
|
||||
void **out, const git_diff_file *file,
|
||||
const char *fullpath, void *payload);
|
||||
int (*buffer_signature)(
|
||||
void **out, const git_diff_file *file,
|
||||
const char *buf, size_t buflen, void *payload);
|
||||
void (*free_signature)(void *sig, void *payload);
|
||||
int (*calc_similarity)(int *score, void *siga, void *sigb, void *payload);
|
||||
int (*similarity)(int *score, void *siga, void *sigb, void *payload);
|
||||
void *payload;
|
||||
} git_diff_similarity_metric;
|
||||
|
||||
/**
|
||||
* Control behavior of rename and copy detection
|
||||
*
|
||||
* These options mostly mimic parameters that can be passed to git-diff.
|
||||
*
|
||||
* - `rename_threshold` is the same as the -M option with a value
|
||||
* - `copy_threshold` is the same as the -C option with a value
|
||||
* - `rename_from_rewrite_threshold` matches the top of the -B option
|
||||
* - `break_rewrite_threshold` matches the bottom of the -B option
|
||||
* - `target_limit` matches the -l option
|
||||
*
|
||||
* The `metric` option allows you to plug in a custom similarity metric.
|
||||
* Set it to NULL for the default internal metric which is based on sampling
|
||||
* hashes of ranges of data in the file. The default metric is a pretty
|
||||
* good similarity approximation that should work fairly well for both text
|
||||
* and binary data, and is pretty fast with fixed memory overhead.
|
||||
*/
|
||||
typedef struct {
|
||||
unsigned int version;
|
||||
|
@ -169,6 +169,60 @@ int git_diff_merge(
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
 * Generate the two static signature callbacks for one hashsig option:
 *
 *   find_similar__hashsig_for_file<NAME> - signature of the file at `fullpath`
 *   find_similar__hashsig_for_buf<NAME>  - signature of `datalen` bytes at `data`
 *
 * Both forward OPT to the git_hashsig constructor and ignore the diff file
 * and payload arguments.
 */
#define FIND_SIMILAR_HASHSIG(NAME,OPT) \
	static int find_similar__hashsig_for_file ## NAME( \
		void **out, const git_diff_file *file, \
		const char *fullpath, void *payload) \
	{ \
		GIT_UNUSED(file); \
		GIT_UNUSED(payload); \
		return git_hashsig_create_fromfile( \
			(git_hashsig **)out, fullpath, OPT); \
	} \
	static int find_similar__hashsig_for_buf ## NAME( \
		void **out, const git_diff_file *file, \
		const char *data, size_t datalen, void *payload) \
	{ \
		GIT_UNUSED(file); \
		GIT_UNUSED(payload); \
		return git_hashsig_create( \
			(git_hashsig **)out, data, datalen, OPT); \
	}
|
||||
|
||||
FIND_SIMILAR_HASHSIG(_default, GIT_HASHSIG_SMART_WHITESPACE);
|
||||
FIND_SIMILAR_HASHSIG(_ignore_whitespace, GIT_HASHSIG_IGNORE_WHITESPACE);
|
||||
FIND_SIMILAR_HASHSIG(_include_whitespace, GIT_HASHSIG_NORMAL);
|
||||
|
||||
static void find_similar__hashsig_free(void *sig, void *payload)
|
||||
{
|
||||
GIT_UNUSED(payload);
|
||||
git_hashsig_free(sig);
|
||||
}
|
||||
|
||||
/*
 * `similarity` callback shared by the built-in metrics: compares two
 * hashsig signatures and stores the result in `*score`.
 *
 * A negative value from git_hashsig_compare() is treated as an error code
 * and returned as-is, leaving `*score` untouched, rather than being handed
 * back as a successful (negative) score.  Returns 0 on success.
 */
static int find_similar__calc_similarity(
	int *score, void *siga, void *sigb, void *payload)
{
	int result;

	GIT_UNUSED(payload);

	result = git_hashsig_compare(siga, sigb);
	if (result < 0)
		return result;

	*score = result;
	return 0;
}
|
||||
|
||||
static git_diff_similarity_metric find_similar__internal_metrics[3] = {
|
||||
{
|
||||
find_similar__hashsig_for_file_default,
|
||||
find_similar__hashsig_for_buf_default,
|
||||
find_similar__hashsig_free,
|
||||
find_similar__calc_similarity,
|
||||
NULL
|
||||
},
|
||||
{
|
||||
find_similar__hashsig_for_file_ignore_whitespace,
|
||||
find_similar__hashsig_for_buf_ignore_whitespace,
|
||||
find_similar__hashsig_free,
|
||||
find_similar__calc_similarity,
|
||||
NULL
|
||||
},
|
||||
{
|
||||
find_similar__hashsig_for_file_include_whitespace,
|
||||
find_similar__hashsig_for_buf_include_whitespace,
|
||||
find_similar__hashsig_free,
|
||||
find_similar__calc_similarity,
|
||||
NULL
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Defaults for the find-similar options.
 * NOTE(review): their use is outside this excerpt - presumably the
 * thresholds are percentages and the limit is a count; confirm against
 * normalize_find_opts().
 */
#define DEFAULT_THRESHOLD               50
#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
#define DEFAULT_TARGET_LIMIT            200
|
||||
@ -237,6 +291,16 @@ static int normalize_find_opts(
|
||||
opts->target_limit = limit;
|
||||
}
|
||||
|
||||
/* if no custom metric was supplied, pick the internal one matching the whitespace flags */
|
||||
if (!opts->metric) {
|
||||
if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
|
||||
opts->metric = &find_similar__internal_metrics[1];
|
||||
else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
|
||||
opts->metric = &find_similar__internal_metrics[2];
|
||||
else
|
||||
opts->metric = &find_similar__internal_metrics[0];
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -266,7 +266,8 @@ static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
|
||||
|
||||
int git_hashsig_create(
|
||||
git_hashsig **out,
|
||||
const git_buf *buf,
|
||||
const char *buf,
|
||||
size_t buflen,
|
||||
git_hashsig_option_t opts)
|
||||
{
|
||||
int error;
|
||||
@ -274,7 +275,7 @@ int git_hashsig_create(
|
||||
git_hashsig *sig = hashsig_alloc(opts);
|
||||
GITERR_CHECK_ALLOC(sig);
|
||||
|
||||
error = hashsig_add_hashes(sig, buf->ptr, buf->size, &prog);
|
||||
error = hashsig_add_hashes(sig, buf, buflen, &prog);
|
||||
|
||||
if (!error)
|
||||
error = hashsig_finalize_hashes(sig);
|
||||
|
@ -7,7 +7,7 @@
|
||||
#ifndef INCLUDE_hashsig_h__
|
||||
#define INCLUDE_hashsig_h__
|
||||
|
||||
#include "buffer.h"
|
||||
#include "common.h"
|
||||
|
||||
/**
|
||||
* Similarity signature of line hashes for a buffer
|
||||
@ -32,11 +32,13 @@ typedef enum {
|
||||
*
|
||||
* @param out The array of hashed runs representing the file content
|
||||
* @param buf The contents of the file to hash
|
||||
* @param buflen The length of the data at `buf`
|
||||
* @param opts Signature options (a git_hashsig_option_t value, e.g. GIT_HASHSIG_NORMAL)
|
||||
*/
|
||||
extern int git_hashsig_create(
|
||||
git_hashsig **out,
|
||||
const git_buf *buf,
|
||||
const char *buf,
|
||||
size_t buflen,
|
||||
git_hashsig_option_t opts);
|
||||
|
||||
/**
|
||||
|
@ -748,8 +748,8 @@ void test_core_buffer__similarity_metric(void)
|
||||
/* in the first case, we compare data to itself and expect 100% match */
|
||||
|
||||
cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
|
||||
cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&a, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&b, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
|
||||
cl_assert_equal_i(100, git_hashsig_compare(a, b));
|
||||
|
||||
@ -759,13 +759,13 @@ void test_core_buffer__similarity_metric(void)
|
||||
/* if we change just a single byte, how much does that change magnify? */
|
||||
|
||||
cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
|
||||
cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&a, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_buf_sets(&buf,
|
||||
"Test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
|
||||
"is this enough?\nthere has to be enough data to fill the hash array!\n"
|
||||
"Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n"
|
||||
"Let's make sure we've got plenty to go with here.\n smile \n"));
|
||||
cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&b, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
|
||||
sim = git_hashsig_compare(a, b);
|
||||
|
||||
@ -777,10 +777,10 @@ void test_core_buffer__similarity_metric(void)
|
||||
/* let's try comparing data to a superset of itself */
|
||||
|
||||
cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
|
||||
cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&a, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1
|
||||
"and if I add some more, it should still be pretty similar, yes?\n"));
|
||||
cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&b, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
|
||||
sim = git_hashsig_compare(a, b);
|
||||
|
||||
@ -792,13 +792,13 @@ void test_core_buffer__similarity_metric(void)
|
||||
/* what if we keep about half the original data and add half new */
|
||||
|
||||
cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
|
||||
cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&a, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_buf_sets(&buf,
|
||||
"test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
|
||||
"is this enough?\nthere has to be enough data to fill the hash array!\n"
|
||||
"okay, that's half the original\nwhat else can we add?\nmore data\n"
|
||||
"one more line will complete this\nshort\nlines\ndon't\nmatter\n"));
|
||||
cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&b, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
|
||||
sim = git_hashsig_compare(a, b);
|
||||
|
||||
@ -810,7 +810,7 @@ void test_core_buffer__similarity_metric(void)
|
||||
/* lastly, let's check that we can hash file content as well */
|
||||
|
||||
cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
|
||||
cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
|
||||
cl_git_pass(git_hashsig_create(&a, buf.ptr, buf.size, GIT_HASHSIG_NORMAL));
|
||||
|
||||
cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH));
|
||||
cl_git_mkfile("scratch/testdata", SIMILARITY_TEST_DATA_1);
|
||||
@ -880,10 +880,10 @@ void test_core_buffer__similarity_metric_whitespace(void)
|
||||
for (i = 0; i < 3; ++i) {
|
||||
for (j = 0; j < 3; ++j) {
|
||||
cl_git_pass(git_buf_sets(&buf, text[i]));
|
||||
cl_git_pass(git_hashsig_create(&a, &buf, opt));
|
||||
cl_git_pass(git_hashsig_create(&a, buf.ptr, buf.size, opt));
|
||||
|
||||
cl_git_pass(git_buf_sets(&buf, text[j]));
|
||||
cl_git_pass(git_hashsig_create(&b, &buf, opt));
|
||||
cl_git_pass(git_hashsig_create(&b, buf.ptr, buf.size, opt));
|
||||
|
||||
sim = git_hashsig_compare(a, b);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user