mirror of
https://git.proxmox.com/git/libgit2
synced 2025-05-29 18:38:58 +00:00
Added GIT_HASHSIG_ALLOW_SMALL_FILES to allow computing signatures for small files
The implementation of the hashsig API disallows computing a signature on small files containing only a few lines. This new flag disables this behavior. git_diff_find_similar() sets this flag by default which means that rename / copy detection of small files will now work. This in turn affects the behavior of the git_status and git_blame APIs which will now detect rename of small files assuming the right options are passed.
This commit is contained in:
parent
d147900ea4
commit
36fc549781
@ -12,33 +12,52 @@
|
||||
GIT_BEGIN_DECL
|
||||
|
||||
/**
|
||||
* Similarity signature of line hashes for a buffer
|
||||
* Similarity signature of arbitrary text content based on line hashes
|
||||
*/
|
||||
typedef struct git_hashsig git_hashsig;
|
||||
|
||||
/**
|
||||
* Options for hashsig calculation
|
||||
* Options for hashsig computation
|
||||
*
|
||||
* The options GIT_HASHSIG_NORMAL, GIT_HASHSIG_IGNORE_WHITESPACE,
|
||||
* GIT_HASHSIG_SMART_WHITESPACE are exclusive and should not be combined.
|
||||
*/
|
||||
typedef enum {
|
||||
GIT_HASHSIG_NORMAL = 0, /* use all data */
|
||||
GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */
|
||||
GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */
|
||||
/**
|
||||
* Use all data
|
||||
*/
|
||||
GIT_HASHSIG_NORMAL = 0,
|
||||
|
||||
/**
|
||||
* Ignore whitespace
|
||||
*/
|
||||
GIT_HASHSIG_IGNORE_WHITESPACE = (1 << 0),
|
||||
|
||||
/**
|
||||
* Ignore \r and all space after \n
|
||||
*/
|
||||
GIT_HASHSIG_SMART_WHITESPACE = (1 << 1),
|
||||
|
||||
/**
|
||||
* Allow hashing of small files
|
||||
*/
|
||||
GIT_HASHSIG_ALLOW_SMALL_FILES = (1 << 2)
|
||||
} git_hashsig_option_t;
|
||||
|
||||
/**
|
||||
* Build a similarity signature for a buffer
|
||||
* Compute a similarity signature for a text buffer
|
||||
*
|
||||
* If you have passed a whitespace-ignoring buffer, then the whitespace
|
||||
* will be removed from the buffer while it is being processed, modifying
|
||||
* the buffer in place. Sorry about that!
|
||||
* If you have passed the option GIT_HASHSIG_IGNORE_WHITESPACE, then the
|
||||
* whitespace will be removed from the buffer while it is being processed,
|
||||
* modifying the buffer in place. Sorry about that!
|
||||
*
|
||||
* This will return an error if the buffer doesn't contain enough data to
|
||||
* compute a valid signature.
|
||||
*
|
||||
* @param out The array of hashed runs representing the file content
|
||||
* @param buf The contents of the file to hash
|
||||
* @param buflen The length of the data at `buf`
|
||||
* @param generate_pairwise_hashes Should pairwise runs be hashed
|
||||
* @param out The computed similarity signature.
|
||||
* @param buf The input buffer.
|
||||
* @param buflen The input buffer size.
|
||||
* @param opts The signature computation options (see above).
|
||||
* @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
|
||||
* compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
|
||||
* error code.
|
||||
*/
|
||||
GIT_EXTERN(int) git_hashsig_create(
|
||||
git_hashsig **out,
|
||||
@ -47,13 +66,17 @@ GIT_EXTERN(int) git_hashsig_create(
|
||||
git_hashsig_option_t opts);
|
||||
|
||||
/**
|
||||
* Build a similarity signature from a file
|
||||
* Compute a similarity signature for a text file
|
||||
*
|
||||
* This walks through the file, only loading a maximum of 4K of file data at
|
||||
* a time. Otherwise, it acts just like `git_hashsig_create`.
|
||||
* a time. Otherwise, it acts just like `git_hashsig_create`.
|
||||
*
|
||||
* This will return an error if the file doesn't contain enough data to
|
||||
* compute a valid signature.
|
||||
* @param out The computed similarity signature.
|
||||
* @param path The path to the input file.
|
||||
* @param opts The signature computation options (see above).
|
||||
* @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
|
||||
* compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
|
||||
* error code.
|
||||
*/
|
||||
GIT_EXTERN(int) git_hashsig_create_fromfile(
|
||||
git_hashsig **out,
|
||||
@ -62,13 +85,17 @@ GIT_EXTERN(int) git_hashsig_create_fromfile(
|
||||
|
||||
/**
|
||||
* Release memory for a content similarity signature
|
||||
*
|
||||
* @param sig The similarity signature to free.
|
||||
*/
|
||||
GIT_EXTERN(void) git_hashsig_free(git_hashsig *sig);
|
||||
|
||||
/**
|
||||
* Measure similarity between two files
|
||||
* Measure similarity score between two similarity signatures
|
||||
*
|
||||
* @return <0 for error, [0 to 100] as similarity score
|
||||
* @param a The first similarity signature to compare.
|
||||
* @param b The second similarity signature to compare.
|
||||
* @return [0 to 100] on success as the similarity score, or error code.
|
||||
*/
|
||||
GIT_EXTERN(int) git_hashsig_compare(
|
||||
const git_hashsig *a,
|
||||
|
@ -219,34 +219,18 @@ int git_diff_find_similar__hashsig_for_file(
|
||||
void **out, const git_diff_file *f, const char *path, void *p)
|
||||
{
|
||||
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
|
||||
int error = 0;
|
||||
|
||||
GIT_UNUSED(f);
|
||||
error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
|
||||
|
||||
if (error == GIT_EBUFS) {
|
||||
error = 0;
|
||||
giterr_clear();
|
||||
}
|
||||
|
||||
return error;
|
||||
return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
|
||||
}
|
||||
|
||||
int git_diff_find_similar__hashsig_for_buf(
|
||||
void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
|
||||
{
|
||||
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
|
||||
int error = 0;
|
||||
|
||||
GIT_UNUSED(f);
|
||||
error = git_hashsig_create((git_hashsig **)out, buf, len, opt);
|
||||
|
||||
if (error == GIT_EBUFS) {
|
||||
error = 0;
|
||||
giterr_clear();
|
||||
}
|
||||
|
||||
return error;
|
||||
return git_hashsig_create((git_hashsig **)out, buf, len, opt);
|
||||
}
|
||||
|
||||
void git_diff_find_similar__hashsig_free(void *sig, void *payload)
|
||||
@ -258,8 +242,14 @@ void git_diff_find_similar__hashsig_free(void *sig, void *payload)
|
||||
int git_diff_find_similar__calc_similarity(
|
||||
int *score, void *siga, void *sigb, void *payload)
|
||||
{
|
||||
int error;
|
||||
|
||||
GIT_UNUSED(payload);
|
||||
*score = git_hashsig_compare(siga, sigb);
|
||||
error = git_hashsig_compare(siga, sigb);
|
||||
if (error < 0)
|
||||
return error;
|
||||
|
||||
*score = error;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -273,6 +263,7 @@ static int normalize_find_opts(
|
||||
const git_diff_find_options *given)
|
||||
{
|
||||
git_config *cfg = NULL;
|
||||
git_hashsig_option_t hashsig_opts;
|
||||
|
||||
GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
|
||||
|
||||
@ -354,11 +345,13 @@ static int normalize_find_opts(
|
||||
opts->metric->similarity = git_diff_find_similar__calc_similarity;
|
||||
|
||||
if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
|
||||
opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE;
|
||||
hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
|
||||
else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
|
||||
opts->metric->payload = (void *)GIT_HASHSIG_NORMAL;
|
||||
hashsig_opts = GIT_HASHSIG_NORMAL;
|
||||
else
|
||||
opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE;
|
||||
hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
|
||||
hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
|
||||
opts->metric->payload = (void *)hashsig_opts;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -35,7 +35,6 @@ struct git_hashsig {
|
||||
hashsig_heap mins;
|
||||
hashsig_heap maxs;
|
||||
git_hashsig_option_t opt;
|
||||
int considered;
|
||||
};
|
||||
|
||||
#define HEAP_LCHILD_OF(I) (((I)<<1)+1)
|
||||
@ -135,25 +134,23 @@ static void hashsig_in_progress_init(
|
||||
{
|
||||
int i;
|
||||
|
||||
switch (sig->opt) {
|
||||
case GIT_HASHSIG_IGNORE_WHITESPACE:
|
||||
/* no more than one can be set */
|
||||
assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
|
||||
!(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
|
||||
|
||||
if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
|
||||
for (i = 0; i < 256; ++i)
|
||||
prog->ignore_ch[i] = git__isspace_nonlf(i);
|
||||
prog->use_ignores = 1;
|
||||
break;
|
||||
case GIT_HASHSIG_SMART_WHITESPACE:
|
||||
} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
|
||||
for (i = 0; i < 256; ++i)
|
||||
prog->ignore_ch[i] = git__isspace(i);
|
||||
prog->use_ignores = 1;
|
||||
break;
|
||||
default:
|
||||
} else {
|
||||
memset(prog, 0, sizeof(*prog));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#define HASHSIG_IN_PROGRESS_INIT { 1 }
|
||||
|
||||
static int hashsig_add_hashes(
|
||||
git_hashsig *sig,
|
||||
const uint8_t *data,
|
||||
@ -174,12 +171,13 @@ static int hashsig_add_hashes(
|
||||
if (use_ignores)
|
||||
for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
|
||||
++scan;
|
||||
else if (sig->opt != GIT_HASHSIG_NORMAL)
|
||||
else if (sig->opt &
|
||||
(GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
|
||||
for (; scan < end && ch == '\r'; ch = *scan)
|
||||
++scan;
|
||||
|
||||
/* peek at next character to decide what to do next */
|
||||
if (sig->opt == GIT_HASHSIG_SMART_WHITESPACE)
|
||||
if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
|
||||
use_ignores = (ch == '\n');
|
||||
|
||||
if (scan >= end)
|
||||
@ -198,8 +196,6 @@ static int hashsig_add_hashes(
|
||||
hashsig_heap_insert(&sig->mins, (hashsig_t)state);
|
||||
hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
|
||||
|
||||
sig->considered++;
|
||||
|
||||
while (scan < end && (*scan == '\n' || !*scan))
|
||||
++scan;
|
||||
}
|
||||
@ -212,7 +208,8 @@ static int hashsig_add_hashes(
|
||||
|
||||
static int hashsig_finalize_hashes(git_hashsig *sig)
|
||||
{
|
||||
if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE) {
|
||||
if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
|
||||
!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
|
||||
giterr_set(GITERR_INVALID,
|
||||
"File too small for similarity signature calculation");
|
||||
return GIT_EBUFS;
|
||||
|
@ -53,6 +53,11 @@ void cl_git_rewritefile(const char *path, const char *content)
|
||||
cl_git_write2file(path, content, 0, O_WRONLY | O_CREAT | O_TRUNC, 0644);
|
||||
}
|
||||
|
||||
void cl_git_rmfile(const char *filename)
|
||||
{
|
||||
cl_must_pass(p_unlink(filename));
|
||||
}
|
||||
|
||||
#ifdef GIT_WIN32
|
||||
|
||||
#include "win32/utf-conv.h"
|
||||
|
@ -112,6 +112,7 @@ void cl_git_append2file(const char *filename, const char *new_content);
|
||||
void cl_git_rewritefile(const char *filename, const char *new_content);
|
||||
void cl_git_write2file(const char *path, const char *data,
|
||||
size_t datalen, int flags, unsigned int mode);
|
||||
void cl_git_rmfile(const char *filename);
|
||||
|
||||
bool cl_toggle_filemode(const char *filename);
|
||||
bool cl_is_chmod_supported(void);
|
||||
|
@ -381,37 +381,53 @@ void test_diff_rename__not_exact_match(void)
|
||||
git_tree_free(new_tree);
|
||||
}
|
||||
|
||||
void test_diff_rename__handles_small_files(void)
|
||||
void test_diff_rename__test_small_files(void)
|
||||
{
|
||||
const char *tree_sha = "2bc7f351d20b53f1c72c16c4b036e491c478c49a";
|
||||
git_index *index;
|
||||
git_tree *tree;
|
||||
git_reference *head_reference;
|
||||
git_commit *head_commit;
|
||||
git_tree *head_tree;
|
||||
git_tree *commit_tree;
|
||||
git_signature *signature;
|
||||
git_diff *diff;
|
||||
git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT;
|
||||
git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
|
||||
git_oid oid;
|
||||
const git_diff_delta *delta;
|
||||
git_diff_options diff_options = GIT_DIFF_OPTIONS_INIT;
|
||||
git_diff_find_options find_options = GIT_DIFF_FIND_OPTIONS_INIT;
|
||||
|
||||
cl_git_pass(git_repository_index(&index, g_repo));
|
||||
|
||||
tree = resolve_commit_oid_to_tree(g_repo, tree_sha);
|
||||
cl_git_mkfile("renames/small.txt", "Hello World!\n");
|
||||
cl_git_pass(git_index_add_bypath(index, "small.txt"));
|
||||
|
||||
cl_git_rewritefile("renames/songof7cities.txt", "single line\n");
|
||||
cl_git_pass(git_index_add_bypath(index, "songof7cities.txt"));
|
||||
cl_git_pass(git_repository_head(&head_reference, g_repo));
|
||||
cl_git_pass(git_reference_peel((git_object**)&head_commit, head_reference, GIT_OBJ_COMMIT));
|
||||
cl_git_pass(git_commit_tree(&head_tree, head_commit));
|
||||
cl_git_pass(git_index_write_tree(&oid, index));
|
||||
cl_git_pass(git_tree_lookup(&commit_tree, g_repo, &oid));
|
||||
cl_git_pass(git_signature_new(&signature, "Rename", "rename@example.com", 1404157834, 0));
|
||||
cl_git_pass(git_commit_create(&oid, g_repo, "HEAD", signature, signature, NULL, "Test commit", commit_tree, 1, (const git_commit**)&head_commit));
|
||||
|
||||
cl_git_rewritefile("renames/untimely.txt", "untimely\n");
|
||||
cl_git_pass(git_index_add_bypath(index, "untimely.txt"));
|
||||
cl_git_mkfile("renames/copy.txt", "Hello World!\n");
|
||||
cl_git_rmfile("renames/small.txt");
|
||||
|
||||
/* Tests that we can invoke find_similar on small files
|
||||
* and that the GIT_EBUFS (too small) error code is not
|
||||
* propagated to the caller.
|
||||
*/
|
||||
cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts));
|
||||
diff_options.flags = GIT_DIFF_INCLUDE_UNTRACKED;
|
||||
cl_git_pass(git_diff_tree_to_workdir(&diff, g_repo, commit_tree, &diff_options));
|
||||
find_options.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_FOR_UNTRACKED;
|
||||
cl_git_pass(git_diff_find_similar(diff, &find_options));
|
||||
|
||||
opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES |
|
||||
GIT_DIFF_FIND_AND_BREAK_REWRITES;
|
||||
cl_git_pass(git_diff_find_similar(diff, &opts));
|
||||
cl_assert_equal_i(git_diff_num_deltas(diff), 1);
|
||||
delta = git_diff_get_delta(diff, 0);
|
||||
cl_assert_equal_i(delta->status, GIT_DELTA_RENAMED);
|
||||
cl_assert_equal_s(delta->old_file.path, "small.txt");
|
||||
cl_assert_equal_s(delta->new_file.path, "copy.txt");
|
||||
|
||||
git_diff_free(diff);
|
||||
git_tree_free(tree);
|
||||
git_signature_free(signature);
|
||||
git_tree_free(commit_tree);
|
||||
git_tree_free(head_tree);
|
||||
git_commit_free(head_commit);
|
||||
git_reference_free(head_reference);
|
||||
git_index_free(index);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user