From 9c454b007b57669e7baf2b8b69cf053f32a620a2 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Fri, 11 Jan 2013 22:13:02 -0800 Subject: [PATCH] Initial implementation of similarity scoring algo This adds a new `git_buf_text_hashsig` type and functions to generate these hash signatures and compare them to give a similarity score. This can be plugged into diff similarity scoring. --- src/buf_text.c | 302 +++++++++++++++++++++++++++++++++++++++ src/buf_text.h | 48 +++++++ tests-clar/core/buffer.c | 89 ++++++++++++ 3 files changed, 439 insertions(+) diff --git a/src/buf_text.c b/src/buf_text.c index 3a8f442b4..ab583f830 100644 --- a/src/buf_text.c +++ b/src/buf_text.c @@ -5,6 +5,7 @@ * a Linking Exception. For full terms see the included COPYING file. */ #include "buf_text.h" +#include "fileops.h" int git_buf_text_puts_escaped( git_buf *buf, @@ -212,3 +213,304 @@ bool git_buf_text_gather_stats( return (stats->nul > 0 || ((stats->printable >> 7) < stats->nonprintable)); } + +#define SIMILARITY_MAXRUN 256 +#define SIMILARITY_HASH_START 5381 +#define SIMILARITY_HASH_UPDATE(S,N) (((S) << 5) + (S) + (uint32_t)(N)) + +enum { + SIMILARITY_FORMAT_UNKNOWN = 0, + SIMILARITY_FORMAT_TEXT = 1, + SIMILARITY_FORMAT_BINARY = 2 +}; + +struct git_buf_text_hashsig { + uint32_t *hashes; + size_t size; + size_t asize; + unsigned int format : 2; + unsigned int pairs : 1; +}; + +static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash) +{ + if (sig->size >= sig->asize) { + size_t new_asize = sig->asize + 512; + uint32_t *new_hashes = + git__realloc(sig->hashes, new_asize * sizeof(uint32_t)); + GITERR_CHECK_ALLOC(new_hashes); + + sig->hashes = new_hashes; + sig->asize = new_asize; + } + + sig->hashes[sig->size++] = hash; + return 0; +} + +static int similarity_add_hashes( + git_buf_text_hashsig *sig, + uint32_t *hash_start, + size_t *hashlen_start, + const char *ptr, + size_t len) +{ + int error = 0; + const char *scan = ptr, *scan_end = ptr + len; + char term = (sig->format == SIMILARITY_FORMAT_TEXT) ? '\n' : '\0'; + uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START; + size_t hashlen = hashlen_start ? *hashlen_start : 0; + + while (scan < scan_end) { + char ch = *scan++; + + if (ch == term || hashlen >= SIMILARITY_MAXRUN) { + if ((error = similarity_advance(sig, hash)) < 0) + break; + + hash = SIMILARITY_HASH_START; + hashlen = 0; + + /* skip run of terminators */ + while (scan < scan_end && *scan == term) + scan++; + } else { + hash = SIMILARITY_HASH_UPDATE(hash, ch); + hashlen++; + } + } + + if (hash_start) + *hash_start = hash; + if (hashlen_start) + *hashlen_start = hashlen; + + /* if we're not saving intermediate state, add final hash as needed */ + if (!error && !hash_start && hashlen > 0) + error = similarity_advance(sig, hash); + + return error; +} + +/* + * Decide if \0 or \n terminated runs are a better choice for hashes + */ +static void similarity_guess_format( + git_buf_text_hashsig *sig, const char *ptr, size_t len) +{ + size_t lines = 0, line_length = 0, max_line_length = 0; + size_t runs = 0, run_length = 0, max_run_length = 0; + + /* don't process more than 4k of data for this */ + if (len > 4096) + len = 4096; + + /* gather some stats */ + while (len--) { + char ch = *ptr++; + + if (ch == '\0') { + runs++; + if (max_run_length < run_length) + max_run_length = run_length; + run_length = 0; + } else if (ch == '\n') { + lines++; + if (max_line_length < line_length) + max_line_length = line_length; + line_length = 0; + } else { + run_length++; + line_length++; + } + } + + /* the following heuristic could probably be improved */ + if (lines > runs) + sig->format = SIMILARITY_FORMAT_TEXT; + else if (runs > 0) + sig->format = SIMILARITY_FORMAT_BINARY; + else + sig->format = SIMILARITY_FORMAT_UNKNOWN; +} + +static int similarity_compare_score(const void *a, const void *b) +{ + uint32_t av = *(uint32_t *)a, bv = *(uint32_t *)b; + return (av < bv) ? -1 : (av > bv) ? 1 : 0; +} + +static int similarity_finalize_hashes( + git_buf_text_hashsig *sig, bool generate_pairs) +{ + if (!sig->size) + return 0; + + /* create pairwise hashes if requested */ + + if (generate_pairs) { + size_t i, needed_size = sig->size * 2 - 1; + + if (needed_size > sig->asize) { + uint32_t *new_hashes = + git__realloc(sig->hashes, needed_size * sizeof(uint32_t)); + GITERR_CHECK_ALLOC(new_hashes); + + sig->hashes = new_hashes; + sig->asize = needed_size; + } + + for (i = 1; i < sig->size; ++i) + sig->hashes[sig->size + i - 1] = + SIMILARITY_HASH_UPDATE(sig->hashes[i - 1], sig->hashes[i]); + + sig->pairs = 1; + } + + /* sort all hashes */ + + qsort(sig->hashes, sig->size, sizeof(uint32_t), similarity_compare_score); + + if (generate_pairs) + qsort(&sig->hashes[sig->size], sig->size - 1, sizeof(uint32_t), + similarity_compare_score); + + return 0; +} + +int git_buf_text_hashsig_create( + git_buf_text_hashsig **out, + const git_buf *buf, + bool generate_pairs) +{ + int error; + git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig)); + GITERR_CHECK_ALLOC(sig); + + similarity_guess_format(sig, buf->ptr, buf->size); + + error = similarity_add_hashes(sig, NULL, NULL, buf->ptr, buf->size); + + if (!error) + error = similarity_finalize_hashes(sig, generate_pairs); + + if (!error) + *out = sig; + else + git_buf_text_hashsig_free(sig); + + return error; +} + +int git_buf_text_hashsig_create_fromfile( + git_buf_text_hashsig **out, + const char *path, + bool generate_pairs) +{ + char buf[4096]; + ssize_t buflen = 0; + uint32_t hash = SIMILARITY_HASH_START; + size_t hashlen = 0; + int error = 0, fd; + git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig)); + GITERR_CHECK_ALLOC(sig); + + if ((fd = git_futils_open_ro(path)) < 0) { + git__free(sig); + return fd; + } + + while (!error && (buflen = p_read(fd, buf, sizeof(buf))) > 0) { + if (sig->format == SIMILARITY_FORMAT_UNKNOWN) + similarity_guess_format(sig, buf, buflen); + + error = similarity_add_hashes(sig, &hash, &hashlen, buf, buflen); + } + + if (buflen < 0) { + giterr_set(GITERR_OS, + "Read error on '%s' while calculating similarity hashes", path); + error = (int)buflen; + } + + p_close(fd); + + if (!error && hashlen > 0) + error = similarity_advance(sig, hash); + + if (!error) + error = similarity_finalize_hashes(sig, generate_pairs); + + if (!error) + *out = sig; + else + git_buf_text_hashsig_free(sig); + + return error; +} + +void git_buf_text_hashsig_free(git_buf_text_hashsig *sig) +{ + if (!sig) + return; + + if (sig->hashes) { + git__free(sig->hashes); + sig->hashes = NULL; + } + + git__free(sig); +} + +int git_buf_text_hashsig_compare( + const git_buf_text_hashsig *a, + const git_buf_text_hashsig *b, + int scale) +{ + size_t matches = 0, pairs = 0, total = 0, i, j; + + if (a->format != b->format || !a->size || !b->size) + return 0; + + if (scale <= 0) + scale = 100; + + /* hash lists are sorted - just look for overlap vs total */ + + for (i = 0, j = 0; i < a->size && j < b->size; ) { + uint32_t av = a->hashes[i]; + uint32_t bv = b->hashes[j]; + + if (av < bv) + ++i; + else if (av > bv) + ++j; + else { + ++i; ++j; + ++matches; + } + } + + total = (a->size + b->size); + + if (a->pairs && b->pairs) { + for (i = 0, j = 0; i < a->size - 1 && j < b->size - 1; ) { + uint32_t av = a->hashes[i + a->size]; + uint32_t bv = b->hashes[j + b->size]; + + if (av < bv) + ++i; + else if (av > bv) + ++j; + else { + ++i; ++j; + ++pairs; + } + } + + total += (a->size + b->size - 2); + } + + return (int)(scale * 2 * (matches + pairs) / total); +} + diff --git a/src/buf_text.h b/src/buf_text.h index 458ee33c9..c48c010d3 100644 --- a/src/buf_text.h +++ b/src/buf_text.h @@ -105,4 +105,52 @@ extern int git_buf_text_detect_bom( extern bool git_buf_text_gather_stats( git_buf_text_stats *stats, const git_buf *buf, bool skip_bom); +/** + * Similarity signature of line hashes for a buffer + */ +typedef struct git_buf_text_hashsig git_buf_text_hashsig; + +/** + * Build a similarity signature for a buffer + * + * This can either generate a simple array of hashed lines/runs in the + * file, or it can also keep hashes of pairs of runs in sequence. Adding + * the pairwise runs means the final score will be sensitive to line + * ordering changes as well as individual line contents. + * + * @param out The array of hashed runs representing the file content + * @param buf The contents of the file to hash + * @param generate_pairwise_hashes Should pairwise runs be hashed + */ +extern int git_buf_text_hashsig_create( + git_buf_text_hashsig **out, + const git_buf *buf, + bool generate_pairwise_hashes); + +/** + * Build a similarity signature from a file + * + * This walks through the file, only loading a maximum of 4K of file data at + * a time. Otherwise, it acts just like `git_buf_text_hashsig_create`. + */ +extern int git_buf_text_hashsig_create_fromfile( + git_buf_text_hashsig **out, + const char *path, + bool generate_pairwise_hashes); + +/** + * Release memory for a content similarity signature + */ +extern void git_buf_text_hashsig_free(git_buf_text_hashsig *sig); + +/** + * Measure similarity between two files + * + * @return <0 for error, [0 to scale] as similarity score + */ +extern int git_buf_text_hashsig_compare( + const git_buf_text_hashsig *a, + const git_buf_text_hashsig *b, + int scale); + #endif diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c index 49ab41f71..63753bb67 100644 --- a/tests-clar/core/buffer.c +++ b/tests-clar/core/buffer.c @@ -1,6 +1,7 @@ #include "clar_libgit2.h" #include "buffer.h" #include "buf_text.h" +#include "fileops.h" #define TESTSTR "Have you seen that? Have you seeeen that??" const char *test_string = TESTSTR; @@ -730,3 +731,91 @@ void test_core_buffer__classify_with_utf8(void) cl_assert(git_buf_text_is_binary(&b)); cl_assert(git_buf_text_contains_nul(&b)); } + +void test_core_buffer__similarity_metric(void) +{ + git_buf_text_hashsig *a, *b; + git_buf buf = GIT_BUF_INIT; + int sim; + + /* in the first case, we compare data to itself and expect 100% match */ + + cl_git_pass(git_buf_sets(&buf, "test data\nright here\ninline\ntada")); + cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); + cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true)); + + cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100)); + + git_buf_text_hashsig_free(a); + git_buf_text_hashsig_free(b); + + /* in the second case, half of a is matched and all of b is matched, so + * we'll expect a score of around 66% to be the similarity score + */ + + cl_git_pass( + git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); + cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); + + cl_git_pass(git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh")); + cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true)); + + sim = git_buf_text_hashsig_compare(a, b, 100); + cl_assert(sim > 60 && sim < 70); + + git_buf_text_hashsig_free(a); + git_buf_text_hashsig_free(b); + + /* in the reversed case, 100% of line hashes match, but no pairwise hashes + * match, so we'll expect about a 50% match for a reversed file + */ + + cl_git_pass( + git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); + cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); + cl_git_pass( + git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n")); + cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true)); + + sim = git_buf_text_hashsig_compare(a, b, 100); + cl_assert(sim > 45 && sim < 55); + + git_buf_text_hashsig_free(a); + git_buf_text_hashsig_free(b); + + /* if we don't use pairwise signatures, then a reversed file should + * match 100% + */ + + cl_git_pass( + git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); + cl_git_pass(git_buf_text_hashsig_create(&a, &buf, false)); + cl_git_pass( + git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n")); + cl_git_pass(git_buf_text_hashsig_create(&b, &buf, false)); + + sim = git_buf_text_hashsig_compare(a, b, 100); + cl_assert_equal_i(100, sim); + + git_buf_text_hashsig_free(a); + git_buf_text_hashsig_free(b); + + /* lastly, let's check that we can hash file content as well */ + + cl_git_pass( + git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); + cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); + + cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH)); + cl_git_mkfile("scratch/testdata", + "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"); + cl_git_pass(git_buf_text_hashsig_create_fromfile(&b, "scratch/testdata", true)); + + cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100)); + + git_buf_text_hashsig_free(a); + git_buf_text_hashsig_free(b); + + git_buf_free(&buf); + git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES); +}