Initial implementation of similarity scoring algo

This adds a new `git_buf_text_hashsig` type and functions to generate these hash signatures and compare them to give a similarity score. This can be plugged into diff similarity scoring.
2025-11-04 20:36:17 +00:00 · 2013-01-11 22:13:02 -08:00 · 2013-01-11 22:13:02 -08:00 · 9c454b007b
commit 9c454b007b
parent f2e1d06064
3 changed files with 439 additions and 0 deletions
--- a/src/buf_text.c
+++ b/src/buf_text.c
@ -5,6 +5,7 @@
 * a Linking Exception. For full terms see the included COPYING file.
 */
 #include "buf_text.h"
+#include "fileops.h"

 int git_buf_text_puts_escaped(
 	git_buf *buf,
@ -212,3 +213,304 @@ bool git_buf_text_gather_stats(
 	return (stats->nul > 0 ||
 		((stats->printable >> 7) < stats->nonprintable));
 }
+
+#define SIMILARITY_MAXRUN 256 /* a run is cut once this many bytes have been hashed into it */
+#define SIMILARITY_HASH_START  5381 /* seed value every run hash starts from */
+#define SIMILARITY_HASH_UPDATE(S,N) (((S) << 5) + (S) + (uint32_t)(N)) /* S * 33 + N; note S is expanded twice */
+
+enum { /* values stored in the 2-bit `format` field of a signature */
+	SIMILARITY_FORMAT_UNKNOWN = 0, /* state of a freshly calloc'd signature; runs split on '\0' */
+	SIMILARITY_FORMAT_TEXT = 1, /* runs split on '\n' */
+	SIMILARITY_FORMAT_BINARY = 2 /* runs split on '\0' */
+};
+
+struct git_buf_text_hashsig {
+	uint32_t *hashes; /* `size` run hashes; when `pairs` is set, `size - 1` pair hashes follow them */
+	size_t size; /* number of run hashes stored (pair hashes are not counted) */
+	size_t asize; /* number of uint32_t slots allocated for `hashes` */
+	unsigned int format : 2; /* one of the SIMILARITY_FORMAT_* values */
+	unsigned int pairs : 1; /* set to 1 once pairwise hashes have been generated */
+};
+
+/*
+ * Append one run hash to the signature, enlarging the hash array by 512
+ * entries whenever it is full.  Returns 0 once the hash is stored; a failed
+ * reallocation is reported through GITERR_CHECK_ALLOC.
+ */
+static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
+{
+	if (sig->asize <= sig->size) {
+		size_t grown_count = sig->asize + 512;
+		uint32_t *grown =
+			git__realloc(sig->hashes, grown_count * sizeof(uint32_t));
+		GITERR_CHECK_ALLOC(grown);
+
+		sig->asize  = grown_count;
+		sig->hashes = grown;
+	}
+
+	sig->hashes[sig->size] = hash;
+	sig->size++;
+
+	return 0;
+}
+
+/*
+ * Feed `len` bytes at `ptr` into the signature, appending one hash per run.
+ *
+ * A run ends at the format's terminator ('\n' for text, '\0' otherwise) or
+ * once it holds SIMILARITY_MAXRUN bytes.  Empty runs (leading or repeated
+ * terminators) produce no hash, so the result does not depend on how the
+ * data is split into chunks.
+ *
+ * If `hash_start` / `hashlen_start` are given, they carry the state of the
+ * unfinished run in and out so a caller can stream data in pieces; the
+ * caller must then add the final partial run itself.  If `hash_start` is
+ * NULL, a trailing unterminated run is added here.
+ *
+ * Returns 0 on success or the error reported by similarity_advance.
+ */
+static int similarity_add_hashes(
+	git_buf_text_hashsig *sig,
+	uint32_t *hash_start,
+	size_t *hashlen_start,
+	const char *ptr,
+	size_t len)
+{
+	int error = 0;
+	const char *scan = ptr, *scan_end = ptr + len;
+	char term = (sig->format == SIMILARITY_FORMAT_TEXT) ? '\n' : '\0';
+	uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
+	size_t hashlen = hashlen_start ? *hashlen_start : 0;
+
+	while (scan < scan_end) {
+		char ch = *scan++;
+
+		if (ch == term) {
+			/* a terminator with nothing pending is an empty run: skip it */
+			if (!hashlen)
+				continue;
+		} else if (hashlen < SIMILARITY_MAXRUN) {
+			hash = SIMILARITY_HASH_UPDATE(hash, ch);
+			hashlen++;
+			continue;
+		}
+
+		/* run is complete: terminator seen, or run is full and ch overflows it */
+		if ((error = similarity_advance(sig, hash)) < 0)
+			break;
+
+		hash = SIMILARITY_HASH_START;
+		hashlen = 0;
+
+		if (ch != term) {
+			/* the byte that overflowed the previous run starts the next one */
+			hash = SIMILARITY_HASH_UPDATE(hash, ch);
+			hashlen = 1;
+		}
+	}
+
+	if (hash_start)
+		*hash_start = hash;
+	if (hashlen_start)
+		*hashlen_start = hashlen;
+
+	/* if we're not saving intermediate state, add final hash as needed */
+	if (!error && !hash_start && hashlen > 0)
+		error = similarity_advance(sig, hash);
+
+	return error;
+}
+
+/*
+ * Decide if \0 or \n terminated runs are a better choice for hashes.
+ *
+ * Counts the NUL bytes and newlines in at most the first 4096 bytes of
+ * `ptr` and stores the verdict in `sig->format`: more newlines than NULs
+ * means text, otherwise any NUL means binary, and data containing neither
+ * is marked unknown.
+ */
+static void similarity_guess_format(
+	git_buf_text_hashsig *sig, const char *ptr, size_t len)
+{
+	size_t lines = 0, runs = 0;
+
+	/* don't process more than 4k of data for this */
+	if (len > 4096)
+		len = 4096;
+
+	/* gather some stats */
+	while (len--) {
+		char ch = *ptr++;
+
+		if (ch == '\0')
+			runs++;
+		else if (ch == '\n')
+			lines++;
+	}
+
+	/* the following heuristic could probably be improved */
+	if (lines > runs)
+		sig->format = SIMILARITY_FORMAT_TEXT;
+	else if (runs > 0)
+		sig->format = SIMILARITY_FORMAT_BINARY;
+	else
+		sig->format = SIMILARITY_FORMAT_UNKNOWN;
+}
+
+/*
+ * qsort comparator ordering uint32_t hashes ascending; returns -1, 0 or 1.
+ * The elements are only read, so they are accessed through const pointers.
+ */
+static int similarity_compare_score(const void *a, const void *b)
+{
+	const uint32_t av = *(const uint32_t *)a;
+	const uint32_t bv = *(const uint32_t *)b;
+
+	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
+}
+
+/*
+ * Finish a signature once all run hashes have been collected.
+ *
+ * When `generate_pairs` is set, a hash of every adjacent pair of run hashes
+ * (in their original order) is stored after the `size` run hashes and the
+ * `pairs` flag is raised.  The run hashes and the pair hashes are then each
+ * sorted ascending.  A signature without any hashes is left untouched.
+ *
+ * Returns 0; a failed reallocation is reported through GITERR_CHECK_ALLOC.
+ */
+static int similarity_finalize_hashes(
+	git_buf_text_hashsig *sig, bool generate_pairs)
+{
+	const size_t count = sig->size;
+
+	if (count == 0)
+		return 0;
+
+	if (generate_pairs) {
+		/* pair hashes must be derived before the run hashes get sorted */
+		size_t k, total = count * 2 - 1;
+
+		if (sig->asize < total) {
+			uint32_t *grown =
+				git__realloc(sig->hashes, total * sizeof(uint32_t));
+			GITERR_CHECK_ALLOC(grown);
+
+			sig->asize  = total;
+			sig->hashes = grown;
+		}
+
+		for (k = 0; k + 1 < count; ++k)
+			sig->hashes[count + k] =
+				SIMILARITY_HASH_UPDATE(sig->hashes[k], sig->hashes[k + 1]);
+
+		sig->pairs = 1;
+	}
+
+	/* run hashes occupy [0, count); pair hashes occupy [count, 2*count - 1) */
+	qsort(sig->hashes, count, sizeof(uint32_t), similarity_compare_score);
+
+	if (generate_pairs)
+		qsort(sig->hashes + count, count - 1, sizeof(uint32_t),
+			similarity_compare_score);
+
+	return 0;
+}
+
+/*
+ * Build a signature from an in-memory buffer: guess the run format, hash
+ * every run of the buffer, then finalize (optionally with pair hashes).
+ * On success `*out` receives the new signature and 0 is returned; on
+ * failure the partial signature is freed and the error is returned.
+ */
+int git_buf_text_hashsig_create(
+	git_buf_text_hashsig **out,
+	const git_buf *buf,
+	bool generate_pairs)
+{
+	int error;
+	git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig));
+	GITERR_CHECK_ALLOC(sig);
+
+	similarity_guess_format(sig, buf->ptr, buf->size);
+
+	error = similarity_add_hashes(sig, NULL, NULL, buf->ptr, buf->size);
+	if (error == 0)
+		error = similarity_finalize_hashes(sig, generate_pairs);
+
+	if (error != 0) {
+		git_buf_text_hashsig_free(sig);
+		return error;
+	}
+
+	*out = sig;
+	return error;
+}
+
+/*
+ * Build a signature by streaming the file at `path` through a 4096-byte
+ * buffer.
+ *
+ * The run format is guessed exactly once, from the first chunk read, and is
+ * kept for the rest of the file.  Re-guessing on later chunks would let the
+ * run terminator change in the middle of the file and could leave the
+ * signature with a different format than the one the in-memory variant
+ * derives from the first 4K of the same content.
+ *
+ * The state of the run that is open at the end of each chunk is carried in
+ * `hash` / `hashlen`, and the last unterminated run is added after the
+ * final read.
+ *
+ * Returns 0 and sets `*out` on success.  Returns the negative value from
+ * git_futils_open_ro if the file cannot be opened, the negative read result
+ * (after setting a GITERR_OS message) on a read error, or the error of a
+ * failed hashing step; in those cases the signature is freed.
+ */
+int git_buf_text_hashsig_create_fromfile(
+	git_buf_text_hashsig **out,
+	const char *path,
+	bool generate_pairs)
+{
+	char buf[4096];
+	ssize_t buflen = 0;
+	uint32_t hash = SIMILARITY_HASH_START;
+	size_t hashlen = 0;
+	int error = 0, fd;
+	bool format_guessed = false;
+	git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig));
+	GITERR_CHECK_ALLOC(sig);
+
+	if ((fd = git_futils_open_ro(path)) < 0) {
+		git__free(sig);
+		return fd;
+	}
+
+	while (!error && (buflen = p_read(fd, buf, sizeof(buf))) > 0) {
+		/* only the first chunk decides the format */
+		if (!format_guessed) {
+			similarity_guess_format(sig, buf, (size_t)buflen);
+			format_guessed = true;
+		}
+
+		error = similarity_add_hashes(
+			sig, &hash, &hashlen, buf, (size_t)buflen);
+	}
+
+	if (buflen < 0) {
+		giterr_set(GITERR_OS,
+			"Read error on '%s' while calculating similarity hashes", path);
+		error = (int)buflen;
+	}
+
+	p_close(fd);
+
+	/* flush the run that was still open when the file ended */
+	if (!error && hashlen > 0)
+		error = similarity_advance(sig, hash);
+
+	if (!error)
+		error = similarity_finalize_hashes(sig, generate_pairs);
+
+	if (!error)
+		*out = sig;
+	else
+		git_buf_text_hashsig_free(sig);
+
+	return error;
+}
+
+/*
+ * Release a signature and its hash array.  A NULL signature is ignored.
+ */
+void git_buf_text_hashsig_free(git_buf_text_hashsig *sig)
+{
+	if (sig != NULL) {
+		if (sig->hashes != NULL) {
+			git__free(sig->hashes);
+			sig->hashes = NULL;
+		}
+
+		git__free(sig);
+	}
+}
+
+/*
+ * Count the values two ascending-sorted hash arrays have in common by
+ * walking both in step; every element is matched at most once.
+ */
+static size_t similarity_count_common(
+	const uint32_t *lhs, size_t lhs_len,
+	const uint32_t *rhs, size_t rhs_len)
+{
+	size_t l = 0, r = 0, common = 0;
+
+	while (l < lhs_len && r < rhs_len) {
+		uint32_t lv = lhs[l], rv = rhs[r];
+
+		if (lv < rv)
+			++l;
+		else if (lv > rv)
+			++r;
+		else {
+			++l; ++r;
+			++common;
+		}
+	}
+
+	return common;
+}
+
+/*
+ * Score how similar two signatures are, from 0 up to `scale`.
+ *
+ * Returns 0 if either signature has no hashes or their formats differ.
+ * A `scale` of 0 or less is replaced by 100.  The score is twice the number
+ * of shared hashes divided by the total number of hashes in both
+ * signatures; pair hashes take part only when both signatures have them.
+ */
+int git_buf_text_hashsig_compare(
+	const git_buf_text_hashsig *a,
+	const git_buf_text_hashsig *b,
+	int scale)
+{
+	size_t matches, pairs = 0, total;
+
+	if (!a->size || !b->size || a->format != b->format)
+		return 0;
+
+	if (scale <= 0)
+		scale = 100;
+
+	/* hash lists are sorted - just look for overlap vs total */
+
+	matches = similarity_count_common(
+		a->hashes, a->size, b->hashes, b->size);
+	total = (a->size + b->size);
+
+	if (a->pairs && b->pairs) {
+		/* pair hashes are stored right after the `size` run hashes */
+		pairs = similarity_count_common(
+			a->hashes + a->size, a->size - 1,
+			b->hashes + b->size, b->size - 1);
+
+		total += (a->size + b->size - 2);
+	}
+
+	return (int)(scale * 2 * (matches + pairs) / total);
+}
+
--- a/src/buf_text.h
+++ b/src/buf_text.h
@ -105,4 +105,52 @@ extern int git_buf_text_detect_bom(
 extern bool git_buf_text_gather_stats(
 	git_buf_text_stats *stats, const git_buf *buf, bool skip_bom);

+/**
+ * Similarity signature of line hashes for a buffer
+ *
+ * Only the type name is declared here.  Obtain a signature from
+ * `git_buf_text_hashsig_create` or `git_buf_text_hashsig_create_fromfile`
+ * and release it with `git_buf_text_hashsig_free`.
+ */
+typedef struct git_buf_text_hashsig git_buf_text_hashsig;
+
+/**
+ * Build a similarity signature for a buffer
+ *
+ * This can either generate a simple array of hashed lines/runs in the
+ * file, or it can also keep hashes of pairs of runs in sequence.  Adding
+ * the pairwise runs means the final score will be sensitive to line
+ * ordering changes as well as individual line contents.
+ *
+ * @param out The array of hashed runs representing the file content;
+ *            only written on success, release it with
+ *            `git_buf_text_hashsig_free`
+ * @param buf The contents of the file to hash
+ * @param generate_pairwise_hashes Should pairwise runs be hashed
+ * @return 0 on success, non-zero error code on failure
+ */
+extern int git_buf_text_hashsig_create(
+	git_buf_text_hashsig **out,
+	const git_buf *buf,
+	bool generate_pairwise_hashes);
+
+/**
+ * Build a similarity signature from a file
+ *
+ * This walks through the file, only loading a maximum of 4K of file data at
+ * a time.  Otherwise, it acts just like `git_buf_text_hashsig_create`.
+ *
+ * @param out The signature for the file content; only written on success,
+ *            release it with `git_buf_text_hashsig_free`
+ * @param path Path of the file to open read-only and hash
+ * @param generate_pairwise_hashes Should pairwise runs be hashed
+ * @return 0 on success, non-zero error code if the file cannot be opened
+ *         or read, or if building the signature fails
+ */
+extern int git_buf_text_hashsig_create_fromfile(
+	git_buf_text_hashsig **out,
+	const char *path,
+	bool generate_pairwise_hashes);
+
+/**
+ * Release memory for a content similarity signature
+ *
+ * @param sig The signature to free; passing NULL does nothing
+ */
+extern void git_buf_text_hashsig_free(git_buf_text_hashsig *sig);
+
+/**
+ * Measure similarity between two files
+ *
+ * The score is 0 when either signature holds no hashes or when the two
+ * signatures were classified as different formats.  Pairwise hashes only
+ * contribute when both signatures were built with them.
+ *
+ * @param a First signature
+ * @param b Second signature
+ * @param scale Score reported for a perfect match; a value of 0 or less
+ *              is treated as 100
+ * @return <0 for error, [0 to scale] as similarity score
+ */
+extern int git_buf_text_hashsig_compare(
+	const git_buf_text_hashsig *a,
+	const git_buf_text_hashsig *b,
+	int scale);
+
 #endif
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@ -1,6 +1,7 @@
 #include "clar_libgit2.h"
 #include "buffer.h"
 #include "buf_text.h"
+#include "fileops.h"

 #define TESTSTR "Have you seen that? Have you seeeen that??"
 const char *test_string = TESTSTR;
@ -730,3 +731,91 @@ void test_core_buffer__classify_with_utf8(void)
 	cl_assert(git_buf_text_is_binary(&b));
 	cl_assert(git_buf_text_contains_nul(&b));
 }
+
+/*
+ * Exercise the hash-signature similarity score: identical data, a half
+ * overlap, reversed line order with and without pairwise hashes, and a
+ * signature built from a file on disk.
+ */
+void test_core_buffer__similarity_metric(void)
+{
+	static const char *letters_fwd =
+		"a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n";
+	static const char *letters_rev =
+		"p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n";
+	git_buf content = GIT_BUF_INIT;
+	git_buf_text_hashsig *lhs, *rhs;
+	int score;
+
+	/* identical input on both sides must score a full 100 */
+
+	cl_git_pass(git_buf_sets(&content, "test data\nright here\ninline\ntada"));
+	cl_git_pass(git_buf_text_hashsig_create(&lhs, &content, true));
+	cl_git_pass(git_buf_text_hashsig_create(&rhs, &content, true));
+
+	score = git_buf_text_hashsig_compare(lhs, rhs, 100);
+	cl_assert_equal_i(100, score);
+
+	git_buf_text_hashsig_free(lhs);
+	git_buf_text_hashsig_free(rhs);
+
+	/* the right side is the first half of the left side: all of it
+	 * matches but only half of the left does, so expect roughly 66%
+	 */
+
+	cl_git_pass(git_buf_sets(&content, letters_fwd));
+	cl_git_pass(git_buf_text_hashsig_create(&lhs, &content, true));
+
+	cl_git_pass(git_buf_sets(&content, "a\nb\nc\nd\ne\nf\ng\nh"));
+	cl_git_pass(git_buf_text_hashsig_create(&rhs, &content, true));
+
+	score = git_buf_text_hashsig_compare(lhs, rhs, 100);
+	cl_assert(60 < score && score < 70);
+
+	git_buf_text_hashsig_free(lhs);
+	git_buf_text_hashsig_free(rhs);
+
+	/* reversed line order: every line hash matches but no pairwise hash
+	 * does, so the score should land near 50%
+	 */
+
+	cl_git_pass(git_buf_sets(&content, letters_fwd));
+	cl_git_pass(git_buf_text_hashsig_create(&lhs, &content, true));
+	cl_git_pass(git_buf_sets(&content, letters_rev));
+	cl_git_pass(git_buf_text_hashsig_create(&rhs, &content, true));
+
+	score = git_buf_text_hashsig_compare(lhs, rhs, 100);
+	cl_assert(45 < score && score < 55);
+
+	git_buf_text_hashsig_free(lhs);
+	git_buf_text_hashsig_free(rhs);
+
+	/* without pairwise hashes the line order is irrelevant, so the
+	 * reversed file matches completely
+	 */
+
+	cl_git_pass(git_buf_sets(&content, letters_fwd));
+	cl_git_pass(git_buf_text_hashsig_create(&lhs, &content, false));
+	cl_git_pass(git_buf_sets(&content, letters_rev));
+	cl_git_pass(git_buf_text_hashsig_create(&rhs, &content, false));
+
+	score = git_buf_text_hashsig_compare(lhs, rhs, 100);
+	cl_assert_equal_i(100, score);
+
+	git_buf_text_hashsig_free(lhs);
+	git_buf_text_hashsig_free(rhs);
+
+	/* finally, a signature read from a file must equal the one built
+	 * from a buffer holding the same bytes
+	 */
+
+	cl_git_pass(git_buf_sets(&content, letters_fwd));
+	cl_git_pass(git_buf_text_hashsig_create(&lhs, &content, true));
+
+	cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH));
+	cl_git_mkfile("scratch/testdata", letters_fwd);
+	cl_git_pass(
+		git_buf_text_hashsig_create_fromfile(&rhs, "scratch/testdata", true));
+
+	score = git_buf_text_hashsig_compare(lhs, rhs, 100);
+	cl_assert_equal_i(100, score);
+
+	git_buf_text_hashsig_free(lhs);
+	git_buf_text_hashsig_free(rhs);
+
+	git_buf_free(&content);
+	git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES);
+}