Some similarity metric adjustments

This makes the text similarity metric treat \r as equivalent to \n and makes it skip whitespace immediately following a line terminator, so line indentation will have less effect on the difference measurement (and so \r\n will be treated as just a single line terminator). This also separates the text and binary hash calculators into two separate functions instead of having more if statements inside the loop. This should make it easier to have more differentiated heuristics in the future if we so wish.
2025-12-04 00:23:47 +00:00 · 2013-01-13 10:06:09 -08:00 · 2013-01-13 10:06:09 -08:00 · f3327cac1d
commit f3327cac1d
parent 9c454b007b
1 changed files with 81 additions and 23 deletions
--- a/src/buf_text.c
+++ b/src/buf_text.c
@ -232,7 +232,7 @@ struct git_buf_text_hashsig {
 	unsigned int pairs : 1;
 };
-static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
+static int similarity_record_hash(git_buf_text_hashsig *sig, uint32_t hash)
 {
 	if (sig->size >= sig->asize) {
 		size_t new_asize = sig->asize + 512;
@ -248,6 +248,80 @@ static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
 	return 0;
 }
 /*
  * Fold `len` bytes starting at `ptr` into line hashes for text content.
  *
  * A hash is recorded through similarity_record_hash() whenever a '\r' or
  * '\n' is read, or when the current run has already reached
  * SIMILARITY_MAXRUN bytes.  After each recorded hash, any whitespace (per
  * git__isspace) that immediately follows is skipped.
  *
  * `*hash_start` / `*hashlen_start` supply the in-progress hash and run
  * length on entry and receive the updated values on exit, so a caller can
  * feed data in several pieces.
  *
  * Returns 0 when no recording failed (including when nothing was recorded
  * at all), otherwise the negative value from similarity_record_hash().
  */
 static int similarity_add_hashes_text(
 	git_buf_text_hashsig *sig,
 	uint32_t *hash_start,
 	size_t *hashlen_start,
 	const char *ptr,
 	size_t len)
 {
 	/* must start at 0: the loop may finish without ever assigning it */
 	int error = 0;
 	const char *scan = ptr, *scan_end = ptr + len;
 	uint32_t hash = *hash_start;
 	size_t hashlen = *hashlen_start;
 	while (scan < scan_end) {
 		char ch = *scan++;
 		/* the byte that triggers a break is consumed, not hashed */
 		if (ch == '\r' || ch == '\n' || hashlen >= SIMILARITY_MAXRUN) {
 			if ((error = similarity_record_hash(sig, hash)) < 0)
 				break;
 			hash = SIMILARITY_HASH_START;
 			hashlen = 0;
 			/* skip all whitespace immediately after line ending */
 			while (scan < scan_end && git__isspace(*scan))
 				scan++;
 		} else {
 			hash = SIMILARITY_HASH_UPDATE(hash, ch);
 			hashlen++;
 		}
 	}
 	/* hand the in-progress state back to the caller, even on error */
 	*hash_start = hash;
 	*hashlen_start = hashlen;
 	return error;
 }
 /*
  * Fold `len` bytes starting at `ptr` into chunk hashes for binary content.
  *
  * A hash is recorded through similarity_record_hash() whenever a NUL byte
  * is read, or when the current run has already reached SIMILARITY_MAXRUN
  * bytes.  After each recorded hash, any NUL bytes that immediately follow
  * are skipped.
  *
  * `*hash_start` / `*hashlen_start` supply the in-progress hash and run
  * length on entry and receive the updated values on exit, so a caller can
  * feed data in several pieces.
  *
  * Returns 0 when no recording failed (including when nothing was recorded
  * at all), otherwise the negative value from similarity_record_hash().
  */
 static int similarity_add_hashes_binary(
 	git_buf_text_hashsig *sig,
 	uint32_t *hash_start,
 	size_t *hashlen_start,
 	const char *ptr,
 	size_t len)
 {
 	/* must start at 0: the loop may finish without ever assigning it */
 	int error = 0;
 	const char *scan = ptr, *scan_end = ptr + len;
 	uint32_t hash = *hash_start;
 	size_t hashlen = *hashlen_start;
 	while (scan < scan_end) {
 		char ch = *scan++;
 		/* the byte that triggers a break is consumed, not hashed */
 		if (!ch || hashlen >= SIMILARITY_MAXRUN) {
 			if ((error = similarity_record_hash(sig, hash)) < 0)
 				break;
 			hash = SIMILARITY_HASH_START;
 			hashlen = 0;
 			/* skip run of terminators */
 			while (scan < scan_end && !*scan)
 				scan++;
 		} else {
 			hash = SIMILARITY_HASH_UPDATE(hash, ch);
 			hashlen++;
 		}
 	}
 	/* hand the in-progress state back to the caller, even on error */
 	*hash_start = hash;
 	*hashlen_start = hashlen;
 	return error;
 }
 static int similarity_add_hashes(
 	git_buf_text_hashsig *sig,
 	uint32_t *hash_start,
@ -256,29 +330,13 @@ static int similarity_add_hashes(
 	size_t len)
 {
 	int error = 0;
 	const char *scan = ptr, *scan_end = ptr + len;
 	char term = (sig->format == SIMILARITY_FORMAT_TEXT) ? '\n' : '\0';
 	uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
 	size_t hashlen = hashlen_start ? *hashlen_start : 0;
-	while (scan < scan_end) {
+	if (sig->format == SIMILARITY_FORMAT_TEXT)
-		char ch = *scan++;
+		error = similarity_add_hashes_text(sig, &hash, &hashlen, ptr, len);
-
+	else
-		if (ch == term || hashlen >= SIMILARITY_MAXRUN) {
+		error = similarity_add_hashes_binary(sig, &hash, &hashlen, ptr, len);
 			if ((error = similarity_advance(sig, hash)) < 0)
 				break;
 			hash = SIMILARITY_HASH_START;
 			hashlen = 0;
 			/* skip run of terminators */
 			while (scan < scan_end && *scan == term)
 				scan++;
 		} else {
 			hash = SIMILARITY_HASH_UPDATE(hash, ch);
 			hashlen++;
 		}
 	}
 	if (hash_start)
 		*hash_start = hash;
@ -287,7 +345,7 @@ static int similarity_add_hashes(
 	/* if we're not saving intermediate state, add final hash as needed */
 	if (!error && !hash_start && hashlen > 0)
-		error = similarity_advance(sig, hash);
+		error = similarity_record_hash(sig, hash);
 	return error;
 }
@ -436,7 +494,7 @@ int git_buf_text_hashsig_create_fromfile(
 	p_close(fd);
 	if (!error && hashlen > 0)
-		error = similarity_advance(sig, hash);
+		error = similarity_record_hash(sig, hash);
 	if (!error)
 		error = similarity_finalize_hashes(sig, generate_pairs);