mirror of
				https://git.proxmox.com/git/libgit2
				synced 2025-11-04 03:16:59 +00:00 
			
		
		
		
	Add git_oid_shorten (unique OID minimizer)
				
					
				
			Set of methods to find the minimal-length to uniquely identify every OID in a list. Useful for GUI applications, commit logs and so on. Includes stress test. Signed-off-by: Vicent Marti <tanoku@gmail.com>
This commit is contained in:
		
							parent
							
								
									b760fbf539
								
							
						
					
					
						commit
						26022f0719
					
				@ -132,6 +132,60 @@ GIT_EXTERN(void) git_oid_cpy(git_oid *out, const git_oid *src);
 | 
			
		||||
 */
 | 
			
		||||
GIT_EXTERN(int) git_oid_cmp(const git_oid *a, const git_oid *b);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * OID Shortener object
 | 
			
		||||
 */
 | 
			
		||||
typedef struct git_oid_shorten git_oid_shorten;
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Create a new OID shortener.
 | 
			
		||||
 *
 | 
			
		||||
 * The OID shortener is used to process a list of OIDs
 | 
			
		||||
 * in text form and return the shortest length that would
 | 
			
		||||
 * uniquely identify all of them.
 | 
			
		||||
 *
 | 
			
		||||
 * E.g. look at the result of `git log --abbrev`.
 | 
			
		||||
 *
 | 
			
		||||
 * @param min_length The minimal length for all identifiers,
 | 
			
		||||
 *		which will be used even if shorter OIDs would still
 | 
			
		||||
 *		be unique.
 | 
			
		||||
 *	@return a `git_oid_shorten` instance, NULL if OOM
 | 
			
		||||
 */
 | 
			
		||||
git_oid_shorten *git_oid_shorten_new(size_t min_length);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Add a new OID to set of shortened OIDs and calculate
 | 
			
		||||
 * the minimal length to uniquely identify all the OIDs in
 | 
			
		||||
 * the set.
 | 
			
		||||
 *
 | 
			
		||||
 * The OID is expected to be a 40-char hexadecimal string.
 | 
			
		||||
 * The OID is owned by the user and will not be modified
 | 
			
		||||
 * or freed.
 | 
			
		||||
 *
 | 
			
		||||
 * For performance reasons, there is a hard-limit of how many
 | 
			
		||||
 * OIDs can be added to a single set (around ~22000, assuming
 | 
			
		||||
 * a mostly randomized distribution), which should be enough
 | 
			
		||||
 * for any kind of program, and keeps the algorithm fast and
 | 
			
		||||
 * memory-efficient.
 | 
			
		||||
 *
 | 
			
		||||
 * Attempting to add more than those OIDs will result in a
 | 
			
		||||
 * GIT_ENOMEM error
 | 
			
		||||
 *
 | 
			
		||||
 * @param os a `git_oid_shorten` instance
 | 
			
		||||
 * @param text_oid an OID in text form
 | 
			
		||||
 * @return the minimal length to uniquely identify all OIDs
 | 
			
		||||
 *		added so far to the set; or an error code (<0) if an
 | 
			
		||||
 *		error occurs.
 | 
			
		||||
 */
 | 
			
		||||
int git_oid_shorten_add(git_oid_shorten *os, const char *text_oid);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Free an OID shortener instance
 | 
			
		||||
 * 
 | 
			
		||||
 * @param os a `git_oid_shorten` instance
 | 
			
		||||
 */
 | 
			
		||||
void git_oid_shorten_free(git_oid_shorten *os);
 | 
			
		||||
 | 
			
		||||
/** @} */
 | 
			
		||||
GIT_END_DECL
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										178
									
								
								src/oid.c
									
									
									
									
									
								
							
							
						
						
									
										178
									
								
								src/oid.c
									
									
									
									
									
								
							@ -27,6 +27,7 @@
 | 
			
		||||
#include "git2/oid.h"
 | 
			
		||||
#include "repository.h"
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
 | 
			
		||||
static signed char from_hex[] = {
 | 
			
		||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 00 */
 | 
			
		||||
@ -166,3 +167,180 @@ int git_oid_cmp(const git_oid *a, const git_oid *b)
 | 
			
		||||
{
 | 
			
		||||
	return memcmp(a->id, b->id, sizeof(a->id));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * Index of a node inside the shortener's `nodes` array.
 * In a `children` slot, 0 means "no child", a positive value refers
 * to a normal node and a negative value refers to a leaf stored at
 * the negated index.
 */
typedef short node_index;
 | 
			
		||||
 | 
			
		||||
/*
 * One slot of the trie. The same storage is used in two ways:
 *  - as a leaf, `tail` points at the not-yet-expanded remainder of
 *    a caller-owned OID string;
 *  - as a normal node, `children` holds one node index per
 *    hexadecimal digit.
 * Which view is valid is decided by the sign of the index that
 * refers to the slot, not by anything stored in the slot itself.
 */
typedef union {
 | 
			
		||||
	const char *tail;
 | 
			
		||||
	node_index children[16];
 | 
			
		||||
} trie_node;
 | 
			
		||||
 | 
			
		||||
/*
 * State of an OID shortener.
 *  - nodes:      array holding every trie slot; slot 0 is the root.
 *  - node_count: number of slots currently in use.
 *  - size:       number of slots allocated in `nodes`.
 *  - min_length: current minimal prefix length, as returned by
 *                git_oid_shorten_add.
 *  - full:       set once the slot count reaches SHRT_MAX; further
 *                additions are refused with GIT_ENOMEM.
 */
struct git_oid_shorten {
 | 
			
		||||
	trie_node *nodes;
 | 
			
		||||
	size_t node_count, size;
 | 
			
		||||
	int min_length, full;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/*
 * Resize the node array of `self` so that it can hold `new_size` slots.
 *
 * Newly added slots are zero-filled, which marks all of their children
 * as empty. On failure the existing array is left untouched and still
 * owned by `self`, so the shortener stays usable and can be freed.
 *
 * Returns GIT_SUCCESS, or GIT_ENOMEM if the requested size overflows
 * or the allocation fails.
 */
static int resize_trie(git_oid_shorten *self, size_t new_size)
{
	trie_node *resized;

	/* Refuse sizes whose byte count cannot be represented in a size_t. */
	if (new_size > (size_t)-1 / sizeof(trie_node))
		return GIT_ENOMEM;

	/*
	 * Keep the result in a temporary: assigning straight to self->nodes
	 * would leak the old array (and leave a NULL pointer behind) when
	 * realloc fails.
	 */
	resized = realloc(self->nodes, new_size * sizeof(trie_node));
	if (resized == NULL)
		return GIT_ENOMEM;

	self->nodes = resized;

	if (new_size > self->size) {
		memset(&self->nodes[self->size], 0x0, (new_size - self->size) * sizeof(trie_node));
	}

	self->size = new_size;
	return GIT_SUCCESS;
}
 | 
			
		||||
 | 
			
		||||
/*
 * Append a new leaf to the trie and hook it into slot `push_at` of the
 * node stored at index `idx`.
 *
 * `oid` is the remainder of the OID string that the leaf stands for;
 * only the pointer is stored, the string is not copied.
 *
 * Returns the (possibly relocated) parent node, or NULL when the leaf
 * cannot be added: `push_at` is not a valid hexadecimal digit value,
 * the trie is full, or growing the node array failed. Callers must
 * use the returned pointer instead of any node pointer obtained before
 * the call, because the node array may have been reallocated.
 */
static trie_node *push_leaf(git_oid_shorten *os, node_index idx, int push_at, const char *oid)
{
	trie_node *node, *leaf;
	node_index idx_leaf;

	/* A slot outside children[0..15] would corrupt neighbouring nodes. */
	if (push_at < 0 || push_at >= 16)
		return NULL;

	/*
	 * A single git_oid_shorten_add call may push several leaves, so the
	 * limit has to be enforced here: past SHRT_MAX the new index would
	 * no longer fit in a node_index.
	 */
	if (os->full || os->node_count >= (size_t)SHRT_MAX) {
		os->full = 1;
		return NULL;
	}

	if (os->node_count >= os->size) {
		if (resize_trie(os, os->size * 2) < GIT_SUCCESS)
			return NULL;
	}

	idx_leaf = (node_index)os->node_count++;

	if (os->node_count == SHRT_MAX)
		os->full = 1;

	/* Look the parent up only now: resize_trie may have moved the array. */
	node = &os->nodes[idx];
	/* Leaves are referenced through a negative index. */
	node->children[push_at] = -idx_leaf;

	leaf = &os->nodes[idx_leaf];
	leaf->tail = oid;

	return node;
}
 | 
			
		||||
 | 
			
		||||
git_oid_shorten *git_oid_shorten_new(size_t min_length)
 | 
			
		||||
{
 | 
			
		||||
	git_oid_shorten *os;
 | 
			
		||||
 | 
			
		||||
	os = git__malloc(sizeof(git_oid_shorten));
 | 
			
		||||
	if (os == NULL)
 | 
			
		||||
		return NULL;
 | 
			
		||||
 | 
			
		||||
	memset(os, 0x0, sizeof(git_oid_shorten));
 | 
			
		||||
 | 
			
		||||
	if (resize_trie(os, 16) < GIT_SUCCESS) {
 | 
			
		||||
		free(os);
 | 
			
		||||
		return NULL;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	os->node_count = 1;
 | 
			
		||||
	os->min_length = min_length;
 | 
			
		||||
 | 
			
		||||
	return os;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * Release a shortener and its node array. The OID strings that were
 * added are owned by the caller and are not touched.
 * Passing NULL is a no-op, mirroring free().
 */
void git_oid_shorten_free(git_oid_shorten *os)
{
	if (os == NULL)
		return;

	free(os->nodes);
	free(os);
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * What wizardry is this?
 | 
			
		||||
 *
 | 
			
		||||
 * This is just a memory-optimized trie: basically a very fancy
 | 
			
		||||
 * 16-ary tree, which is used to store the prefixes of the OID
 | 
			
		||||
 * strings.
 | 
			
		||||
 *
 | 
			
		||||
 * Read more: http://en.wikipedia.org/wiki/Trie
 | 
			
		||||
 *
 | 
			
		||||
 * Magic that happens in this method:
 | 
			
		||||
 *
 | 
			
		||||
 *	- Each node in the trie is a union, so it can work both as
 | 
			
		||||
 *	a normal node, or as a leaf.
 | 
			
		||||
 *
 | 
			
		||||
 *	- Each normal node points to 16 children (one for each possible
 | 
			
		||||
 *	character in the oid). This is *not* stored in an array of
 | 
			
		||||
 *	pointers, because in a 64-bit arch this would be sucking 
 | 
			
		||||
 *	16*sizeof(void*) = 128 bytes of memory per node, which is fucking
 | 
			
		||||
 *	insane. What we do is store Node Indexes, and use these indexes
 | 
			
		||||
 *	to look up each node in the os->nodes array. These indexes are
 | 
			
		||||
 *	signed shorts, so this limits the amount of unique OIDs that
 | 
			
		||||
 *	fit in the structure to about 20000 (assuming a more or less uniform
 | 
			
		||||
 *	distribution).
 | 
			
		||||
 *
 | 
			
		||||
 *	- All the nodes in the os->nodes array are stored contiguously in
 | 
			
		||||
 *	memory, and each of them is 32 bytes, so we fit 2x nodes per
 | 
			
		||||
 *	cache line. Convenient for speed.
 | 
			
		||||
 *
 | 
			
		||||
 *	- To differentiate the leafs from the normal nodes, we store all
 | 
			
		||||
 *	the indexes towards a leaf as a negative index (indexes to normal
 | 
			
		||||
 *	nodes are positives). When we find that one of the children for
 | 
			
		||||
 *	a node has a negative value, that means it's going to be a leaf.
 | 
			
		||||
 *	This reduces the amount of indexes we have by two, but also reduces
 | 
			
		||||
 *	the size of each node by 1-4 bytes (the amount we would need to
 | 
			
		||||
 *	add a `is_leaf` field): this is good because it allows the nodes
 | 
			
		||||
 *	to fit cleanly in cache lines.
 | 
			
		||||
 *
 | 
			
		||||
 *	- Once we reach an empty child, instead of continuing to insert
 | 
			
		||||
 *	new nodes for each remaining character of the OID, we store a pointer
 | 
			
		||||
 *	to the tail in the leaf; if the leaf is reached again, we turn it
 | 
			
		||||
 *	into a normal node and use the tail to create a new leaf.
 | 
			
		||||
 *
 | 
			
		||||
 *	This is a pretty good balance between performance and memory usage.
 | 
			
		||||
 */
 | 
			
		||||
/*
 * Insert `text_oid` into the trie and return the updated minimal
 * prefix length.
 *
 * The whole OID is validated before the trie is touched: the trie
 * stores pointers into the string and later reads characters beyond
 * the ones walked during this call, so an unchecked tail could turn
 * into an out-of-range child slot on a later insertion.
 *
 * Returns the minimal length (>= 0) on success, GIT_ENOMEM if the
 * shortener is full or memory is exhausted, and GIT_ENOTOID if the
 * string contains a character that is not a hexadecimal digit within
 * its first GIT_OID_HEXSZ characters.
 */
int git_oid_shorten_add(git_oid_shorten *os, const char *text_oid)
{
	int i, is_leaf;
	node_index idx;

	if (os->full)
		return GIT_ENOMEM;

	/*
	 * Index the table through unsigned char: a plain `char` may be
	 * signed, and a negative index would read before the table.
	 * NOTE(review): assumes from_hex has an entry for every byte
	 * value (256 entries) -- confirm against the full table.
	 * A string that ends early stops here too, provided the table
	 * maps NUL to -1 as the visible first row does.
	 */
	for (i = 0; i < GIT_OID_HEXSZ; ++i) {
		if (from_hex[(unsigned char)text_oid[i]] < 0)
			return GIT_ENOTOID;
	}

	idx = 0;
	is_leaf = 0;

	for (i = 0; i < GIT_OID_HEXSZ; ++i) {
		int c = from_hex[(unsigned char)text_oid[i]];
		trie_node *node = &os->nodes[idx];

		if (is_leaf) {
			/*
			 * We landed on a leaf: turn it into a normal node by
			 * pushing its stored tail one level down.
			 */
			const char *tail = node->tail;
			int tail_c = from_hex[(unsigned char)tail[0]];

			/* Defensive: the caller may have altered a stored OID. */
			if (tail_c < 0)
				return GIT_ENOTOID;

			node->tail = NULL;

			/* push_leaf may move the node array; use its result. */
			node = push_leaf(os, idx, tail_c, &tail[1]);
			if (node == NULL)
				return GIT_ENOMEM;
		}

		if (node->children[c] == 0) {
			/* Free slot: the rest of the OID becomes a new leaf. */
			if (push_leaf(os, idx, c, &text_oid[i + 1]) == NULL)
				return GIT_ENOMEM;
			break;
		}

		idx = node->children[c];
		is_leaf = 0;

		if (idx < 0) {
			/* Negative index: a leaf, about to become a normal node. */
			node->children[c] = idx = -idx;
			is_leaf = 1;
		}
	}

	/*
	 * `i` is the zero-based position at which the OID became unique.
	 * If the loop ran to completion (exact duplicate) this yields
	 * GIT_OID_HEXSZ + 1.
	 */
	if (++i > os->min_length)
		os->min_length = i;

	return os->min_length;
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -300,6 +300,96 @@ BEGIN_TEST(oid15, "convert raw oid to string (big)")
 | 
			
		||||
	must_be_true(str && str == big && *(str+GIT_OID_HEXSZ+3) == 'Z');
 | 
			
		||||
END_TEST
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
BEGIN_TEST(oid16, "make sure the OID shortener doesn't choke on duplicate sha1s")
 | 
			
		||||
 | 
			
		||||
	git_oid_shorten *os;
 | 
			
		||||
	int min_len;
 | 
			
		||||
 | 
			
		||||
	os = git_oid_shorten_new(0);
 | 
			
		||||
	must_be_true(os != NULL);
 | 
			
		||||
 | 
			
		||||
	git_oid_shorten_add(os, "22596363b3de40b06f981fb85d82312e8c0ed511");
 | 
			
		||||
	git_oid_shorten_add(os, "ce08fe4884650f067bd5703b6a59a8b3b3c99a09");
 | 
			
		||||
	git_oid_shorten_add(os, "16a0123456789abcdef4b775213c23a8bd74f5e0");
 | 
			
		||||
	min_len = git_oid_shorten_add(os, "ce08fe4884650f067bd5703b6a59a8b3b3c99a09");
 | 
			
		||||
 | 
			
		||||
	must_be_true(min_len == GIT_OID_HEXSZ + 1);
 | 
			
		||||
 | 
			
		||||
	git_oid_shorten_free(os);
 | 
			
		||||
END_TEST
 | 
			
		||||
 | 
			
		||||
BEGIN_TEST(oid17, "stress test for the git_oid_shorten object")
 | 
			
		||||
 | 
			
		||||
#define MAX_OIDS 1000
 | 
			
		||||
 | 
			
		||||
	git_oid_shorten *os;
 | 
			
		||||
	char *oids[MAX_OIDS];
 | 
			
		||||
	char number_buffer[16];
 | 
			
		||||
	git_oid oid;
 | 
			
		||||
	size_t i, j;
 | 
			
		||||
 | 
			
		||||
	int min_len = 0, found_collision;
 | 
			
		||||
 | 
			
		||||
	os = git_oid_shorten_new(0);
 | 
			
		||||
	must_be_true(os != NULL);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Insert in the shortener 1000 unique SHA1 ids
 | 
			
		||||
	 */
 | 
			
		||||
	for (i = 0; i < MAX_OIDS; ++i) {
 | 
			
		||||
		char *oid_text;
 | 
			
		||||
 | 
			
		||||
		sprintf(number_buffer, "%u", (unsigned int)i);
 | 
			
		||||
		git_hash_buf(&oid, number_buffer, strlen(number_buffer));
 | 
			
		||||
 | 
			
		||||
		oid_text = git__malloc(GIT_OID_HEXSZ + 1);
 | 
			
		||||
		git_oid_fmt(oid_text, &oid);
 | 
			
		||||
		oid_text[GIT_OID_HEXSZ] = 0;
 | 
			
		||||
 | 
			
		||||
		min_len = git_oid_shorten_add(os, oid_text);
 | 
			
		||||
		must_be_true(min_len >= 0);
 | 
			
		||||
 | 
			
		||||
		oids[i] = oid_text;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Compare the first `min_char - 1` characters of each
 | 
			
		||||
	 * SHA1 OID. If the minimizer worked, we should find at
 | 
			
		||||
	 * least one collision
 | 
			
		||||
	 */
 | 
			
		||||
	found_collision = 0;
 | 
			
		||||
	for (i = 0; i < MAX_OIDS; ++i) {
 | 
			
		||||
		for (j = 0; j < MAX_OIDS; ++j) {
 | 
			
		||||
			if (i != j && memcmp(oids[i], oids[j], min_len - 1) == 0)
 | 
			
		||||
				found_collision = 1;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	must_be_true(found_collision == 1);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Compare the first `min_char` characters of each
 | 
			
		||||
	 * SHA1 OID. If the minimizer worked, every single preffix
 | 
			
		||||
	 * should be unique.
 | 
			
		||||
	 */
 | 
			
		||||
	found_collision = 0;
 | 
			
		||||
	for (i = 0; i < MAX_OIDS; ++i) {
 | 
			
		||||
		for (j = 0; j < MAX_OIDS; ++j) {
 | 
			
		||||
			if (i != j && memcmp(oids[i], oids[j], min_len) == 0)
 | 
			
		||||
				found_collision = 1;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	must_be_true(found_collision == 0);
 | 
			
		||||
 | 
			
		||||
	/* cleanup */
 | 
			
		||||
	for (i = 0; i < MAX_OIDS; ++i)
 | 
			
		||||
		free(oids[i]);
 | 
			
		||||
 | 
			
		||||
	git_oid_shorten_free(os);
 | 
			
		||||
 | 
			
		||||
#undef MAX_OIDS
 | 
			
		||||
END_TEST
 | 
			
		||||
 | 
			
		||||
static char *hello_id = "22596363b3de40b06f981fb85d82312e8c0ed511";
 | 
			
		||||
static char *hello_text = "hello world\n";
 | 
			
		||||
 | 
			
		||||
@ -518,6 +608,8 @@ BEGIN_SUITE(rawobjects)
 | 
			
		||||
	ADD_TEST(oid13);
 | 
			
		||||
	ADD_TEST(oid14);
 | 
			
		||||
	ADD_TEST(oid15);
 | 
			
		||||
	ADD_TEST(oid16);
 | 
			
		||||
	ADD_TEST(oid17);
 | 
			
		||||
 | 
			
		||||
	ADD_TEST(hash0);
 | 
			
		||||
	ADD_TEST(hash1);
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user