diff --git a/include/git2/oid.h b/include/git2/oid.h
index 5cac46f3b..4538c6147 100644
--- a/include/git2/oid.h
+++ b/include/git2/oid.h
@@ -132,6 +132,61 @@ GIT_EXTERN(void) git_oid_cpy(git_oid *out, const git_oid *src);
  */
 GIT_EXTERN(int) git_oid_cmp(const git_oid *a, const git_oid *b);
 
+/**
+ * OID Shortener object
+ */
+typedef struct git_oid_shorten git_oid_shorten;
+
+/**
+ * Create a new OID shortener.
+ *
+ * The OID shortener is used to process a list of OIDs
+ * in text form and return the shortest length that would
+ * uniquely identify all of them.
+ *
+ * E.g. look at the result of `git log --abbrev`.
+ *
+ * @param min_length The minimal length for all identifiers,
+ *		which will be used even if shorter OIDs would still
+ *		be unique.
+ * @return a `git_oid_shorten` instance, NULL if OOM
+ */
+GIT_EXTERN(git_oid_shorten *) git_oid_shorten_new(size_t min_length);
+
+/**
+ * Add a new OID to set of shortened OIDs and calculate
+ * the minimal length to uniquely identify all the OIDs in
+ * the set.
+ *
+ * The OID is expected to be a 40-char hexadecimal string.
+ * The OID is owned by the user and will not be modified
+ * or freed. The shortener keeps a pointer into the string,
+ * so it must stay valid until the shortener is freed.
+ *
+ * For performance reasons, there is a hard-limit of how many
+ * OIDs can be added to a single set (around 22000, assuming
+ * a mostly randomized distribution), which should be enough
+ * for any kind of program, and keeps the algorithm fast and
+ * memory-efficient.
+ *
+ * Attempting to add more than those OIDs will result in a
+ * GIT_ENOMEM error.
+ *
+ * @param os a `git_oid_shorten` instance
+ * @param text_oid an OID in text form
+ * @return the minimal length to uniquely identify all OIDs
+ *		added so far to the set; or an error code (<0) if an
+ *		error occurs.
+ */
+GIT_EXTERN(int) git_oid_shorten_add(git_oid_shorten *os, const char *text_oid);
+
+/**
+ * Free an OID shortener instance
+ *
+ * @param os a `git_oid_shorten` instance; may be NULL
+ */
+GIT_EXTERN(void) git_oid_shorten_free(git_oid_shorten *os);
+
 /** @} */
 GIT_END_DECL
 #endif
diff --git a/src/oid.c b/src/oid.c
index 698d0f927..81b7d6005 100644
--- a/src/oid.c
+++ b/src/oid.c
@@ -27,6 +27,7 @@
 #include "git2/oid.h"
 #include "repository.h"
 #include <string.h>
+#include <limits.h>
 
 static signed char from_hex[] = {
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 00 */
@@ -166,3 +167,219 @@ int git_oid_cmp(const git_oid *a, const git_oid *b)
 {
 	return memcmp(a->id, b->id, sizeof(a->id));
 }
+
+
+typedef short node_index;
+
+/*
+ * A trie slot: an inner node uses `children`, a leaf uses `tail`.
+ * The sign of the index referring to a slot tells which one applies.
+ */
+typedef union {
+	const char *tail;
+	node_index children[16];
+} trie_node;
+
+struct git_oid_shorten {
+	trie_node *nodes;
+	size_t node_count, size;
+	int min_length, full;
+};
+
+/*
+ * Grow the node array to `new_size` entries, zeroing the new ones.
+ *
+ * On allocation failure the existing array is left in place, so the
+ * shortener can still be released with git_oid_shorten_free().
+ */
+static int resize_trie(git_oid_shorten *self, size_t new_size)
+{
+	trie_node *nodes;
+
+	nodes = realloc(self->nodes, new_size * sizeof(trie_node));
+	if (nodes == NULL)
+		return GIT_ENOMEM;
+
+	self->nodes = nodes;
+
+	if (new_size > self->size) {
+		memset(&self->nodes[self->size], 0x0, (new_size - self->size) * sizeof(trie_node));
+	}
+
+	self->size = new_size;
+	return GIT_SUCCESS;
+}
+
+/*
+ * Append a leaf holding `oid` (the unconsumed tail of an OID string)
+ * and link it from slot `push_at` of the node at `idx`.
+ *
+ * Returns the parent node, looked up after any reallocation, or NULL
+ * if no further node can be stored.
+ */
+static trie_node *push_leaf(git_oid_shorten *os, node_index idx, int push_at, const char *oid)
+{
+	trie_node *node, *leaf;
+	node_index idx_leaf;
+
+	/* node_index is a short: never hand out an index it cannot hold */
+	if (os->node_count >= SHRT_MAX) {
+		os->full = 1;
+		return NULL;
+	}
+
+	if (os->node_count >= os->size) {
+		if (resize_trie(os, os->size * 2) < GIT_SUCCESS)
+			return NULL;
+	}
+
+	idx_leaf = (node_index)os->node_count++;
+
+	if (os->node_count == SHRT_MAX)
+		os->full = 1;
+
+	node = &os->nodes[idx];
+	node->children[push_at] = -idx_leaf;
+
+	leaf = &os->nodes[idx_leaf];
+	leaf->tail = oid;
+
+	return node;
+}
+
+git_oid_shorten *git_oid_shorten_new(size_t min_length)
+{
+	git_oid_shorten *os;
+
+	os = git__malloc(sizeof(git_oid_shorten));
+	if (os == NULL)
+		return NULL;
+
+	memset(os, 0x0, sizeof(git_oid_shorten));
+
+	if (resize_trie(os, 16) < GIT_SUCCESS) {
+		free(os);
+		return NULL;
+	}
+
+	/* slot 0 is the root, which lets a child index of 0 mean "empty" */
+	os->node_count = 1;
+	os->min_length = (int)min_length;
+
+	return os;
+}
+
+void git_oid_shorten_free(git_oid_shorten *os)
+{
+	if (os == NULL)
+		return;
+
+	free(os->nodes);
+	free(os);
+}
+
+
+/*
+ * What wizardry is this?
+ *
+ * This is just a memory-optimized trie: basically a very fancy
+ * 16-ary tree, which is used to store the prefixes of the OID
+ * strings.
+ *
+ * Read more: http://en.wikipedia.org/wiki/Trie
+ *
+ * Magic that happens in this method:
+ *
+ * - Each node in the trie is a union, so it can work both as
+ *   a normal node, or as a leaf.
+ *
+ * - Each normal node points to 16 children (one for each possible
+ *   character in the oid). This is *not* stored in an array of
+ *   pointers, because in a 64-bit arch this would take
+ *   16*sizeof(void*) = 128 bytes of memory per node, which is far
+ *   too much. What we do is store node indexes, and use these indexes
+ *   to look up each node in the os->nodes array. These indexes are
+ *   signed shorts, so this limits the amount of unique OIDs that
+ *   fit in the structure to about 20000 (assuming a more or less uniform
+ *   distribution).
+ *
+ * - All the nodes in the os->nodes array are stored contiguously in
+ *   memory, and each of them is 32 bytes, so we fit 2x nodes per
+ *   cache line. Convenient for speed.
+ *
+ * - To differentiate the leaves from the normal nodes, we store all
+ *   the indexes towards a leaf as a negative index (indexes to normal
+ *   nodes are positive). When we find that one of the children for
+ *   a node has a negative value, that means it's going to be a leaf.
+ *   This halves the number of indexes we can use, but also reduces
+ *   the size of each node by 1-4 bytes (the amount we would need to
+ *   add a `is_leaf` field): this is good because it allows the nodes
+ *   to fit cleanly in cache lines.
+ *
+ * - Once we reach an empty child, instead of continuing to insert
+ *   new nodes for each remaining character of the OID, we store a pointer
+ *   to the tail in the leaf; if the leaf is reached again, we turn it
+ *   into a normal node and use the tail to create a new leaf.
+ *
+ * This is a pretty good balance between performance and memory usage.
+ */
+int git_oid_shorten_add(git_oid_shorten *os, const char *text_oid)
+{
+	int i, is_leaf;
+	node_index idx;
+
+	if (os->full)
+		return GIT_ENOMEM;
+
+	/*
+	 * Validate the whole OID before touching the trie: the part that
+	 * is not consumed now is kept as a leaf tail and is used as a child
+	 * slot when that leaf is split later, so it must be hexadecimal too.
+	 * The cast keeps the table index non-negative where `char` is signed.
+	 */
+	for (i = 0; i < GIT_OID_HEXSZ; ++i) {
+		if (from_hex[(unsigned char)text_oid[i]] == -1)
+			return GIT_ENOTOID;
+	}
+
+	idx = 0;
+	is_leaf = 0;
+
+	for (i = 0; i < GIT_OID_HEXSZ; ++i) {
+		int c = from_hex[(unsigned char)text_oid[i]];
+		trie_node *node;
+
+		node = &os->nodes[idx];
+
+		if (is_leaf) {
+			const char *tail;
+
+			tail = node->tail;
+			node->tail = NULL;
+
+			node = push_leaf(os, idx, from_hex[(unsigned char)tail[0]], &tail[1]);
+			if (node == NULL)
+				return GIT_ENOMEM;
+		}
+
+		if (node->children[c] == 0) {
+			if (push_leaf(os, idx, c, &text_oid[i + 1]) == NULL)
+				return GIT_ENOMEM;
+			break;
+		}
+
+		idx = node->children[c];
+		is_leaf = 0;
+
+		if (idx < 0) {
+			node->children[c] = idx = -idx;
+			is_leaf = 1;
+		}
+	}
+
+	if (++i > os->min_length)
+		os->min_length = i;
+
+	return os->min_length;
+}
+
diff --git a/tests/t01-rawobj.c b/tests/t01-rawobj.c
index cc4641589..3dfa3c9fe 100644
--- a/tests/t01-rawobj.c
+++ b/tests/t01-rawobj.c
@@ -300,6 +300,96 @@ BEGIN_TEST(oid15, "convert raw oid to string (big)")
 	must_be_true(str && str == big && *(str+GIT_OID_HEXSZ+3) == 'Z');
 END_TEST
 
+
+BEGIN_TEST(oid16, "make sure the OID shortener doesn't choke on duplicate sha1s")
+
+	git_oid_shorten *os;
+	int min_len;
+
+	os = git_oid_shorten_new(0);
+	must_be_true(os != NULL);
+
+	git_oid_shorten_add(os, "22596363b3de40b06f981fb85d82312e8c0ed511");
+	git_oid_shorten_add(os, "ce08fe4884650f067bd5703b6a59a8b3b3c99a09");
+	git_oid_shorten_add(os, "16a0123456789abcdef4b775213c23a8bd74f5e0");
+	min_len = git_oid_shorten_add(os, "ce08fe4884650f067bd5703b6a59a8b3b3c99a09");
+
+	must_be_true(min_len == GIT_OID_HEXSZ + 1);
+
+	git_oid_shorten_free(os);
+END_TEST
+
+BEGIN_TEST(oid17, "stress test for the git_oid_shorten object")
+
+#define MAX_OIDS 1000
+
+	git_oid_shorten *os;
+	char *oids[MAX_OIDS];
+	char number_buffer[16];
+	git_oid oid;
+	size_t i, j;
+
+	int min_len = 0, found_collision;
+
+	os = git_oid_shorten_new(0);
+	must_be_true(os != NULL);
+
+	/*
+	 * Insert in the shortener 1000 unique SHA1 ids
+	 */
+	for (i = 0; i < MAX_OIDS; ++i) {
+		char *oid_text;
+
+		sprintf(number_buffer, "%u", (unsigned int)i);
+		git_hash_buf(&oid, number_buffer, strlen(number_buffer));
+
+		oid_text = git__malloc(GIT_OID_HEXSZ + 1);
+		git_oid_fmt(oid_text, &oid);
+		oid_text[GIT_OID_HEXSZ] = 0;
+
+		min_len = git_oid_shorten_add(os, oid_text);
+		must_be_true(min_len >= 0);
+
+		oids[i] = oid_text;
+	}
+
+	/*
+	 * Compare the first `min_len - 1` characters of each
+	 * SHA1 OID. If the minimizer worked, we should find at
+	 * least one collision.
+	 */
+	found_collision = 0;
+	for (i = 0; i < MAX_OIDS; ++i) {
+		for (j = 0; j < MAX_OIDS; ++j) {
+			if (i != j && memcmp(oids[i], oids[j], min_len - 1) == 0)
+				found_collision = 1;
+		}
+	}
+	must_be_true(found_collision == 1);
+
+	/*
+	 * Compare the first `min_len` characters of each
+	 * SHA1 OID. If the minimizer worked, every single prefix
+	 * should be unique.
+	 */
+	found_collision = 0;
+	for (i = 0; i < MAX_OIDS; ++i) {
+		for (j = 0; j < MAX_OIDS; ++j) {
+			if (i != j && memcmp(oids[i], oids[j], min_len) == 0)
+				found_collision = 1;
+		}
+	}
+	must_be_true(found_collision == 0);
+
+	/* cleanup */
+	for (i = 0; i < MAX_OIDS; ++i)
+		free(oids[i]);
+
+	git_oid_shorten_free(os);
+
+#undef MAX_OIDS
+END_TEST
+
 static char *hello_id = "22596363b3de40b06f981fb85d82312e8c0ed511";
 static char *hello_text = "hello world\n";
 
@@ -518,6 +608,8 @@ BEGIN_SUITE(rawobjects)
 	ADD_TEST(oid13);
 	ADD_TEST(oid14);
 	ADD_TEST(oid15);
+	ADD_TEST(oid16);
+	ADD_TEST(oid17);
 
 	ADD_TEST(hash0);
 	ADD_TEST(hash1);