mirror of
https://git.proxmox.com/git/libgit2
synced 2025-05-02 21:34:15 +00:00
Add git_oid_shorten
(unique OID minimizer)
Set of methods to find the minimal length needed to uniquely identify every OID in a list. Useful for GUI applications, commit logs and so on. Includes stress test. Signed-off-by: Vicent Marti <tanoku@gmail.com>
This commit is contained in:
parent
b760fbf539
commit
26022f0719
@ -132,6 +132,60 @@ GIT_EXTERN(void) git_oid_cpy(git_oid *out, const git_oid *src);
|
||||
*/
|
||||
GIT_EXTERN(int) git_oid_cmp(const git_oid *a, const git_oid *b);
|
||||
|
||||
/**
|
||||
* OID Shortener object
|
||||
*/
|
||||
typedef struct git_oid_shorten git_oid_shorten;
|
||||
|
||||
/**
|
||||
* Create a new OID shortener.
|
||||
*
|
||||
* The OID shortener is used to process a list of OIDs
|
||||
* in text form and return the shortest length that would
|
||||
* uniquely identify all of them.
|
||||
*
|
||||
* E.g. look at the result of `git log --abbrev`.
|
||||
*
|
||||
* @param min_length The minimal length for all identifiers,
|
||||
* which will be used even if shorter OIDs would still
|
||||
* be unique.
|
||||
* @return a `git_oid_shorten` instance, NULL if OOM
|
||||
*/
|
||||
git_oid_shorten *git_oid_shorten_new(size_t min_length);
|
||||
|
||||
/**
|
||||
* Add a new OID to set of shortened OIDs and calculate
|
||||
* the minimal length to uniquely identify all the OIDs in
|
||||
* the set.
|
||||
*
|
||||
* The OID is expected to be a 40-char hexadecimal string.
|
||||
* The OID is owned by the user and will not be modified
|
||||
* or freed.
|
||||
*
|
||||
* For performance reasons, there is a hard-limit of how many
|
||||
* OIDs can be added to a single set (around ~22000, assuming
|
||||
* a mostly randomized distribution), which should be enough
|
||||
* for any kind of program, and keeps the algorithm fast and
|
||||
* memory-efficient.
|
||||
*
|
||||
* Attempting to add more than those OIDs will result in a
|
||||
* GIT_ENOMEM error
|
||||
*
|
||||
* @param os a `git_oid_shorten` instance
|
||||
* @param text_oid an OID in text form
|
||||
* @return the minimal length to uniquely identify all OIDs
|
||||
* added so far to the set; or an error code (<0) if an
|
||||
* error occurs.
|
||||
*/
|
||||
int git_oid_shorten_add(git_oid_shorten *os, const char *text_oid);
|
||||
|
||||
/**
|
||||
* Free an OID shortener instance
|
||||
*
|
||||
* @param os a `git_oid_shorten` instance
|
||||
*/
|
||||
void git_oid_shorten_free(git_oid_shorten *os);
|
||||
|
||||
/** @} */
|
||||
GIT_END_DECL
|
||||
#endif
|
||||
|
178
src/oid.c
178
src/oid.c
@ -27,6 +27,7 @@
|
||||
#include "git2/oid.h"
|
||||
#include "repository.h"
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
static signed char from_hex[] = {
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 00 */
|
||||
@ -166,3 +167,180 @@ int git_oid_cmp(const git_oid *a, const git_oid *b)
|
||||
{
|
||||
return memcmp(a->id, b->id, sizeof(a->id));
|
||||
}
|
||||
|
||||
|
||||
/*
 * Index of a node inside the `nodes` array of a shortener.
 * A positive value refers to a normal (inner) node, a negative
 * value refers to a leaf stored at the absolute value of the index,
 * and 0 means "no child" (slot 0 is the root, which is never a child).
 */
typedef short node_index;

/*
 * One slot of the trie. The same storage is interpreted either as a
 * leaf (`tail`: the not-yet-expanded remainder of an OID string, which
 * points into the caller's buffer) or as an inner node (`children`:
 * one index per hexadecimal digit). Which view is valid is decided by
 * the sign of the index used to reach the slot.
 */
typedef union {
	const char *tail;
	node_index children[16];
} trie_node;

struct git_oid_shorten {
	/* contiguous array holding every node of the trie; slot 0 is the root */
	trie_node *nodes;
	/* node_count: slots in use; size: slots allocated in `nodes` */
	size_t node_count, size;
	/*
	 * min_length: current answer, i.e. the prefix length needed so far;
	 * full: set once node_count reaches SHRT_MAX, after which no more
	 * OIDs are accepted.
	 */
	int min_length, full;
};
|
||||
|
||||
/*
 * Resize the node array of `self` to hold `new_size` nodes.
 *
 * Newly added slots are zero-filled, so every child index in them
 * reads as "no child". Returns GIT_SUCCESS, or GIT_ENOMEM if the
 * reallocation fails; on failure the existing array is left untouched
 * and still owned by `self`, so it can be released normally.
 */
static int resize_trie(git_oid_shorten *self, size_t new_size)
{
	trie_node *grown;

	/*
	 * Keep the result in a temporary: assigning it straight to
	 * self->nodes would lose (and leak) the old array on failure.
	 */
	grown = realloc(self->nodes, new_size * sizeof(trie_node));
	if (grown == NULL)
		return GIT_ENOMEM;

	self->nodes = grown;

	if (new_size > self->size) {
		memset(&self->nodes[self->size], 0x0, (new_size - self->size) * sizeof(trie_node));
	}

	self->size = new_size;
	return GIT_SUCCESS;
}
|
||||
|
||||
/*
 * Append a new leaf holding `oid` (the unexpanded tail of an OID string)
 * and hook it into slot `push_at` of the node at index `idx`.
 *
 * The node array is doubled first when it has no free slot. The link to
 * the leaf is stored as a negative index so it can be told apart from a
 * link to an inner node. The shortener is flagged as full once the node
 * count reaches SHRT_MAX.
 *
 * Returns the (possibly relocated) parent node, or NULL if the array
 * could not be grown.
 */
static trie_node *push_leaf(git_oid_shorten *os, node_index idx, int push_at, const char *oid)
{
	trie_node *parent;
	node_index leaf_idx;

	if (os->node_count >= os->size &&
		resize_trie(os, os->size * 2) < GIT_SUCCESS)
		return NULL;

	leaf_idx = (node_index)os->node_count;
	os->node_count += 1;

	if (os->node_count == SHRT_MAX)
		os->full = 1;

	/* look the parent up only now: growing the array may have moved it */
	parent = &os->nodes[idx];
	parent->children[push_at] = -leaf_idx;

	os->nodes[leaf_idx].tail = oid;

	return parent;
}
|
||||
|
||||
git_oid_shorten *git_oid_shorten_new(size_t min_length)
|
||||
{
|
||||
git_oid_shorten *os;
|
||||
|
||||
os = git__malloc(sizeof(git_oid_shorten));
|
||||
if (os == NULL)
|
||||
return NULL;
|
||||
|
||||
memset(os, 0x0, sizeof(git_oid_shorten));
|
||||
|
||||
if (resize_trie(os, 16) < GIT_SUCCESS) {
|
||||
free(os);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
os->node_count = 1;
|
||||
os->min_length = min_length;
|
||||
|
||||
return os;
|
||||
}
|
||||
|
||||
/*
 * Release a shortener and its node array. Passing NULL is a no-op, so
 * the result of a failed git_oid_shorten_new() can be handed over
 * unconditionally. The OID strings referenced by the trie belong to the
 * caller and are not freed.
 */
void git_oid_shorten_free(git_oid_shorten *os)
{
	if (os == NULL)
		return;

	free(os->nodes);
	free(os);
}
|
||||
|
||||
|
||||
/*
|
||||
* What wizardry is this?
|
||||
*
|
||||
* This is just a memory-optimized trie: basically a very fancy
|
||||
* 16-ary tree, which is used to store the prefixes of the OID
|
||||
* strings.
|
||||
*
|
||||
* Read more: http://en.wikipedia.org/wiki/Trie
|
||||
*
|
||||
* Magic that happens in this method:
|
||||
*
|
||||
* - Each node in the trie is an union, so it can work both as
|
||||
* a normal node, or as a leaf.
|
||||
*
|
||||
* - Each normal node points to 16 children (one for each possible
|
||||
* character in the oid). This is *not* stored in an array of
|
||||
* pointers, because in a 64-bit arch this would be sucking
|
||||
* 16*sizeof(void*) = 128 bytes of memory per node, which is fucking
|
||||
* insane. What we do is store Node Indexes, and use these indexes
|
||||
* to look up each node in the om->index array. These indexes are
|
||||
* signed shorts, so this limits the amount of unique OIDs that
|
||||
* fit in the structure to about 20000 (assuming a more or less uniform
|
||||
* distribution).
|
||||
*
|
||||
* - All the nodes in om->index array are stored contiguously in
|
||||
* memory, and each of them is 32 bytes, so we fit 2x nodes per
|
||||
* cache line. Convenient for speed.
|
||||
*
|
||||
* - To differentiate the leafs from the normal nodes, we store all
|
||||
* the indexes towards a leaf as a negative index (indexes to normal
|
||||
* nodes are positives). When we find that one of the children for
|
||||
* a node has a negative value, that means it's going to be a leaf.
|
||||
* This reduces the amount of indexes we have by two, but also reduces
|
||||
* the size of each node by 1-4 bytes (the amount we would need to
|
||||
* add a `is_leaf` field): this is good because it allows the nodes
|
||||
* to fit cleanly in cache lines.
|
||||
*
|
||||
* - Once we reach an empty children, instead of continuing to insert
|
||||
* new nodes for each remaining character of the OID, we store a pointer
|
||||
* to the tail in the leaf; if the leaf is reached again, we turn it
|
||||
* into a normal node and use the tail to create a new leaf.
|
||||
*
|
||||
* This is a pretty good balance between performance and memory usage.
|
||||
*/
|
||||
int git_oid_shorten_add(git_oid_shorten *os, const char *text_oid)
|
||||
{
|
||||
int i, is_leaf;
|
||||
node_index idx;
|
||||
|
||||
if (os->full)
|
||||
return GIT_ENOMEM;
|
||||
|
||||
idx = 0;
|
||||
is_leaf = 0;
|
||||
|
||||
for (i = 0; i < GIT_OID_HEXSZ; ++i) {
|
||||
int c = from_hex[(int)text_oid[i]];
|
||||
trie_node *node;
|
||||
|
||||
if (c == -1)
|
||||
return GIT_ENOTOID;
|
||||
|
||||
node = &os->nodes[idx];
|
||||
|
||||
if (is_leaf) {
|
||||
const char *tail;
|
||||
|
||||
tail = node->tail;
|
||||
node->tail = NULL;
|
||||
|
||||
node = push_leaf(os, idx, from_hex[(int)tail[0]], &tail[1]);
|
||||
if (node == NULL)
|
||||
return GIT_ENOMEM;
|
||||
}
|
||||
|
||||
if (node->children[c] == 0) {
|
||||
if (push_leaf(os, idx, c, &text_oid[i + 1]) == NULL)
|
||||
return GIT_ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
idx = node->children[c];
|
||||
is_leaf = 0;
|
||||
|
||||
if (idx < 0) {
|
||||
node->children[c] = idx = -idx;
|
||||
is_leaf = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (++i > os->min_length)
|
||||
os->min_length = i;
|
||||
|
||||
return os->min_length;
|
||||
}
|
||||
|
||||
|
@ -300,6 +300,96 @@ BEGIN_TEST(oid15, "convert raw oid to string (big)")
|
||||
must_be_true(str && str == big && *(str+GIT_OID_HEXSZ+3) == 'Z');
|
||||
END_TEST
|
||||
|
||||
|
||||
BEGIN_TEST(oid16, "make sure the OID shortener doesn't choke on duplicate sha1s")
|
||||
|
||||
git_oid_shorten *os;
|
||||
int min_len;
|
||||
|
||||
os = git_oid_shorten_new(0);
|
||||
must_be_true(os != NULL);
|
||||
|
||||
git_oid_shorten_add(os, "22596363b3de40b06f981fb85d82312e8c0ed511");
|
||||
git_oid_shorten_add(os, "ce08fe4884650f067bd5703b6a59a8b3b3c99a09");
|
||||
git_oid_shorten_add(os, "16a0123456789abcdef4b775213c23a8bd74f5e0");
|
||||
min_len = git_oid_shorten_add(os, "ce08fe4884650f067bd5703b6a59a8b3b3c99a09");
|
||||
|
||||
must_be_true(min_len == GIT_OID_HEXSZ + 1);
|
||||
|
||||
git_oid_shorten_free(os);
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(oid17, "stress test for the git_oid_shorten object")
|
||||
|
||||
#define MAX_OIDS 1000
|
||||
|
||||
git_oid_shorten *os;
|
||||
char *oids[MAX_OIDS];
|
||||
char number_buffer[16];
|
||||
git_oid oid;
|
||||
size_t i, j;
|
||||
|
||||
int min_len = 0, found_collision;
|
||||
|
||||
os = git_oid_shorten_new(0);
|
||||
must_be_true(os != NULL);
|
||||
|
||||
/*
|
||||
* Insert in the shortener 1000 unique SHA1 ids
|
||||
*/
|
||||
for (i = 0; i < MAX_OIDS; ++i) {
|
||||
char *oid_text;
|
||||
|
||||
sprintf(number_buffer, "%u", (unsigned int)i);
|
||||
git_hash_buf(&oid, number_buffer, strlen(number_buffer));
|
||||
|
||||
oid_text = git__malloc(GIT_OID_HEXSZ + 1);
|
||||
git_oid_fmt(oid_text, &oid);
|
||||
oid_text[GIT_OID_HEXSZ] = 0;
|
||||
|
||||
min_len = git_oid_shorten_add(os, oid_text);
|
||||
must_be_true(min_len >= 0);
|
||||
|
||||
oids[i] = oid_text;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare the first `min_char - 1` characters of each
|
||||
* SHA1 OID. If the minimizer worked, we should find at
|
||||
* least one collision
|
||||
*/
|
||||
found_collision = 0;
|
||||
for (i = 0; i < MAX_OIDS; ++i) {
|
||||
for (j = 0; j < MAX_OIDS; ++j) {
|
||||
if (i != j && memcmp(oids[i], oids[j], min_len - 1) == 0)
|
||||
found_collision = 1;
|
||||
}
|
||||
}
|
||||
must_be_true(found_collision == 1);
|
||||
|
||||
/*
|
||||
* Compare the first `min_char` characters of each
|
||||
* SHA1 OID. If the minimizer worked, every single preffix
|
||||
* should be unique.
|
||||
*/
|
||||
found_collision = 0;
|
||||
for (i = 0; i < MAX_OIDS; ++i) {
|
||||
for (j = 0; j < MAX_OIDS; ++j) {
|
||||
if (i != j && memcmp(oids[i], oids[j], min_len) == 0)
|
||||
found_collision = 1;
|
||||
}
|
||||
}
|
||||
must_be_true(found_collision == 0);
|
||||
|
||||
/* cleanup */
|
||||
for (i = 0; i < MAX_OIDS; ++i)
|
||||
free(oids[i]);
|
||||
|
||||
git_oid_shorten_free(os);
|
||||
|
||||
#undef MAX_OIDS
|
||||
END_TEST
|
||||
|
||||
static char *hello_id = "22596363b3de40b06f981fb85d82312e8c0ed511";
|
||||
static char *hello_text = "hello world\n";
|
||||
|
||||
@ -518,6 +608,8 @@ BEGIN_SUITE(rawobjects)
|
||||
ADD_TEST(oid13);
|
||||
ADD_TEST(oid14);
|
||||
ADD_TEST(oid15);
|
||||
ADD_TEST(oid16);
|
||||
ADD_TEST(oid17);
|
||||
|
||||
ADD_TEST(hash0);
|
||||
ADD_TEST(hash1);
|
||||
|
Loading…
Reference in New Issue
Block a user