From c16c8b9a7e7588f4ced41aa8f9787495f41fd918 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 23 Apr 2012 09:23:58 -0700 Subject: [PATCH] Adding stash to hashtable implementation Adding a small stash of nodes with key conflicts has been demonstrated to greatly increase the efficiency of a cuckoo hashtable. See: http://research.microsoft.com/pubs/73856/stash-full.9-30.pdf for more details. --- src/hashtable.c | 143 ++++++++++++++++++++++++++++++++++-------------- src/hashtable.h | 13 ++++- 2 files changed, 112 insertions(+), 44 deletions(-) diff --git a/src/hashtable.c b/src/hashtable.c index 8e057d4b1..e2f131cf1 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -18,28 +18,37 @@ static git_hashtable_node *node_with_hash(git_hashtable *self, const void *key, static void node_swap_with(git_hashtable_node *self, git_hashtable_node *other); static int node_insert(git_hashtable *self, git_hashtable_node *new_node); static int insert_nodes(git_hashtable *self, git_hashtable_node *old_nodes, size_t old_size); +static void reinsert_stash(git_hashtable *self); static int resize_to(git_hashtable *self, size_t new_size) { git_hashtable_node *old_nodes = self->nodes; size_t old_size = self->size; + git_hashtable_node old_stash[GIT_HASHTABLE_STASH_SIZE]; + size_t old_stash_count = self->stash_count; self->is_resizing = 1; + if (old_stash_count > 0) + memcpy(old_stash, self->stash, + old_stash_count * sizeof(git_hashtable_node)); + do { self->size = new_size; self->size_mask = new_size - 1; self->key_count = 0; + self->stash_count = 0; self->nodes = git__calloc(1, sizeof(git_hashtable_node) * self->size); GITERR_CHECK_ALLOC(self->nodes); - if (insert_nodes(self, old_nodes, old_size) == 0) + if (insert_nodes(self, old_nodes, old_size) == 0 && + insert_nodes(self, old_stash, old_stash_count) == 0) self->is_resizing = 0; else { new_size *= 2; git__free(self->nodes); } - } while(self->is_resizing); + } while (self->is_resizing); git__free(old_nodes); return 0; @@ -47,26 +56,28 @@ static int resize_to(git_hashtable *self, size_t new_size) static int set_size(git_hashtable *self, size_t new_size) { - self->nodes = git__realloc(self->nodes, new_size * sizeof(git_hashtable_node)); + self->nodes = + git__realloc(self->nodes, new_size * sizeof(git_hashtable_node)); GITERR_CHECK_ALLOC(self->nodes); - if (new_size > self->size) { + if (new_size > self->size) memset(&self->nodes[self->size], 0x0, (new_size - self->size) * sizeof(git_hashtable_node)); - } self->size = new_size; self->size_mask = new_size - 1; return 0; } -static git_hashtable_node *node_with_hash(git_hashtable *self, const void *key, int hash_id) +GIT_INLINE(git_hashtable_node *)node_with_hash( + git_hashtable *self, const void *key, int hash_id) { size_t pos = self->hash(key, hash_id) & self->size_mask; return git_hashtable_node_at(self->nodes, pos); } -static void node_swap_with(git_hashtable_node *self, git_hashtable_node *other) +GIT_INLINE(void) node_swap_with( + git_hashtable_node *self, git_hashtable_node *other) { git_hashtable_node tmp = *self; *self = *other; @@ -76,19 +87,26 @@ static void node_swap_with(git_hashtable_node *self, git_hashtable_node *other) static int node_insert(git_hashtable *self, git_hashtable_node *new_node) { int iteration, hash_id; + git_hashtable_node *node; for (iteration = 0; iteration < MAX_LOOPS; iteration++) { for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) { - git_hashtable_node *node; node = node_with_hash(self, new_node->key, hash_id); node_swap_with(new_node, node); - if (new_node->key == 0x0){ + if (new_node->key == 0x0) { self->key_count++; return 0; } } } + /* Insert into stash if there is space */ + if (self->stash_count < GIT_HASHTABLE_STASH_SIZE) { + node_swap_with(new_node, &self->stash[self->stash_count++]); + self->key_count++; + return 0; + } + /* Failed to insert node. Hashtable is currently resizing */ assert(!self->is_resizing); @@ -105,14 +123,29 @@ static int insert_nodes( for (i = 0; i < old_size; ++i) { git_hashtable_node *node = git_hashtable_node_at(old_nodes, i); - if (node->key && - git_hashtable_insert(self, node->key, node->value) < 0) + if (node->key && node_insert(self, node) < 0) return -1; } return 0; } +static void reinsert_stash(git_hashtable *self) +{ + int stash_count; + struct git_hashtable_node stash[GIT_HASHTABLE_STASH_SIZE]; + + if (self->stash_count <= 0) + return; + + memcpy(stash, self->stash, self->stash_count * sizeof(git_hashtable_node)); + stash_count = self->stash_count; + self->stash_count = 0; + + /* the node_insert() calls *cannot* fail because the stash is empty */ + insert_nodes(self, stash, stash_count); +} + git_hashtable *git_hashtable_alloc( size_t min_size, git_hash_ptr hash, @@ -127,21 +160,11 @@ git_hashtable *git_hashtable_alloc( memset(table, 0x0, sizeof(git_hashtable)); - if (min_size < 8) - min_size = 8; - - /* round up size to closest power of 2 */ - min_size--; - min_size |= min_size >> 1; - min_size |= min_size >> 2; - min_size |= min_size >> 4; - min_size |= min_size >> 8; - min_size |= min_size >> 16; - table->hash = hash; table->key_equal = key_eq; - set_size(table, min_size + 1); + min_size = git__size_t_powerof2(min_size < 8 ? 8 : min_size); + set_size(table, min_size); return table; } @@ -151,6 +174,8 @@ void git_hashtable_clear(git_hashtable *self) assert(self); memset(self->nodes, 0x0, sizeof(git_hashtable_node) * self->size); + + self->stash_count = 0; self->key_count = 0; } @@ -200,39 +225,70 @@ int git_hashtable_insert2( } } -void *git_hashtable_lookup(git_hashtable *self, const void *key) +static git_hashtable_node *find_node(git_hashtable *self, const void *key) { - int hash_id; + int hash_id, count = 0; git_hashtable_node *node; - assert(self && self->nodes); - for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) { node = node_with_hash(self, key, hash_id); - if (node->key && self->key_equal(key, node->key) == 0) - return node->value; + if (node->key) { + ++count; + if (self->key_equal(key, node->key) == 0) + return node; + } + } + + /* check stash if not found but all slots were filled */ + if (count == GIT_HASHTABLE_HASHES) { + for (count = 0; count < self->stash_count; ++count) + if (self->key_equal(key, self->stash[count].key) == 0) + return &self->stash[count]; } return NULL; } +static void reset_stash(git_hashtable *self, git_hashtable_node *node) +{ + /* if node was in stash, then compact stash */ + ssize_t offset = node - self->stash; + + if (offset >= 0 && offset < self->stash_count) { + if (offset < self->stash_count - 1) + memmove(node, node + 1, (self->stash_count - offset) * + sizeof(git_hashtable_node)); + self->stash_count--; + } + + reinsert_stash(self); +} + +void *git_hashtable_lookup(git_hashtable *self, const void *key) +{ + git_hashtable_node *node; + assert(self && key); + node = find_node(self, key); + return node ? node->value : NULL; +} + int git_hashtable_remove2( git_hashtable *self, const void *key, void **old_value) { - int hash_id; git_hashtable_node *node; assert(self && self->nodes); - for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) { - node = node_with_hash(self, key, hash_id); - if (node->key && self->key_equal(key, node->key) == 0) { - *old_value = node->value; - node->key = NULL; - node->value = NULL; - self->key_count--; - return 0; - } + node = find_node(self, key); + if (node) { + *old_value = node->value; + + node->key = NULL; + node->value = NULL; + self->key_count--; + + reset_stash(self, node); + return 0; } return GIT_ENOTFOUND; @@ -240,10 +296,15 @@ int git_hashtable_remove2( int git_hashtable_merge(git_hashtable *self, git_hashtable *other) { - if (resize_to(self, (self->size + other->size) * 2) < 0) + size_t new_size = git__size_t_powerof2(self->size + other->size); + + if (resize_to(self, new_size) < 0) return -1; - return insert_nodes(self, other->nodes, other->key_count); + if (insert_nodes(self, other->nodes, other->key_count) < 0) + return -1; + + return insert_nodes(self, other->stash, other->stash_count); } diff --git a/src/hashtable.h b/src/hashtable.h index 0bab84543..448487507 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -22,6 +22,8 @@ struct git_hashtable_node { void *value; }; +#define GIT_HASHTABLE_STASH_SIZE 3 + struct git_hashtable { struct git_hashtable_node *nodes; @@ -29,6 +31,9 @@ struct git_hashtable { size_t size; size_t key_count; + struct git_hashtable_node stash[GIT_HASHTABLE_STASH_SIZE]; + int stash_count; + int is_resizing; git_hash_ptr hash; @@ -38,9 +43,11 @@ struct git_hashtable { typedef struct git_hashtable_node git_hashtable_node; typedef struct git_hashtable git_hashtable; -git_hashtable *git_hashtable_alloc(size_t min_size, - git_hash_ptr hash, - git_hash_keyeq_ptr key_eq); +git_hashtable *git_hashtable_alloc( + size_t min_size, + git_hash_ptr hash, + git_hash_keyeq_ptr key_eq); + void *git_hashtable_lookup(git_hashtable *h, const void *key); int git_hashtable_remove2(git_hashtable *table, const void *key, void **old_value);