From 8cc16e29e88a02e26b16da66aca6fa589831b76c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Jun 2011 23:22:42 +0300 Subject: [PATCH 1/5] index: speedup git_index_append()/git_index_append2() git_index_find() in index_insert() is useless if replace is not requested (append). Do not call it in this case. It speedup git_index_append() *dramatically* on large indexes. $ cat index_test.c int main(int argc, char **argv) { git_index *index; git_repository *repo; git_odb *odb; struct git_index_entry entry; git_oid tree_oid; char tree_hex[41]; int i; git_repository_init(&repo, "/tmp/myrepo", 0); odb = git_repository_database(repo); git_repository_index(&index, repo); memset(&entry, 0, sizeof(entry)); git_odb_write(&entry.oid, odb, "", 0, GIT_OBJ_BLOB); entry.path = "test.file"; for (i = 0; i < 50000; i++) git_index_append2(index, &entry); git_tree_create_fromindex(&tree_oid, index); git_oid_fmt(tree_hex, &tree_oid); tree_hex[40] = '\0'; printf("tree: %s\n", tree_hex); git_index_free(index); git_repository_free(repo); return 0; } Before: $ time ./index_test tree: 43f73659c43b651588cc81459d9e25b08721b95d ./index_test 151.19s user 0.05s system 99% cpu 2:31.78 total After: $ time ./index_test tree: 43f73659c43b651588cc81459d9e25b08721b95d ./index_test 0.05s user 0.00s system 94% cpu 0.059 total About 2573 times speedup on this test :) Signed-off-by: Kirill A. Shutemov --- src/index.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/index.c b/src/index.c index 06fa95c7b..fd905d007 100644 --- a/src/index.c +++ b/src/index.c @@ -348,6 +348,7 @@ static int index_insert(git_index *index, const git_index_entry *source_entry, i git_index_entry *entry; size_t path_length; int position; + git_index_entry **entry_array; assert(index && source_entry); @@ -375,27 +376,28 @@ static int index_insert(git_index *index, const git_index_entry *source_entry, i else entry->flags |= GIT_IDXENTRY_NAMEMASK;; + /* + * replacing is not requested: just insert entry at the end; + * the index is no longer sorted + */ + if (!replace) + return git_vector_insert(&index->entries, entry); /* look if an entry with this path already exists */ position = git_index_find(index, source_entry->path); - /* if no entry exists and replace is not set, - * add the entry at the end; - * the index is no longer sorted */ - if (!replace || position == GIT_ENOTFOUND) { - if (git_vector_insert(&index->entries, entry) < GIT_SUCCESS) - return GIT_ENOMEM; + /* + * if no entry exists add the entry at the end; + * the index is no longer sorted + */ + if (position == GIT_ENOTFOUND) + return git_vector_insert(&index->entries, entry); - /* if a previous entry exists and replace is set, - * replace it */ - } else { - git_index_entry **entry_array = (git_index_entry **)index->entries.contents; - - free((char *)entry_array[position]->path); - free(entry_array[position]); - - entry_array[position] = entry; - } + /* exists, replace it */ + entry_array = (git_index_entry **) index->entries.contents; + free((char *)entry_array[position]->path); + free(entry_array[position]); + entry_array[position] = entry; return GIT_SUCCESS; } From c20ffa6104cf9797a14cc40dcc963102ef9a98a2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 1 Jul 2011 00:34:23 +0300 Subject: [PATCH 2/5] util: introduce merge sort routine In some cases it's important to preserve order of elements with equal keys (stable sort). qsort(3) doesn't define order of elements with equal keys. git__msort() implements merge sort which is stable sort. Implementation taken from git. Function renamed git_qsort() -> git__msort(). Signed-off-by: Kirill A. Shutemov --- src/util.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/util.h | 3 +++ 2 files changed, 67 insertions(+) diff --git a/src/util.c b/src/util.c index 5b8a1367c..6bdd2d3e6 100644 --- a/src/util.c +++ b/src/util.c @@ -332,3 +332,67 @@ uint32_t git__hash(const void *key, int len, uint32_t seed) return h1; } #endif + +/* + * A merge sort implementation, simplified from the qsort implementation + * by Mike Haertel, which is a part of the GNU C Library. + */ +static void msort_with_tmp(void *b, size_t n, size_t s, + int (*cmp)(const void *, const void *), + char *t) +{ + char *tmp; + char *b1, *b2; + size_t n1, n2; + + if (n <= 1) + return; + + n1 = n / 2; + n2 = n - n1; + b1 = b; + b2 = (char *)b + (n1 * s); + + msort_with_tmp(b1, n1, s, cmp, t); + msort_with_tmp(b2, n2, s, cmp, t); + + tmp = t; + + while (n1 > 0 && n2 > 0) { + if (cmp(b1, b2) <= 0) { + memcpy(tmp, b1, s); + tmp += s; + b1 += s; + --n1; + } else { + memcpy(tmp, b2, s); + tmp += s; + b2 += s; + --n2; + } + } + if (n1 > 0) + memcpy(tmp, b1, n1 * s); + memcpy(b, t, (n - n2) * s); +} + +int git__msort(void *b, size_t n, size_t s, + int (*cmp)(const void *, const void *)) +{ + const size_t size = n * s; + char buf[1024]; + + if (size < sizeof(buf)) { + /* The temporary array fits on the small on-stack buffer. */ + msort_with_tmp(b, n, s, cmp, buf); + } else { + /* It's somewhat large, so malloc it. */ + char *tmp = git__malloc(size); + if (tmp == NULL) + return GIT_ENOMEM; + msort_with_tmp(b, n, s, cmp, tmp); + free(tmp); + } + + return GIT_SUCCESS; +} diff --git a/src/util.h b/src/util.h index 0faf7f69c..c9ca4dec0 100644 --- a/src/util.h +++ b/src/util.h @@ -118,4 +118,7 @@ extern int git__fnmatch(const char *pattern, const char *name, int flags); } \ } while (0) +extern int git__msort(void *b, size_t n, size_t s, + int (*cmp)(const void *, const void *)); + #endif /* INCLUDE_util_h__ */ From 0b0a6b115d617fe79a0be0ddecd89738aefa574c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 1 Jul 2011 00:48:37 +0300 Subject: [PATCH 3/5] vector, index: use git__msort() for vector sorting Index operation use git_vector_sort() to sort index entries. Since index support adding duplicates (two or more entries with the same path), it's important to preserve order of elements. Preserving order of elements allows to make decisions based on order. For example it's possible to implement function witch removes all duplicates except last added. Signed-off-by: Kirill A. Shutemov --- src/vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vector.c b/src/vector.c index 0451eb082..8d09350fc 100644 --- a/src/vector.c +++ b/src/vector.c @@ -94,7 +94,7 @@ void git_vector_sort(git_vector *v) if (v->sorted || v->_cmp == NULL) return; - qsort(v->contents, v->length, sizeof(void *), v->_cmp); + git__msort(v->contents, v->length, sizeof(void *), v->_cmp); v->sorted = 1; } From 476c42acc554e7b3f79c945c2a461d4e25dde41c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 2 Jul 2011 00:41:49 +0300 Subject: [PATCH 4/5] vector: implement git_vector_uniq() The routine remove duplictes from the vector. Only the last added element of elements with equal keys remains in the vector. Signed-off-by: Kirill A. Shutemov --- src/vector.c | 20 ++++++++++++++++++++ src/vector.h | 2 +- tests/t00-core.c | 23 +++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/vector.c b/src/vector.c index 8d09350fc..2fc051f0c 100644 --- a/src/vector.c +++ b/src/vector.c @@ -162,6 +162,26 @@ int git_vector_remove(git_vector *v, unsigned int idx) return GIT_SUCCESS; } +void git_vector_uniq(git_vector *v) +{ + git_vector_cmp cmp; + unsigned int i, j; + + if (v->length <= 1) + return; + + git_vector_sort(v); + cmp = v->_cmp ? v->_cmp : strict_comparison; + + for (i = 0, j = 1 ; j < v->length; ++j) + if (!cmp(v->contents + i, v->contents + j)) + v->contents[i] = v->contents[j]; + else + v->contents[++i] = v->contents[j]; + + v->length -= j - i - 1; +} + void git_vector_clear(git_vector *v) { assert(v); diff --git a/src/vector.h b/src/vector.h index 256452ee5..76778ba4e 100644 --- a/src/vector.h +++ b/src/vector.h @@ -32,5 +32,5 @@ GIT_INLINE(void *) git_vector_get(git_vector *v, unsigned int position) int git_vector_insert(git_vector *v, void *element); int git_vector_remove(git_vector *v, unsigned int idx); - +void git_vector_uniq(git_vector *v); #endif diff --git a/tests/t00-core.c b/tests/t00-core.c index ba5188a43..661f92508 100644 --- a/tests/t00-core.c +++ b/tests/t00-core.c @@ -73,6 +73,28 @@ BEGIN_TEST(vector1, "don't read past array bounds on remove()") git_vector_free(&x); END_TEST +static int test_cmp(const void *a, const void *b) +{ + int n1 = *(int *)a; + int n2 = *(int *)b; + + return n1 - n2; +} + +BEGIN_TEST(vector2, "remove duplicates") + git_vector x; + must_pass(git_vector_init(&x, 5, test_cmp)); + must_pass(git_vector_insert(&x, (void *) 0xdeadbeef)); + must_pass(git_vector_insert(&x, (void *) 0xcafebabe)); + must_pass(git_vector_insert(&x, (void *) 0xcafebabe)); + must_pass(git_vector_insert(&x, (void *) 0xdeadbeef)); + must_pass(git_vector_insert(&x, (void *) 0xcafebabe)); + must_be_true(x.length == 5); + git_vector_uniq(&x); + must_be_true(x.length == 2); + git_vector_free(&x); +END_TEST + BEGIN_TEST(path0, "get the dirname of a path") char dir[64], *dir2; @@ -480,6 +502,7 @@ BEGIN_SUITE(core) ADD_TEST(vector0); ADD_TEST(vector1); + ADD_TEST(vector2); ADD_TEST(path0); ADD_TEST(path1); From 245adf4f3cc1d47352e626ef53372d07761d84cf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 2 Jul 2011 01:08:42 +0300 Subject: [PATCH 5/5] index: introduce git_index_uniq() function It removes all entries with equal path except last added. On large indexes git_index_append() + git_index_uniq() before writing is *much* faster, than git_index_add(). Signed-off-by: Kirill A. Shutemov --- include/git2/index.h | 7 +++++++ src/index.c | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/include/git2/index.h b/include/git2/index.h index 83a18e160..d34ed0a92 100644 --- a/include/git2/index.h +++ b/include/git2/index.h @@ -172,6 +172,13 @@ GIT_EXTERN(int) git_index_write(git_index *index); */ GIT_EXTERN(int) git_index_find(git_index *index, const char *path); +/** + * Remove all entries with equal path except last added + * + * @param index an existing index object + */ +GIT_EXTERN(void) git_index_uniq(git_index *index); + /** * Add or update an index entry from a file in disk * diff --git a/src/index.c b/src/index.c index fd905d007..dd33db92a 100644 --- a/src/index.c +++ b/src/index.c @@ -487,6 +487,11 @@ int git_index_find(git_index *index, const char *path) return git_vector_bsearch2(&index->entries, index_srch, path); } +void git_index_uniq(git_index *index) +{ + git_vector_uniq(&index->entries); +} + const git_index_entry_unmerged *git_index_get_unmerged_bypath(git_index *index, const char *path) { int pos;