From 86d5810b82965224b1270c3627da029588935abb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Tue, 6 May 2014 16:20:14 +0200 Subject: [PATCH 01/10] pack: remove misleading comment --- src/pack.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/pack.c b/src/pack.c index de038a45c..9c6a4e78b 100644 --- a/src/pack.c +++ b/src/pack.c @@ -546,13 +546,6 @@ static int packfile_unpack_delta( if (!cached) { /* have to inflate it */ error = git_packfile_unpack(&base, p, &base_offset); - - /* - * TODO: git.git tries to load the base from other packfiles - * or loose objects. - * - * We'll need to do this in order to support thin packs. - */ if (error < 0) return error; } From ae0817393c0aaff5c4b085c46ed11acc0ab64198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Tue, 6 May 2014 21:21:04 +0200 Subject: [PATCH 02/10] pack: do not repeat the same error message four times Repeating this error message makes it harder to find out where we actually are finding the error, and they don't really describe what we're trying to do. --- src/pack.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pack.c b/src/pack.c index 9c6a4e78b..b9374d04f 100644 --- a/src/pack.c +++ b/src/pack.c @@ -653,7 +653,7 @@ int git_packfile_stream_open(git_packfile_stream *obj, struct git_pack_file *p, st = inflateInit(&obj->zstream); if (st != Z_OK) { git__free(obj); - giterr_set(GITERR_ZLIB, "Failed to inflate packfile"); + giterr_set(GITERR_ZLIB, "failed to init packfile stream"); return -1; } @@ -684,7 +684,7 @@ ssize_t git_packfile_stream_read(git_packfile_stream *obj, void *buffer, size_t written = len - obj->zstream.avail_out; if (st != Z_OK && st != Z_STREAM_END) { - giterr_set(GITERR_ZLIB, "Failed to inflate packfile"); + giterr_set(GITERR_ZLIB, "error reading from the zlib stream"); return -1; } @@ -729,7 +729,7 @@ int packfile_unpack_compressed( st = inflateInit(&stream); if (st != Z_OK) { git__free(buffer); - giterr_set(GITERR_ZLIB, "Failed to inflate packfile"); + giterr_set(GITERR_ZLIB, "failed to init zlib stream on unpack"); return -1; } @@ -756,7 +756,7 @@ int packfile_unpack_compressed( if ((st != Z_STREAM_END) || stream.total_out != size) { git__free(buffer); - giterr_set(GITERR_ZLIB, "Failed to inflate packfile"); + giterr_set(GITERR_ZLIB, "error inflating zlib stream"); return -1; } From 2acdf4b854bf55ba2630c7342d09b136d919d6ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Tue, 6 May 2014 19:20:33 +0200 Subject: [PATCH 03/10] pack: unpack using a loop We currently make use of recursive function calls to unpack an object, resolving the deltas as we come back down the chain. This means that we have unbounded stack growth as we look up objects in a pack. This is now done in two steps: first we figure out what the dependency chain is by looking up the delta bases until we reach a non-delta object, pushing the information we need onto a stack and then we pop from that stack and apply the deltas until there are no more left. This version of the code does not make use of the delta base cache so it is slower than what's in the mainline. A later commit will reintroduce it. --- src/pack.c | 140 ++++++++++++++++++++++++++++++++++++++++++++--------- src/pack.h | 9 ++++ 2 files changed, 126 insertions(+), 23 deletions(-) diff --git a/src/pack.c b/src/pack.c index b9374d04f..523905f8f 100644 --- a/src/pack.c +++ b/src/pack.c @@ -40,6 +40,13 @@ static int pack_entry_find_offset( const git_oid *short_oid, size_t len); +/** + * Generate the chain of dependencies which we need to get to the + * object at `off`. As we use a stack, the latest is the base object, + * the rest are deltas. + */ +static int pack_dependency_chain(git_dependency_chain *chain, struct git_pack_file *p, git_off_t off); + static int packfile_error(const char *message) { giterr_set(GITERR_ODB, "Invalid pack file - %s", message); @@ -583,47 +590,63 @@ int git_packfile_unpack( git_mwindow *w_curs = NULL; git_off_t curpos = *obj_offset; int error; + git_dependency_chain chain; + struct pack_chain_elem *elem; - size_t size = 0; - git_otype type; + git_otype base_type; /* * TODO: optionally check the CRC on the packfile */ + error = pack_dependency_chain(&chain, p, *obj_offset); + if (error < 0) + return error; + obj->data = NULL; obj->len = 0; obj->type = GIT_OBJ_BAD; - error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos); + /* the first one is the base, so we expand that one */ + elem = git_array_pop(chain); + curpos = elem->offset; + error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); git_mwindow_close(&w_curs); if (error < 0) - return error; + goto cleanup; - switch (type) { - case GIT_OBJ_OFS_DELTA: - case GIT_OBJ_REF_DELTA: - error = packfile_unpack_delta( - obj, p, &w_curs, &curpos, - size, type, *obj_offset); - break; + base_type = elem->type; + /* we now apply each consecutive delta until we run out */ + while (git_array_size(chain) > 0) { + git_rawobj base, delta; - case GIT_OBJ_COMMIT: - case GIT_OBJ_TREE: - case GIT_OBJ_BLOB: - case GIT_OBJ_TAG: - error = packfile_unpack_compressed( - obj, p, &w_curs, &curpos, - size, type); - break; + elem = git_array_pop(chain); + curpos = elem->offset; + error = packfile_unpack_compressed(&delta, p, &w_curs, &curpos, elem->size, elem->type); + git_mwindow_close(&w_curs); - default: - error = packfile_error("invalid packfile type in header");; - break; + if (error < 0) + break; + + /* the current object becomes the new base, on which we apply the delta */ + base = *obj; + obj->data = NULL; + obj->len = 0; + obj->type = GIT_OBJ_BAD; + + error = git__delta_apply(obj, base.data, base.len, delta.data, delta.len); + git__free(delta.data); + git__free(base.data); + + if (error < 0) + break; + + obj->type = base_type; } - *obj_offset = curpos; +cleanup: + git_array_clear(chain); return error; } @@ -1215,3 +1238,74 @@ int git_pack_entry_find( git_oid_cpy(&e->sha1, &found_oid); return 0; } + +static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pack_file *p, git_off_t obj_offset) +{ + git_dependency_chain chain = GIT_ARRAY_INIT; + git_mwindow *w_curs = NULL; + git_off_t curpos = obj_offset, base_offset; + int error = 0, found_base = 0; + size_t size; + git_otype type; + + while (!found_base && error == 0) { + struct pack_chain_elem *elem; + + curpos = obj_offset; + elem = git_array_alloc(chain); + if (!elem) + return -1; + + error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos); + git_mwindow_close(&w_curs); + + if (error < 0) + return error; + + elem->offset = curpos; + elem->size = size; + elem->type = type; + + switch (type) { + case GIT_OBJ_OFS_DELTA: + case GIT_OBJ_REF_DELTA: + base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); + git_mwindow_close(&w_curs); + + if (base_offset == 0) + return packfile_error("delta offset is zero"); + if (base_offset < 0) /* must actually be an error code */ + return (int)base_offset; + + /* we need to pass the pos *after* the delta-base bit */ + elem->offset = curpos; + + /* go through the loop again, but with the new object */ + obj_offset = base_offset; + break; + + /* one of these means we've found the final object in the chain */ + case GIT_OBJ_COMMIT: + case GIT_OBJ_TREE: + case GIT_OBJ_BLOB: + case GIT_OBJ_TAG: + found_base = 1; + break; + + default: + error = packfile_error("invalid packfile type in header"); + break; + } + } + + if (!found_base) { + git_array_clear(chain); + return packfile_error("after dependency chain loop; cannot happen"); + } + + if (error < 0) + git_array_clear(chain); + + *chain_out = chain; + return error; +} diff --git a/src/pack.h b/src/pack.h index 58f81e2f0..a2ea3849f 100644 --- a/src/pack.h +++ b/src/pack.h @@ -17,6 +17,7 @@ #include "mwindow.h" #include "odb.h" #include "oidmap.h" +#include "array.h" #define GIT_PACK_FILE_MODE 0444 @@ -60,6 +61,14 @@ typedef struct git_pack_cache_entry { git_rawobj raw; } git_pack_cache_entry; +struct pack_chain_elem { + git_off_t offset; + size_t size; + git_otype type; +}; + +typedef git_array_t(struct pack_chain_elem) git_dependency_chain; + #include "offmap.h" GIT__USE_OFFMAP; From a332e91c92524cc21818eadfbe723361d31dc187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Tue, 6 May 2014 23:37:28 +0200 Subject: [PATCH 04/10] pack: use a cache for delta bases when unpacking Bring back the use of the delta base cache for unpacking objects. When generating the delta chain, we stop when we find a delta base in the pack's cache and use that as the starting point. --- src/pack.c | 145 ++++++++++++++++++++++++++--------------------------- src/pack.h | 5 ++ 2 files changed, 77 insertions(+), 73 deletions(-) diff --git a/src/pack.c b/src/pack.c index 523905f8f..c1d7592fd 100644 --- a/src/pack.c +++ b/src/pack.c @@ -42,8 +42,9 @@ static int pack_entry_find_offset( /** * Generate the chain of dependencies which we need to get to the - * object at `off`. As we use a stack, the latest is the base object, - * the rest are deltas. + * object at `off`. `chain` is used a stack, popping gives the right + * order to apply deltas on. If an object is found in the pack's base + * cache, we stop calculating there. */ static int pack_dependency_chain(git_dependency_chain *chain, struct git_pack_file *p, git_off_t off); @@ -521,67 +522,6 @@ int git_packfile_resolve_header( return error; } -static int packfile_unpack_delta( - git_rawobj *obj, - struct git_pack_file *p, - git_mwindow **w_curs, - git_off_t *curpos, - size_t delta_size, - git_otype delta_type, - git_off_t obj_offset) -{ - git_off_t base_offset, base_key; - git_rawobj base, delta; - git_pack_cache_entry *cached = NULL; - int error, found_base = 0; - - base_offset = get_delta_base(p, w_curs, curpos, delta_type, obj_offset); - git_mwindow_close(w_curs); - if (base_offset == 0) - return packfile_error("delta offset is zero"); - if (base_offset < 0) /* must actually be an error code */ - return (int)base_offset; - - if (!p->bases.entries && (cache_init(&p->bases) < 0)) - return -1; - - base_key = base_offset; /* git_packfile_unpack modifies base_offset */ - if ((cached = cache_get(&p->bases, base_offset)) != NULL) { - memcpy(&base, &cached->raw, sizeof(git_rawobj)); - found_base = 1; - } - - if (!cached) { /* have to inflate it */ - error = git_packfile_unpack(&base, p, &base_offset); - if (error < 0) - return error; - } - - error = packfile_unpack_compressed(&delta, p, w_curs, curpos, delta_size, delta_type); - git_mwindow_close(w_curs); - - if (error < 0) { - if (!found_base) - git__free(base.data); - return error; - } - - obj->type = base.type; - error = git__delta_apply(obj, base.data, base.len, delta.data, delta.len); - if (error < 0) - goto on_error; - - if (found_base) - git_atomic_dec(&cached->refcount); - else if (cache_add(&p->bases, &base, base_key) < 0) - git__free(base.data); - -on_error: - git__free(delta.data); - - return error; /* error set by git__delta_apply */ -} - int git_packfile_unpack( git_rawobj *obj, struct git_pack_file *p, @@ -589,10 +529,10 @@ int git_packfile_unpack( { git_mwindow *w_curs = NULL; git_off_t curpos = *obj_offset; - int error; - git_dependency_chain chain; + int error, free_base = 0; + git_dependency_chain chain = GIT_ARRAY_INIT; struct pack_chain_elem *elem; - + git_pack_cache_entry *cached = NULL; git_otype base_type; /* @@ -609,16 +549,38 @@ int git_packfile_unpack( /* the first one is the base, so we expand that one */ elem = git_array_pop(chain); - curpos = elem->offset; - error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); - git_mwindow_close(&w_curs); + if (elem->cached) { + cached = elem->cached_entry; + memcpy(obj, &cached->raw, sizeof(git_rawobj)); + base_type = obj->type; + } else { + curpos = elem->offset; + error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); + git_mwindow_close(&w_curs); + base_type = elem->type; + free_base = 1; + } if (error < 0) goto cleanup; - base_type = elem->type; + /* + * Finding the object we want as the base element is + * problematic, as we need to make sure we don't accidentally + * give the caller the cached object, which it would then feel + * free to free, so we need to copy the data. + */ + if (cached && git_array_size(chain) == 0) { + void *data = obj->data; + obj->data = git__malloc(obj->len + 1); + GITERR_CHECK_ALLOC(obj->data); + memcpy(obj->data, data, obj->len + 1); + git_atomic_dec(&cached->refcount); + goto cleanup; + } + /* we now apply each consecutive delta until we run out */ - while (git_array_size(chain) > 0) { + while (git_array_size(chain) > 0 && !error) { git_rawobj base, delta; elem = git_array_pop(chain); @@ -636,16 +598,39 @@ int git_packfile_unpack( obj->type = GIT_OBJ_BAD; error = git__delta_apply(obj, base.data, base.len, delta.data, delta.len); + obj->type = base_type; + /* + * We usually don't want to free the base at this + * point, as we put it into the cache in the previous + * iteration. free_base lets us know that we got the + * base object directly from the packfile, so we can free it. + */ git__free(delta.data); - git__free(base.data); + if (free_base) { + free_base = 0; + git__free(base.data); + } + + if (cached) { + git_atomic_dec(&cached->refcount); + cached = NULL; + } if (error < 0) break; - obj->type = base_type; + /* only try to cache if we're not handing this buffer off to the caller */ + if (git_array_size(chain) > 0 && + (error = cache_add(&p->bases, obj, elem->base_key)) < 0) + goto cleanup; } cleanup: + if (error < 0) + git__free(obj->data); + + *obj_offset = elem->offset; + git_array_clear(chain); return error; } @@ -1248,8 +1233,12 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac size_t size; git_otype type; + if (!p->bases.entries && (cache_init(&p->bases) < 0)) + return -1; + while (!found_base && error == 0) { struct pack_chain_elem *elem; + git_pack_cache_entry *cached = NULL; curpos = obj_offset; elem = git_array_alloc(chain); @@ -1262,13 +1251,23 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac if (error < 0) return error; + elem->cached = 0; elem->offset = curpos; elem->size = size; elem->type = type; + elem->base_key = obj_offset; switch (type) { case GIT_OBJ_OFS_DELTA: case GIT_OBJ_REF_DELTA: + /* if we have a base cached, we can stop here instead */ + if ((cached = cache_get(&p->bases, obj_offset)) != NULL) { + elem->cached_entry = cached; + elem->cached = 1; + found_base = 1; + break; + } + base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); git_mwindow_close(&w_curs); diff --git a/src/pack.h b/src/pack.h index a2ea3849f..e86889d1b 100644 --- a/src/pack.h +++ b/src/pack.h @@ -62,9 +62,14 @@ typedef struct git_pack_cache_entry { } git_pack_cache_entry; struct pack_chain_elem { + int cached; + git_off_t base_key; + /* if we don't have it cached we have this */ git_off_t offset; size_t size; git_otype type; + /* if cached, we have this instead */ + git_pack_cache_entry *cached_entry; }; typedef git_array_t(struct pack_chain_elem) git_dependency_chain; From e6d10c58b547181fe19f6bacff6bd0dee9f67b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Thu, 8 May 2014 16:24:54 +0200 Subject: [PATCH 05/10] pack: make sure not to leak the dep chain --- src/pack.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/pack.c b/src/pack.c index c1d7592fd..a8577d389 100644 --- a/src/pack.c +++ b/src/pack.c @@ -1242,14 +1242,16 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac curpos = obj_offset; elem = git_array_alloc(chain); - if (!elem) - return -1; + if (!elem) { + error = -1; + goto on_error; + } error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos); git_mwindow_close(&w_curs); if (error < 0) - return error; + goto on_error; elem->cached = 0; elem->offset = curpos; @@ -1273,8 +1275,10 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac if (base_offset == 0) return packfile_error("delta offset is zero"); - if (base_offset < 0) /* must actually be an error code */ - return (int)base_offset; + if (base_offset < 0) { /* must actually be an error code */ + error = (int)base_offset; + goto on_error; + } /* we need to pass the pos *after* the delta-base bit */ elem->offset = curpos; @@ -1302,9 +1306,10 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac return packfile_error("after dependency chain loop; cannot happen"); } - if (error < 0) - git_array_clear(chain); - *chain_out = chain; return error; + +on_error: + git_array_clear(chain); + return error; } From b2559f477a3f8e2bc76140ca2c76d8cc30b5f5da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Thu, 8 May 2014 17:14:59 +0200 Subject: [PATCH 06/10] pack: preallocate a 64-element chain Dependency chains are often large and require a few reallocations. Allocate a 64-element chain before doing anything else to avoid allocations during the loop. This value comes from the stack-allocated one git uses. We still allocate this on the heap, but it does help performance a little bit. --- src/pack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pack.c b/src/pack.c index a8577d389..664e8efb0 100644 --- a/src/pack.c +++ b/src/pack.c @@ -1236,6 +1236,7 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac if (!p->bases.entries && (cache_init(&p->bases) < 0)) return -1; + git_array_init_to_size(chain, 64); while (!found_base && error == 0) { struct pack_chain_elem *elem; git_pack_cache_entry *cached = NULL; From 9dbd150f5f499bbf768d65c4ebb596651e77a1b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Fri, 9 May 2014 09:36:09 +0200 Subject: [PATCH 07/10] pack: simplify delta chain code The switch makes the loop somewhat unwieldy. Let's assume it's fine and perform the check when we're accessing the data. This makes our code look a lot more like git's. --- src/pack.c | 100 +++++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/src/pack.c b/src/pack.c index 664e8efb0..e1fd276b7 100644 --- a/src/pack.c +++ b/src/pack.c @@ -549,21 +549,38 @@ int git_packfile_unpack( /* the first one is the base, so we expand that one */ elem = git_array_pop(chain); + base_type = elem->type; if (elem->cached) { cached = elem->cached_entry; memcpy(obj, &cached->raw, sizeof(git_rawobj)); - base_type = obj->type; - } else { - curpos = elem->offset; - error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); - git_mwindow_close(&w_curs); - base_type = elem->type; - free_base = 1; } if (error < 0) goto cleanup; + switch (base_type) { + case GIT_OBJ_COMMIT: + case GIT_OBJ_TREE: + case GIT_OBJ_BLOB: + case GIT_OBJ_TAG: + if (!elem->cached) { + curpos = elem->offset; + error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); + git_mwindow_close(&w_curs); + free_base = 1; + } + if (error < 0) + goto cleanup; + break; + case GIT_OBJ_OFS_DELTA: + case GIT_OBJ_REF_DELTA: + error = packfile_error("dependency chain ends in a delta"); + goto cleanup; + default: + error = packfile_error("invalid packfile type in header"); + goto cleanup; + } + /* * Finding the object we want as the base element is * problematic, as we need to make sure we don't accidentally @@ -1229,7 +1246,7 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac git_dependency_chain chain = GIT_ARRAY_INIT; git_mwindow *w_curs = NULL; git_off_t curpos = obj_offset, base_offset; - int error = 0, found_base = 0; + int error = 0; size_t size; git_otype type; @@ -1237,7 +1254,7 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac return -1; git_array_init_to_size(chain, 64); - while (!found_base && error == 0) { + while (true) { struct pack_chain_elem *elem; git_pack_cache_entry *cached = NULL; @@ -1248,6 +1265,16 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac goto on_error; } + elem->base_key = obj_offset; + + /* if we have a base cached, we can stop here instead */ + if ((cached = cache_get(&p->bases, obj_offset)) != NULL) { + elem->cached_entry = cached; + elem->cached = 1; + elem->type = cached->raw.type; + break; + } + error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos); git_mwindow_close(&w_curs); @@ -1260,51 +1287,26 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pac elem->type = type; elem->base_key = obj_offset; - switch (type) { - case GIT_OBJ_OFS_DELTA: - case GIT_OBJ_REF_DELTA: - /* if we have a base cached, we can stop here instead */ - if ((cached = cache_get(&p->bases, obj_offset)) != NULL) { - elem->cached_entry = cached; - elem->cached = 1; - found_base = 1; - break; - } - - base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); - git_mwindow_close(&w_curs); - - if (base_offset == 0) - return packfile_error("delta offset is zero"); - if (base_offset < 0) { /* must actually be an error code */ - error = (int)base_offset; - goto on_error; - } - - /* we need to pass the pos *after* the delta-base bit */ - elem->offset = curpos; - - /* go through the loop again, but with the new object */ - obj_offset = base_offset; + if (type != GIT_OBJ_OFS_DELTA && type != GIT_OBJ_REF_DELTA) break; - /* one of these means we've found the final object in the chain */ - case GIT_OBJ_COMMIT: - case GIT_OBJ_TREE: - case GIT_OBJ_BLOB: - case GIT_OBJ_TAG: - found_base = 1; - break; + base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); + git_mwindow_close(&w_curs); - default: - error = packfile_error("invalid packfile type in header"); - break; + if (base_offset == 0) { + error = packfile_error("delta offset is zero"); + goto on_error; + } + if (base_offset < 0) { /* must actually be an error code */ + error = (int)base_offset; + goto on_error; } - } - if (!found_base) { - git_array_clear(chain); - return packfile_error("after dependency chain loop; cannot happen"); + /* we need to pass the pos *after* the delta-base bit */ + elem->offset = curpos; + + /* go through the loop again, but with the new object */ + obj_offset = base_offset; } *chain_out = chain; From a3ffbf230e454309c96961a182520a53f555d356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Sun, 11 May 2014 03:50:34 +0200 Subject: [PATCH 08/10] pack: expose a cached delta base directly Instead of going through a special entry in the chain, let's pass it as an output parameter. --- src/pack.c | 185 ++++++++++++++++++++++++++--------------------------- src/pack.h | 4 -- 2 files changed, 92 insertions(+), 97 deletions(-) diff --git a/src/pack.c b/src/pack.c index e1fd276b7..b5e8febd4 100644 --- a/src/pack.c +++ b/src/pack.c @@ -40,14 +40,6 @@ static int pack_entry_find_offset( const git_oid *short_oid, size_t len); -/** - * Generate the chain of dependencies which we need to get to the - * object at `off`. `chain` is used a stack, popping gives the right - * order to apply deltas on. If an object is found in the pack's base - * cache, we stop calculating there. - */ -static int pack_dependency_chain(git_dependency_chain *chain, struct git_pack_file *p, git_off_t off); - static int packfile_error(const char *message) { giterr_set(GITERR_ODB, "Invalid pack file - %s", message); @@ -522,6 +514,87 @@ int git_packfile_resolve_header( return error; } +/** + * Generate the chain of dependencies which we need to get to the + * object at `off`. `chain` is used a stack, popping gives the right + * order to apply deltas on. If an object is found in the pack's base + * cache, we stop calculating there. + */ +static int pack_dependency_chain(git_dependency_chain *chain_out, git_pack_cache_entry **cached_out, + git_off_t *cached_off, struct git_pack_file *p, git_off_t obj_offset) +{ + git_dependency_chain chain = GIT_ARRAY_INIT; + git_mwindow *w_curs = NULL; + git_off_t curpos = obj_offset, base_offset; + int error = 0; + size_t size; + git_otype type; + + if (!p->bases.entries && (cache_init(&p->bases) < 0)) + return -1; + + git_array_init_to_size(chain, 64); + while (true) { + struct pack_chain_elem *elem; + git_pack_cache_entry *cached = NULL; + + /* if we have a base cached, we can stop here instead */ + if ((cached = cache_get(&p->bases, obj_offset)) != NULL) { + *cached_out = cached; + *cached_off = obj_offset; + break; + } + + curpos = obj_offset; + elem = git_array_alloc(chain); + if (!elem) { + error = -1; + goto on_error; + } + + elem->base_key = obj_offset; + + error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos); + git_mwindow_close(&w_curs); + + if (error < 0) + goto on_error; + + elem->offset = curpos; + elem->size = size; + elem->type = type; + elem->base_key = obj_offset; + + if (type != GIT_OBJ_OFS_DELTA && type != GIT_OBJ_REF_DELTA) + break; + + base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); + git_mwindow_close(&w_curs); + + if (base_offset == 0) { + error = packfile_error("delta offset is zero"); + goto on_error; + } + if (base_offset < 0) { /* must actually be an error code */ + error = (int)base_offset; + goto on_error; + } + + /* we need to pass the pos *after* the delta-base bit */ + elem->offset = curpos; + + /* go through the loop again, but with the new object */ + obj_offset = base_offset; + } + + *chain_out = chain; + return error; + +on_error: + git_array_clear(chain); + return error; +} + int git_packfile_unpack( git_rawobj *obj, struct git_pack_file *p, @@ -531,7 +604,7 @@ int git_packfile_unpack( git_off_t curpos = *obj_offset; int error, free_base = 0; git_dependency_chain chain = GIT_ARRAY_INIT; - struct pack_chain_elem *elem; + struct pack_chain_elem *elem = NULL; git_pack_cache_entry *cached = NULL; git_otype base_type; @@ -539,7 +612,7 @@ int git_packfile_unpack( * TODO: optionally check the CRC on the packfile */ - error = pack_dependency_chain(&chain, p, *obj_offset); + error = pack_dependency_chain(&chain, &cached, obj_offset, p, *obj_offset); if (error < 0) return error; @@ -547,12 +620,12 @@ int git_packfile_unpack( obj->len = 0; obj->type = GIT_OBJ_BAD; - /* the first one is the base, so we expand that one */ - elem = git_array_pop(chain); - base_type = elem->type; - if (elem->cached) { - cached = elem->cached_entry; + if (cached) { memcpy(obj, &cached->raw, sizeof(git_rawobj)); + base_type = obj->type; + } else { + elem = git_array_pop(chain); + base_type = elem->type; } if (error < 0) @@ -563,10 +636,11 @@ int git_packfile_unpack( case GIT_OBJ_TREE: case GIT_OBJ_BLOB: case GIT_OBJ_TAG: - if (!elem->cached) { + if (!cached) { curpos = elem->offset; error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); git_mwindow_close(&w_curs); + base_type = elem->type; free_base = 1; } if (error < 0) @@ -646,7 +720,8 @@ cleanup: if (error < 0) git__free(obj->data); - *obj_offset = elem->offset; + if (elem) + *obj_offset = elem->offset; git_array_clear(chain); return error; @@ -1240,79 +1315,3 @@ int git_pack_entry_find( git_oid_cpy(&e->sha1, &found_oid); return 0; } - -static int pack_dependency_chain(git_dependency_chain *chain_out, struct git_pack_file *p, git_off_t obj_offset) -{ - git_dependency_chain chain = GIT_ARRAY_INIT; - git_mwindow *w_curs = NULL; - git_off_t curpos = obj_offset, base_offset; - int error = 0; - size_t size; - git_otype type; - - if (!p->bases.entries && (cache_init(&p->bases) < 0)) - return -1; - - git_array_init_to_size(chain, 64); - while (true) { - struct pack_chain_elem *elem; - git_pack_cache_entry *cached = NULL; - - curpos = obj_offset; - elem = git_array_alloc(chain); - if (!elem) { - error = -1; - goto on_error; - } - - elem->base_key = obj_offset; - - /* if we have a base cached, we can stop here instead */ - if ((cached = cache_get(&p->bases, obj_offset)) != NULL) { - elem->cached_entry = cached; - elem->cached = 1; - elem->type = cached->raw.type; - break; - } - - error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos); - git_mwindow_close(&w_curs); - - if (error < 0) - goto on_error; - - elem->cached = 0; - elem->offset = curpos; - elem->size = size; - elem->type = type; - elem->base_key = obj_offset; - - if (type != GIT_OBJ_OFS_DELTA && type != GIT_OBJ_REF_DELTA) - break; - - base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); - git_mwindow_close(&w_curs); - - if (base_offset == 0) { - error = packfile_error("delta offset is zero"); - goto on_error; - } - if (base_offset < 0) { /* must actually be an error code */ - error = (int)base_offset; - goto on_error; - } - - /* we need to pass the pos *after* the delta-base bit */ - elem->offset = curpos; - - /* go through the loop again, but with the new object */ - obj_offset = base_offset; - } - - *chain_out = chain; - return error; - -on_error: - git_array_clear(chain); - return error; -} diff --git a/src/pack.h b/src/pack.h index e86889d1b..610e70c18 100644 --- a/src/pack.h +++ b/src/pack.h @@ -62,14 +62,10 @@ typedef struct git_pack_cache_entry { } git_pack_cache_entry; struct pack_chain_elem { - int cached; git_off_t base_key; - /* if we don't have it cached we have this */ git_off_t offset; size_t size; git_otype type; - /* if cached, we have this instead */ - git_pack_cache_entry *cached_entry; }; typedef git_array_t(struct pack_chain_elem) git_dependency_chain; From 15bcced22379857978d80539da5fdd8f4667ff95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Sun, 11 May 2014 05:31:22 +0200 Subject: [PATCH 09/10] pack: use stack allocation for smaller delta chains This avoid allocating the array on the heap for relatively small chains. The expected performance increase is sadly not really noticeable. --- src/pack.c | 61 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/src/pack.c b/src/pack.c index b5e8febd4..235e8d3ee 100644 --- a/src/pack.c +++ b/src/pack.c @@ -514,26 +514,30 @@ int git_packfile_resolve_header( return error; } +#define SMALL_STACK_SIZE 64 + /** * Generate the chain of dependencies which we need to get to the * object at `off`. `chain` is used a stack, popping gives the right * order to apply deltas on. If an object is found in the pack's base * cache, we stop calculating there. */ -static int pack_dependency_chain(git_dependency_chain *chain_out, git_pack_cache_entry **cached_out, - git_off_t *cached_off, struct git_pack_file *p, git_off_t obj_offset) +static int pack_dependency_chain(git_dependency_chain *chain_out, + git_pack_cache_entry **cached_out, git_off_t *cached_off, + struct pack_chain_elem *small_stack, size_t *stack_sz, + struct git_pack_file *p, git_off_t obj_offset) { git_dependency_chain chain = GIT_ARRAY_INIT; git_mwindow *w_curs = NULL; git_off_t curpos = obj_offset, base_offset; - int error = 0; - size_t size; + int error = 0, use_heap = 0; + size_t size, elem_pos; git_otype type; if (!p->bases.entries && (cache_init(&p->bases) < 0)) return -1; - git_array_init_to_size(chain, 64); + elem_pos = 0; while (true) { struct pack_chain_elem *elem; git_pack_cache_entry *cached = NULL; @@ -545,11 +549,24 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, git_pack_cache break; } + /* if we run out of space on the small stack, use the array */ + if (elem_pos == SMALL_STACK_SIZE) { + git_array_init_to_size(chain, elem_pos); + GITERR_CHECK_ARRAY(chain); + memcpy(chain.ptr, small_stack, elem_pos * sizeof(struct pack_chain_elem)); + chain.size = elem_pos; + use_heap = 1; + } + curpos = obj_offset; - elem = git_array_alloc(chain); - if (!elem) { - error = -1; - goto on_error; + if (!use_heap) { + elem = &small_stack[elem_pos]; + } else { + elem = git_array_alloc(chain); + if (!elem) { + error = -1; + goto on_error; + } } elem->base_key = obj_offset; @@ -585,8 +602,11 @@ static int pack_dependency_chain(git_dependency_chain *chain_out, git_pack_cache /* go through the loop again, but with the new object */ obj_offset = base_offset; + elem_pos++; } + + *stack_sz = elem_pos + 1; *chain_out = chain; return error; @@ -604,15 +624,17 @@ int git_packfile_unpack( git_off_t curpos = *obj_offset; int error, free_base = 0; git_dependency_chain chain = GIT_ARRAY_INIT; - struct pack_chain_elem *elem = NULL; + struct pack_chain_elem *elem = NULL, *stack; git_pack_cache_entry *cached = NULL; + struct pack_chain_elem small_stack[SMALL_STACK_SIZE]; + size_t stack_size, elem_pos; git_otype base_type; /* * TODO: optionally check the CRC on the packfile */ - error = pack_dependency_chain(&chain, &cached, obj_offset, p, *obj_offset); + error = pack_dependency_chain(&chain, &cached, obj_offset, small_stack, &stack_size, p, *obj_offset); if (error < 0) return error; @@ -620,11 +642,16 @@ int git_packfile_unpack( obj->len = 0; obj->type = GIT_OBJ_BAD; + /* let's point to the right stack */ + stack = chain.ptr ? chain.ptr : small_stack; + + elem_pos = stack_size; if (cached) { memcpy(obj, &cached->raw, sizeof(git_rawobj)); base_type = obj->type; + elem_pos--; /* stack_size includes the base, which isn't actually there */ } else { - elem = git_array_pop(chain); + elem = &stack[--elem_pos]; base_type = elem->type; } @@ -661,7 +688,7 @@ int git_packfile_unpack( * give the caller the cached object, which it would then feel * free to free, so we need to copy the data. */ - if (cached && git_array_size(chain) == 0) { + if (cached && stack_size == 1) { void *data = obj->data; obj->data = git__malloc(obj->len + 1); GITERR_CHECK_ALLOC(obj->data); @@ -671,10 +698,10 @@ int git_packfile_unpack( } /* we now apply each consecutive delta until we run out */ - while (git_array_size(chain) > 0 && !error) { + while (elem_pos > 0 && !error) { git_rawobj base, delta; - elem = git_array_pop(chain); + elem = &stack[elem_pos - 1]; curpos = elem->offset; error = packfile_unpack_compressed(&delta, p, &w_curs, &curpos, elem->size, elem->type); git_mwindow_close(&w_curs); @@ -711,9 +738,11 @@ int git_packfile_unpack( break; /* only try to cache if we're not handing this buffer off to the caller */ - if (git_array_size(chain) > 0 && + if (elem_pos != 1 && (error = cache_add(&p->bases, obj, elem->base_key)) < 0) goto cleanup; + + elem_pos--; } cleanup: From c968ce2c2c49ca7a559ecb8f7014f777c3a8a5f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Mon, 12 May 2014 02:01:05 +0200 Subject: [PATCH 10/10] pack: don't forget to cache the base object The base object is a good cache candidate, so we shouldn't forget to add it to the cache. --- src/pack.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/pack.c b/src/pack.c index 235e8d3ee..d93ee25f9 100644 --- a/src/pack.c +++ b/src/pack.c @@ -668,7 +668,6 @@ int git_packfile_unpack( error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type); git_mwindow_close(&w_curs); base_type = elem->type; - free_base = 1; } if (error < 0) goto cleanup; @@ -683,7 +682,7 @@ int git_packfile_unpack( } /* - * Finding the object we want as the base element is + * Finding the object we want a cached base element is * problematic, as we need to make sure we don't accidentally * give the caller the cached object, which it would then feel * free to free, so we need to copy the data. @@ -701,6 +700,13 @@ int git_packfile_unpack( while (elem_pos > 0 && !error) { git_rawobj base, delta; + /* + * We can now try to add the base to the cache, as + * long as it's not already the cached one. + */ + if (!cached) + free_base = !!cache_add(&p->bases, obj, elem->base_key); + elem = &stack[elem_pos - 1]; curpos = elem->offset; error = packfile_unpack_compressed(&delta, p, &w_curs, &curpos, elem->size, elem->type); @@ -737,11 +743,6 @@ int git_packfile_unpack( if (error < 0) break; - /* only try to cache if we're not handing this buffer off to the caller */ - if (elem_pos != 1 && - (error = cache_add(&p->bases, obj, elem->base_key)) < 0) - goto cleanup; - elem_pos--; }