From a6563619e98437a99ff49eef590f62dbb1584358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 5 Feb 2014 13:01:54 +0100 Subject: [PATCH 1/3] commit: faster parsing The current code issues a lot of strncmp() calls in order to check for the end of the header, simply in order to copy it and start going through it again. These are a lot of calls for something we can check as we go along. Knowing the amount of parents beforehand to reduce allocations in extreme cases does not make up for them. Instead start parsing immediately and check for the double-newline after each header field, leaving the raw_header allocation for the end, which lets us go through the header once and reduces the amount of strncmp() calls significantly. In unscientific testing, this has reduced a shortlog-like usage (walking though the whole history of a branch and extracting data from the commits) of git.git from ~830ms to ~700ms and makes the time we spend in strncmp() negligible. --- src/commit.c | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/src/commit.c b/src/commit.c index da7c4992e..730fa6403 100644 --- a/src/commit.c +++ b/src/commit.c @@ -164,33 +164,15 @@ int git_commit__parse(void *_commit, git_odb_object *odb_obj) const char *buffer_start = git_odb_object_data(odb_obj), *buffer; const char *buffer_end = buffer_start + git_odb_object_size(odb_obj); git_oid parent_id; - uint32_t parent_count = 0; size_t header_len; - /* find end-of-header (counting parents as we go) */ - for (buffer = buffer_start; buffer < buffer_end; ++buffer) { - if (!strncmp("\n\n", buffer, 2)) { - ++buffer; - break; - } - if (!strncmp("\nparent ", buffer, strlen("\nparent "))) - ++parent_count; - } + buffer = buffer_start; - header_len = buffer - buffer_start; - commit->raw_header = git__strndup(buffer_start, header_len); - GITERR_CHECK_ALLOC(commit->raw_header); - - /* point "buffer" to header data */ - buffer = commit->raw_header; - buffer_end = commit->raw_header + header_len; - - if (parent_count < 1) - parent_count = 1; - - git_array_init_to_size(commit->parent_ids, parent_count); + /* Allocate for one, which will allow not to realloc 90% of the time */ + git_array_init_to_size(commit->parent_ids, 1); GITERR_CHECK_ARRAY(commit->parent_ids); + /* The tree is always the first field */ if (git_oid__parse(&commit->tree_id, &buffer, buffer_end, "tree ") < 0) goto bad_buffer; @@ -221,6 +203,9 @@ int git_commit__parse(void *_commit, git_odb_object *odb_obj) /* Parse add'l header entries */ while (buffer < buffer_end) { const char *eoln = buffer; + if (buffer[-1] == '\n' && buffer[0] == '\n') + break; + while (eoln < buffer_end && *eoln != '\n') ++eoln; @@ -236,13 +221,12 @@ int git_commit__parse(void *_commit, git_odb_object *odb_obj) buffer = eoln; } - /* point "buffer" to data after header */ - buffer = git_odb_object_data(odb_obj); - buffer_end = buffer + git_odb_object_size(odb_obj); + header_len = buffer - buffer_start; + commit->raw_header = git__strndup(buffer_start, header_len); + GITERR_CHECK_ALLOC(commit->raw_header); - buffer += header_len; - if (*buffer == '\n') - ++buffer; + /* point "buffer" to data after header, +1 for the final LF */ + buffer = buffer_start + header_len + 1; /* extract commit message */ if (buffer <= buffer_end) { From 1e6f0ac4360bae25ef0a9618c9b091192b8e8997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 5 Feb 2014 13:49:51 +0100 Subject: [PATCH 2/3] utils: don't reimplement strnlen The standard library provides a very nice strnlen function, which knows to use SSE, let's not reimplement it ourselves. --- src/util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util.h b/src/util.h index f9de909e9..c18221f49 100644 --- a/src/util.h +++ b/src/util.h @@ -7,6 +7,7 @@ #ifndef INCLUDE_util_h__ #define INCLUDE_util_h__ +#include "posix.h" #include "common.h" #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) @@ -50,8 +51,7 @@ GIT_INLINE(char *) git__strndup(const char *str, size_t n) size_t length = 0; char *ptr; - while (length < n && str[length]) - ++length; + length = p_strnlen(str, n); ptr = (char*)git__malloc(length + 1); From 24f3024f36ca44ca8bb32e1a80a048ac4301eaf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 5 Feb 2014 14:31:01 +0100 Subject: [PATCH 3/3] Split p_strlen into its own header We need this from util.h and posix.h, but the latter includes common.h which includes util.h, which means p_strlen is not defined by the time we get to git__strndup(). Split the definition on p_strlen() off into its own header so we can use it in util.h. --- src/posix.h | 13 +------------ src/strnlen.h | 23 +++++++++++++++++++++++ src/util.h | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) create mode 100644 src/strnlen.h diff --git a/src/posix.h b/src/posix.h index 0d9be49a9..6d3a84eba 100644 --- a/src/posix.h +++ b/src/posix.h @@ -89,18 +89,7 @@ extern struct tm * p_gmtime_r (const time_t *timer, struct tm *result); # include "unix/posix.h" #endif -#if defined(__MINGW32__) || defined(__sun) || defined(__APPLE__) -# define NO_STRNLEN -#endif - -#ifdef NO_STRNLEN -GIT_INLINE(size_t) p_strnlen(const char *s, size_t maxlen) { - const char *end = memchr(s, 0, maxlen); - return end ? (size_t)(end - s) : maxlen; -} -#else -# define p_strnlen strnlen -#endif +#include "strnlen.h" #ifdef NO_READDIR_R # include diff --git a/src/strnlen.h b/src/strnlen.h new file mode 100644 index 000000000..007da2e55 --- /dev/null +++ b/src/strnlen.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#ifndef INCLUDE_strlen_h__ +#define INCLUDE_strlen_h__ + +#if defined(__MINGW32__) || defined(__sun) || defined(__APPLE__) +# define NO_STRNLEN +#endif + +#ifdef NO_STRNLEN +GIT_INLINE(size_t) p_strnlen(const char *s, size_t maxlen) { + const char *end = memchr(s, 0, maxlen); + return end ? (size_t)(end - s) : maxlen; +} +#else +# define p_strnlen strnlen +#endif + +#endif diff --git a/src/util.h b/src/util.h index c18221f49..e378786d9 100644 --- a/src/util.h +++ b/src/util.h @@ -7,8 +7,8 @@ #ifndef INCLUDE_util_h__ #define INCLUDE_util_h__ -#include "posix.h" #include "common.h" +#include "strnlen.h" #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) #define bitsizeof(x) (CHAR_BIT * sizeof(x))