From 52462e1ccecdea86cceae9d25bf343265831bbaf Mon Sep 17 00:00:00 2001 From: pontusm Date: Sun, 13 May 2012 10:11:13 +0200 Subject: [PATCH 01/10] Test case to reproduce issue #690. Staged file status does not handle CRLF correctly. Ensures that the test repo has core.autocrlf=true for the test to fail. --- tests-clar/status/worktree.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests-clar/status/worktree.c b/tests-clar/status/worktree.c index 75975c988..d4ae48b8d 100644 --- a/tests-clar/status/worktree.c +++ b/tests-clar/status/worktree.c @@ -797,3 +797,30 @@ void test_status_worktree__interruptable_foreach(void) cl_assert_equal_i(8, count); } + +void test_status_worktree__new_staged_file_must_handle_crlf(void) +{ + git_repository *repo; + git_index *index; + git_config *config; + unsigned int status; + + cl_git_pass(git_repository_init(&repo, "getting_started", 0)); + + // Ensure that repo has core.autocrlf=true + cl_git_pass(git_repository_config(&config, repo)); + cl_git_pass(git_config_set_bool(config, "core.autocrlf", true)); + + cl_git_mkfile("getting_started/testfile.txt", "content\r\n"); // Content with CRLF + + cl_git_pass(git_repository_index(&index, repo)); + cl_git_pass(git_index_add(index, "testfile.txt", 0)); + cl_git_pass(git_index_write(index)); + + cl_git_pass(git_status_file(&status, repo, "testfile.txt")); + cl_assert_equal_i(GIT_STATUS_INDEX_NEW, status); + + git_config_free(config); + git_index_free(index); + git_repository_free(repo); +} From f8e2cc9a0a59dc87f8e8842b6818f3df180fffda Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Fri, 31 Aug 2012 15:53:47 -0700 Subject: [PATCH 02/10] Alternate test for autocrlf with status I couldn't get the last failing test to actually fail. This is a different test suggested by @nulltoken which should fail. --- src/crlf.c | 2 +- tests-clar/status/worktree.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/crlf.c b/src/crlf.c index 509e55897..1b6898ba6 100644 --- a/src/crlf.c +++ b/src/crlf.c @@ -276,7 +276,7 @@ static int find_and_add_filter(git_vector *filters, git_repository *repo, const /* * Use the core Git logic to see if we should perform CRLF for this file - * based on its attributes & the value of `core.auto_crlf` + * based on its attributes & the value of `core.autocrlf` */ ca.crlf_action = crlf_input_action(&ca); diff --git a/tests-clar/status/worktree.c b/tests-clar/status/worktree.c index d4ae48b8d..c0412ef96 100644 --- a/tests-clar/status/worktree.c +++ b/tests-clar/status/worktree.c @@ -824,3 +824,24 @@ void test_status_worktree__new_staged_file_must_handle_crlf(void) git_index_free(index); git_repository_free(repo); } + +void test_status_worktree__line_endings_dont_count_as_changes_with_autocrlf(void) +{ + git_repository *repo = cl_git_sandbox_init("status"); + git_config *config; + unsigned int status; + + cl_git_pass(git_repository_config(&config, repo)); + cl_git_pass(git_config_set_bool(config, "core.autocrlf", true)); + git_config_free(config); + + cl_git_rewritefile("status/current_file", "current_file\r\n"); + + cl_git_pass(git_status_file(&status, repo, "current_file")); + +#ifdef GIT_WIN32 + cl_assert_equal_i(GIT_STATUS_CURRENT, status); +#else + cl_assert_equal_i(GIT_STATUS_WT_MODIFIED, status); +#endif +} From 8f9b6a132b358b23b518197240184e2f08e0a913 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Fri, 31 Aug 2012 16:39:30 -0700 Subject: [PATCH 03/10] Better header comments --- include/git2/diff.h | 56 ++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/include/git2/diff.h b/include/git2/diff.h index 85bb308dd..2898f3b20 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -386,18 +386,19 @@ GIT_EXTERN(void) git_diff_iterator_free(git_diff_iterator *iterator); /** * Return the number of files in the diff. * - * Note that there is an uncommon scenario where this number might be too - * high -- if a file in the working directory has been "touched" on disk but - * the contents were then reverted, it might have been added to the - * `git_diff_list` as a MODIFIED file along with a note that the status - * needs to be confirmed when the file contents are loaded into memory. In - * that case, when the file is loaded, we will check the contents and might - * switch it back to UNMODIFIED. The loading of the file is deferred until - * as late as possible. As a result, this might return a value what was too - * high in those circumstances. + * NOTE: This number has to be treated as an upper bound on the number of + * files that have changed if the diff is with the working directory. * - * This is true of `git_diff_foreach` as well, but the only implication - * there is that the `progress` value would not advance evenly. + * Why?! For efficiency, we defer loading the file contents as long as + * possible, so if a file has been "touched" in the working directory and + * then reverted to the original content, it may get stored in the diff list + * as MODIFIED along with a flag that the status should be reconfirmed when + * it is actually loaded into memory. When that load happens, it could get + * flipped to UNMODIFIED. If unmodified files are being skipped, then the + * iterator will skip that file and this number may be too high. + * + * This behavior is true of `git_diff_foreach` as well, but the only + * implication there is that the `progress` value would not advance evenly. * * @param iterator The iterator object * @return The maximum number of files to be iterated over @@ -450,16 +451,19 @@ GIT_EXTERN(int) git_diff_iterator_next_file( * It is recommended that you not call this if the file is a binary * file, but it is allowed to do so. * - * Warning! Call this function for the first time on a file is when the + * The `header` text output will contain the standard hunk header that + * would appear in diff output. The header string will be NUL terminated. + * + * WARNING! Call this function for the first time on a file is when the * actual text diff will be computed (it cannot be computed incrementally) * so the first call for a new file is expensive (at least in relative * terms - in reality, it is still pretty darn fast). * - * @param range Pointer where to store the range for the hunk - * @param header Pointer where to store the header for the chunk; - * this string is owned by the library and should not be freed by - * the user - * @param header_len Pointer where to store the length of the returned header + * @param range Output pointer to range of lines covered by the hunk; + * This range object is owned by the library and should not be freed. + * @param header Output pointer to the text of the hunk header + * This string is owned by the library and should not be freed. + * @param header_len Output pointer to store the length of the header text * @param iterator The iterator object * @return 0 on success, GIT_ITEROVER when done with current file, other * value < 0 on error @@ -473,12 +477,18 @@ GIT_EXTERN(int) git_diff_iterator_next_hunk( /** * Return the next line of the current hunk of diffs. * - * @param line_origin Pointer where to store a GIT_DIFF_LINE_ value; - * this value is a single character, not a buffer - * @param content Pointer where to store the content of the line; - * this string is owned by the library and should not be freed by - * the user - * @param Pointer where to store the length of the returned content + * The `line_origin` output will tell you what type of line this is + * (e.g. was it added or removed or is it just context for the diff). + * + * The `content` will be a pointer to the file data that goes in the + * line. IT WILL NOT BE NUL TERMINATED. You have to use the `content_len` + * value and only process that many bytes of data from the content string. + * + * @param line_origin Output pointer to store a GIT_DIFF_LINE value for this + * next chunk of data. The value is a single character, not a buffer. + * @param content Output pointer to store the content of the diff; this + * string is owned by the library and should not be freed. + * @param content_len Output pointer to store the length of the content. * @param iterator The iterator object * @return 0 on success, GIT_ITEROVER when done with current line, other * value < 0 on error From 60b9d3fcef04a6beb0ad4df225ada058afabf0b9 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Wed, 5 Sep 2012 15:00:40 -0700 Subject: [PATCH 04/10] Implement filters for status/diff blobs This adds support to diff and status for running filters (a la crlf) on blobs in the workdir before computing SHAs and before generating text diffs. This ended up being a bit more code change than I had thought since I had to reorganize some of the diff logic to minimize peak memory use when filtering blobs in a diff. This also adds a cap on the maximum size of data that will be loaded to diff. I set it at 512Mb which should match core git. Right now it is a #define in src/diff.h but it could be moved into the public API if desired. --- src/diff.c | 33 ++++-- src/diff.h | 2 + src/diff_output.c | 220 ++++++++++++++++++++++++----------- src/fileops.c | 62 ++++++---- src/fileops.h | 1 + src/odb.c | 33 +++++- src/odb.h | 17 ++- tests-clar/status/worktree.c | 4 - 8 files changed, 263 insertions(+), 109 deletions(-) diff --git a/src/diff.c b/src/diff.c index f8a01086c..499b95b44 100644 --- a/src/diff.c +++ b/src/diff.c @@ -11,6 +11,7 @@ #include "fileops.h" #include "config.h" #include "attr_file.h" +#include "filter.h" static char *diff_prefix_from_pathspec(const git_strarray *pathspec) { @@ -63,8 +64,8 @@ static bool diff_path_matches_pathspec(git_diff_list *diff, const char *path) git_vector_foreach(&diff->pathspec, i, match) { int result = strcmp(match->pattern, path) ? FNM_NOMATCH : 0; - - if (((diff->opts.flags & GIT_DIFF_DISABLE_PATHSPEC_MATCH) == 0) && + + if (((diff->opts.flags & GIT_DIFF_DISABLE_PATHSPEC_MATCH) == 0) && result == FNM_NOMATCH) result = p_fnmatch(match->pattern, path, 0); @@ -262,12 +263,14 @@ static int diff_delta__from_two( delta = diff_delta__alloc(diff, status, old_entry->path); GITERR_CHECK_ALLOC(delta); - delta->old_file.mode = old_mode; git_oid_cpy(&delta->old_file.oid, &old_entry->oid); + delta->old_file.size = old_entry->file_size; + delta->old_file.mode = old_mode; delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; - delta->new_file.mode = new_mode; git_oid_cpy(&delta->new_file.oid, new_oid ? new_oid : &new_entry->oid); + delta->new_file.size = new_entry->file_size; + delta->new_file.mode = new_mode; if (new_oid || !git_oid_iszero(&new_entry->oid)) delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; @@ -440,14 +443,22 @@ static int oid_for_workdir_item( giterr_set(GITERR_OS, "File size overflow for 32-bit systems"); result = -1; } else { - int fd = git_futils_open_ro(full_path.ptr); - if (fd < 0) - result = fd; - else { - result = git_odb__hashfd( - oid, fd, (size_t)item->file_size, GIT_OBJ_BLOB); - p_close(fd); + git_vector filters = GIT_VECTOR_INIT; + + result = git_filters_load( + &filters, repo, item->path, GIT_FILTER_TO_ODB); + if (result >= 0) { + int fd = git_futils_open_ro(full_path.ptr); + if (fd < 0) + result = fd; + else { + result = git_odb__hashfd_filtered( + oid, fd, (size_t)item->file_size, GIT_OBJ_BLOB, &filters); + p_close(fd); + } } + + git_filters_free(&filters); } git_buf_free(&full_path); diff --git a/src/diff.h b/src/diff.h index 2785fa425..def746323 100644 --- a/src/diff.h +++ b/src/diff.h @@ -25,6 +25,8 @@ enum { GIT_DIFFCAPS_USE_DEV = (1 << 4), /* use st_dev? */ }; +#define MAX_DIFF_FILESIZE 0x20000000 + struct git_diff_list { git_refcount rc; git_repository *repo; diff --git a/src/diff_output.c b/src/diff_output.c index 2c64b92ee..e2ca8cf3e 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -22,7 +22,18 @@ * git_diff_foreach() call it is an emphemeral structure that is filled * in to execute each diff. In the case of a git_diff_iterator, it holds * most of the information for the diff in progress. - */ + * + * As each delta is processed, it goes through 3 phases: prep, load, exec. + * + * - In the prep phase, we just set the delta and quickly check the file + * attributes to see if it should be treated as binary. + * - In the load phase, we actually load the file content into memory. + * At this point, if we had deferred calculating OIDs, we might have to + * correct the delta to be UNMODIFIED. + * - In the exec phase, we actually run the diff and execute the callbacks. + * For foreach, this is just a pass-through to the user's callbacks. For + * iterators, we record the hunks and data spans into memory. + */ typedef struct { git_repository *repo; git_diff_options *opts; @@ -263,18 +274,40 @@ static void setup_xdiff_options( static int get_blob_content( git_repository *repo, - const git_oid *oid, + git_diff_file *file, git_map *map, git_blob **blob) { - if (git_oid_iszero(oid)) + int error; + git_odb *odb; + size_t len; + git_otype type; + + if (git_oid_iszero(&file->oid)) return 0; - if (git_blob_lookup(blob, repo, oid) < 0) - return -1; + /* peek at object header to avoid loading if too large */ + if ((error = git_repository_odb__weakptr(&odb, repo)) < 0 || + (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + return error; + + assert(type == GIT_OBJ_BLOB); + + /* if blob is too large to diff, mark as binary */ + if (len > MAX_DIFF_FILESIZE) { + file->flags |= GIT_DIFF_FILE_BINARY; + return 0; + } + + if (!file->size) + file->size = len; + + if ((error = git_blob_lookup(blob, repo, &file->oid)) < 0) + return error; map->data = (void *)git_blob_rawcontent(*blob); map->len = git_blob_rawsize(*blob); + return 0; } @@ -307,13 +340,66 @@ static int get_workdir_content( if (read_len < 0) { giterr_set(GITERR_OS, "Failed to read symlink '%s'", file->path); error = -1; - } else - map->len = read_len; + goto cleanup; + } + + map->len = read_len; } else { - error = git_futils_mmap_ro_file(map, path.ptr); - file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + git_file fd = git_futils_open_ro(path.ptr); + git_vector filters = GIT_VECTOR_INIT; + + if (fd < 0) { + error = fd; + goto cleanup; + } + + if (!file->size) + file->size = git_futils_filesize(fd); + + /* if file is too large to diff, mark as binary */ + if (file->size > MAX_DIFF_FILESIZE) { + file->flags |= GIT_DIFF_FILE_BINARY; + goto close_and_cleanup; + } + + error = git_filters_load(&filters, repo, file->path, GIT_FILTER_TO_ODB); + if (error < 0) + goto close_and_cleanup; + + if (error == 0) { /* note: git_filters_load returns filter count */ + error = git_futils_mmap_ro(map, fd, 0, (size_t)file->size); + file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + } else { + git_buf raw = GIT_BUF_INIT, filtered = GIT_BUF_INIT; + + if (!(error = git_futils_readbuffer_fd(&raw, fd, (size_t)file->size)) && + !(error = git_filters_apply(&filtered, &raw, &filters))) + { + map->len = git_buf_len(&filtered); + map->data = git_buf_detach(&filtered); + + file->flags |= GIT_DIFF_FILE_FREE_DATA; + } + + git_buf_free(&raw); + git_buf_free(&filtered); + } + +close_and_cleanup: + git_filters_free(&filters); + p_close(fd); } + + /* once data is loaded, update OID if we didn't have it previously */ + if (!error && (file->flags & GIT_DIFF_FILE_VALID_OID) == 0) { + error = git_odb_hash( + &file->oid, map->data, map->len, GIT_OBJ_BLOB); + if (!error) + file->flags |= GIT_DIFF_FILE_VALID_OID; + } + +cleanup: git_buf_free(&path); return error; } @@ -393,7 +479,9 @@ static int diff_delta_prep(diff_delta_context *ctxt) static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; + git_repository *repo = ctxt->repo; git_diff_delta *delta = ctxt->delta; + bool load_old = false, load_new = false, check_if_unmodified = false; if (ctxt->loaded || !ctxt->delta) return 0; @@ -405,75 +493,77 @@ static int diff_delta_load(diff_delta_context *ctxt) ctxt->old_data.len = 0; ctxt->old_blob = NULL; - if (!error && delta->binary != 1 && - (delta->status == GIT_DELTA_DELETED || - delta->status == GIT_DELTA_MODIFIED)) - { - if (ctxt->old_src == GIT_ITERATOR_WORKDIR) - error = get_workdir_content( - ctxt->repo, &delta->old_file, &ctxt->old_data); - else { - error = get_blob_content( - ctxt->repo, &delta->old_file.oid, - &ctxt->old_data, &ctxt->old_blob); - - if (ctxt->new_src == GIT_ITERATOR_WORKDIR) { - /* TODO: convert crlf of blob content */ - } - } - } - ctxt->new_data.data = ""; ctxt->new_data.len = 0; ctxt->new_blob = NULL; - if (!error && delta->binary != 1 && - (delta->status == GIT_DELTA_ADDED || - delta->status == GIT_DELTA_MODIFIED)) - { - if (ctxt->new_src == GIT_ITERATOR_WORKDIR) - error = get_workdir_content( - ctxt->repo, &delta->new_file, &ctxt->new_data); - else { - error = get_blob_content( - ctxt->repo, &delta->new_file.oid, - &ctxt->new_data, &ctxt->new_blob); + if (delta->binary == 1) + goto cleanup; - if (ctxt->old_src == GIT_ITERATOR_WORKDIR) { - /* TODO: convert crlf of blob content */ - } - } - - if (!error && !(delta->new_file.flags & GIT_DIFF_FILE_VALID_OID)) { - error = git_odb_hash( - &delta->new_file.oid, ctxt->new_data.data, - ctxt->new_data.len, GIT_OBJ_BLOB); - if (error < 0) - goto cleanup; - - delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; - - /* since we did not have the definitive oid, we may have - * incorrect status and need to skip this item. - */ - if (delta->old_file.mode == delta->new_file.mode && - !git_oid_cmp(&delta->old_file.oid, &delta->new_file.oid)) - { - delta->status = GIT_DELTA_UNMODIFIED; - - if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) - goto cleanup; - } - } + switch (delta->status) { + case GIT_DELTA_ADDED: load_new = true; break; + case GIT_DELTA_DELETED: load_old = true; break; + case GIT_DELTA_MODIFIED: load_new = load_old = true; break; + default: break; } + check_if_unmodified = + (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) || + (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0); + + /* Always try to load workdir content first, since it may need to be + * filtered (and hence use 2x memory) and we want to minimize the max + * memory footprint during diff. + */ + + if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { + if ((error = get_workdir_content( + repo, &delta->old_file, &ctxt->old_data)) < 0) + goto cleanup; + + if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0) + goto cleanup; + } + + if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { + if ((error = get_workdir_content( + repo, &delta->new_file, &ctxt->new_data)) < 0) + goto cleanup; + + if ((delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) + goto cleanup; + } + + if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR && + (error = get_blob_content( + repo, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) + goto cleanup; + + if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR && + (error = get_blob_content( + repo, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) + goto cleanup; + + /* if we did not previously have the definitive oid, we may have + * incorrect status and need to switch this to UNMODIFIED. + */ + if (check_if_unmodified && + delta->old_file.mode == delta->new_file.mode && + !git_oid_cmp(&delta->old_file.oid, &delta->new_file.oid)) + { + delta->status = GIT_DELTA_UNMODIFIED; + + if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) + goto cleanup; + } + +cleanup: /* if we have not already decided whether file is binary, * check the first 4K for nul bytes to decide... */ if (!error && delta->binary == -1) error = diff_delta_is_binary_by_content(ctxt); -cleanup: ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ diff --git a/src/fileops.c b/src/fileops.c index 95eacb5f1..d4def1a9a 100644 --- a/src/fileops.c +++ b/src/fileops.c @@ -115,10 +115,47 @@ mode_t git_futils_canonical_mode(mode_t raw_mode) return 0; } -int git_futils_readbuffer_updated(git_buf *buf, const char *path, time_t *mtime, int *updated) +#define MAX_READ_STALLS 10 + +int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len) +{ + int stalls = MAX_READ_STALLS; + + git_buf_clear(buf); + + if (git_buf_grow(buf, len + 1) < 0) + return -1; + + buf->ptr[len] = '\0'; + + while (len > 0) { + ssize_t read_size = p_read(fd, buf->ptr + buf->size, len); + + if (read_size < 0) { + giterr_set(GITERR_OS, "Failed to read descriptor"); + return -1; + } + + if (read_size == 0) { + stalls--; + + if (!stalls) { + giterr_set(GITERR_OS, "Too many stalls reading descriptor"); + return -1; + } + } + + len -= read_size; + buf->size += read_size; + } + + return 0; +} + +int git_futils_readbuffer_updated( + git_buf *buf, const char *path, time_t *mtime, int *updated) { git_file fd; - size_t len; struct stat st; assert(buf && path && *path); @@ -147,30 +184,11 @@ int git_futils_readbuffer_updated(git_buf *buf, const char *path, time_t *mtime, if (mtime != NULL) *mtime = st.st_mtime; - len = (size_t) st.st_size; - - git_buf_clear(buf); - - if (git_buf_grow(buf, len + 1) < 0) { + if (git_futils_readbuffer_fd(buf, fd, (size_t)st.st_size) < 0) { p_close(fd); return -1; } - buf->ptr[len] = '\0'; - - while (len > 0) { - ssize_t read_size = p_read(fd, buf->ptr, len); - - if (read_size < 0) { - p_close(fd); - giterr_set(GITERR_OS, "Failed to read descriptor for '%s'", path); - return -1; - } - - len -= read_size; - buf->size += read_size; - } - p_close(fd); if (updated != NULL) diff --git a/src/fileops.h b/src/fileops.h index 5c23ce30b..d2944f460 100644 --- a/src/fileops.h +++ b/src/fileops.h @@ -19,6 +19,7 @@ */ extern int git_futils_readbuffer(git_buf *obj, const char *path); extern int git_futils_readbuffer_updated(git_buf *obj, const char *path, time_t *mtime, int *updated); +extern int git_futils_readbuffer_fd(git_buf *obj, git_file fd, size_t len); /** * File utils diff --git a/src/odb.c b/src/odb.c index 34033d15c..83c7a80fc 100644 --- a/src/odb.c +++ b/src/odb.c @@ -12,6 +12,7 @@ #include "hash.h" #include "odb.h" #include "delta-apply.h" +#include "filter.h" #include "git2/odb_backend.h" #include "git2/oid.h" @@ -118,11 +119,12 @@ int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type) hdr_len = format_object_header(hdr, sizeof(hdr), size, type); ctx = git_hash_new_ctx(); + GITERR_CHECK_ALLOC(ctx); git_hash_update(ctx, hdr, hdr_len); while (size > 0) { - ssize_t read_len = read(fd, buffer, sizeof(buffer)); + ssize_t read_len = p_read(fd, buffer, sizeof(buffer)); if (read_len < 0) { git_hash_free_ctx(ctx); @@ -140,6 +142,33 @@ int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type) return 0; } +int git_odb__hashfd_filtered( + git_oid *out, git_file fd, size_t size, git_otype type, git_vector *filters) +{ + int error; + git_buf raw = GIT_BUF_INIT; + git_buf filtered = GIT_BUF_INIT; + + if (!filters || !filters->length) + return git_odb__hashfd(out, fd, size, type); + + /* size of data is used in header, so we have to read the whole file + * into memory to apply filters before beginning to calculate the hash + */ + + if (!(error = git_futils_readbuffer_fd(&raw, fd, size))) + error = git_filters_apply(&filtered, &raw, filters); + + git_buf_free(&raw); + + if (!error) + error = git_odb_hash(out, filtered.ptr, filtered.size, type); + + git_buf_free(&filtered); + + return error; +} + int git_odb__hashlink(git_oid *out, const char *path) { struct stat st; @@ -171,7 +200,7 @@ int git_odb__hashlink(git_oid *out, const char *path) result = git_odb_hash(out, link_data, (size_t)size, GIT_OBJ_BLOB); git__free(link_data); - } else { + } else { int fd = git_futils_open_ro(path); if (fd < 0) return -1; diff --git a/src/odb.h b/src/odb.h index 263e4c30b..696e12943 100644 --- a/src/odb.h +++ b/src/odb.h @@ -58,12 +58,19 @@ int git_odb__hashobj(git_oid *id, git_rawobj *obj); int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type); /* - * Hash a `path`, assuming it could be a POSIX symlink: if the path is a symlink, - * then the raw contents of the symlink will be hashed. Otherwise, this will - * fallback to `git_odb__hashfd`. + * Hash an open file descriptor applying an array of filters + * Acts just like git_odb__hashfd with the addition of filters... + */ +int git_odb__hashfd_filtered( + git_oid *out, git_file fd, size_t len, git_otype type, git_vector *filters); + +/* + * Hash a `path`, assuming it could be a POSIX symlink: if the path is a + * symlink, then the raw contents of the symlink will be hashed. Otherwise, + * this will fallback to `git_odb__hashfd`. * - * The hash type for this call is always `GIT_OBJ_BLOB` because symlinks may only - * point to blobs. + * The hash type for this call is always `GIT_OBJ_BLOB` because symlinks may + * only point to blobs. */ int git_odb__hashlink(git_oid *out, const char *path); diff --git a/tests-clar/status/worktree.c b/tests-clar/status/worktree.c index c0412ef96..05e396e1f 100644 --- a/tests-clar/status/worktree.c +++ b/tests-clar/status/worktree.c @@ -839,9 +839,5 @@ void test_status_worktree__line_endings_dont_count_as_changes_with_autocrlf(void cl_git_pass(git_status_file(&status, repo, "current_file")); -#ifdef GIT_WIN32 cl_assert_equal_i(GIT_STATUS_CURRENT, status); -#else - cl_assert_equal_i(GIT_STATUS_WT_MODIFIED, status); -#endif } From 3a3deea80bb6555706f58006bdee8e878b0fd651 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Thu, 6 Sep 2012 15:45:50 -0700 Subject: [PATCH 05/10] Clean up blob diff path Previously when diffing blobs, the diff code just ran with a NULL repository object. Of course, that's not necessary and the test for a NULL repo was confusing. This makes the blob diff run with the repo that contains the blobs and clarifies the test that it is possible to be diffing data where the path is unknown. --- src/diff_output.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/diff_output.c b/src/diff_output.c index e2ca8cf3e..6ff880e95 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -151,7 +151,8 @@ static int update_file_is_binary_by_attr( { const char *value; - if (!repo) + /* because of blob diffs, cannot assume path is set */ + if (!file->path || !strlen(file->path)) return 0; if (git_attr_get(&value, repo, 0, file->path, "diff") < 0) @@ -1028,6 +1029,7 @@ int git_diff_blobs( diff_delta_context ctxt; git_diff_delta delta; git_blob *new, *old; + git_repository *repo; new = new_blob; old = old_blob; @@ -1038,8 +1040,15 @@ int git_diff_blobs( new = swap; } + if (new) + repo = git_object_owner((git_object *)new); + else if (old) + repo = git_object_owner((git_object *)old); + else + repo = NULL; + diff_delta_init_context( - &ctxt, NULL, options, GIT_ITERATOR_TREE, GIT_ITERATOR_TREE); + &ctxt, repo, options, GIT_ITERATOR_TREE, GIT_ITERATOR_TREE); /* populate a "fake" delta record */ From b36effa22e015871948daeea250b4996c663e11a Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 10 Sep 2012 09:59:14 -0700 Subject: [PATCH 06/10] Replace git_diff_iterator_num_files with progress The `git_diff_iterator_num_files` API was problematic, since we don't actually know the exact number of files to be iterated over until we load those files into memory. This replaces it with a new `git_diff_iterator_progress` API that goes from 0 to 1, and moves and renamed the old API for the internal places that can tolerate a max value instead of an exact value. --- include/git2/diff.h | 24 +++++++----------------- src/diff.h | 22 ++++++++++++++++++++++ src/diff_output.c | 18 +++++++----------- tests-clar/diff/diff_helpers.c | 8 +++----- 4 files changed, 39 insertions(+), 33 deletions(-) diff --git a/include/git2/diff.h b/include/git2/diff.h index 2898f3b20..7a86d2463 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -384,26 +384,16 @@ GIT_EXTERN(int) git_diff_iterator_new( GIT_EXTERN(void) git_diff_iterator_free(git_diff_iterator *iterator); /** - * Return the number of files in the diff. + * Return progress value for traversing the diff. * - * NOTE: This number has to be treated as an upper bound on the number of - * files that have changed if the diff is with the working directory. + * This returns a value between 0.0 and 1.0 that represents the progress + * through the diff iterator. The value is monotonically increasing and + * will advance gradually as you progress through the iteration. * - * Why?! For efficiency, we defer loading the file contents as long as - * possible, so if a file has been "touched" in the working directory and - * then reverted to the original content, it may get stored in the diff list - * as MODIFIED along with a flag that the status should be reconfirmed when - * it is actually loaded into memory. When that load happens, it could get - * flipped to UNMODIFIED. If unmodified files are being skipped, then the - * iterator will skip that file and this number may be too high. - * - * This behavior is true of `git_diff_foreach` as well, but the only - * implication there is that the `progress` value would not advance evenly. - * - * @param iterator The iterator object - * @return The maximum number of files to be iterated over + * @param iterator The diff iterator + * @return Value between 0.0 and 1.0 */ -GIT_EXTERN(int) git_diff_iterator_num_files(git_diff_iterator *iterator); +GIT_EXTERN(float) git_diff_iterator_progress(git_diff_iterator *iterator); /** * Return the number of hunks in the current file diff --git a/src/diff.h b/src/diff.h index def746323..ea38a678f 100644 --- a/src/diff.h +++ b/src/diff.h @@ -42,5 +42,27 @@ struct git_diff_list { extern void git_diff__cleanup_modes( uint32_t diffcaps, uint32_t *omode, uint32_t *nmode); +/** + * Return the maximum possible number of files in the diff. + * + * NOTE: This number has to be treated as an upper bound on the number of + * files that have changed if the diff is with the working directory. + * + * Why?! For efficiency, we defer loading the file contents as long as + * possible, so if a file has been "touched" in the working directory and + * then reverted to the original content, it may get stored in the diff list + * as MODIFIED along with a flag that the status should be reconfirmed when + * it is actually loaded into memory. When that load happens, it could get + * flipped to UNMODIFIED. If unmodified files are being skipped, then the + * iterator will skip that file and this number may be too high. + * + * This behavior is true of `git_diff_foreach` as well, but the only + * implication there is that the `progress` value would not advance evenly. + * + * @param iterator The iterator object + * @return The maximum number of files to be iterated over + */ +int git_diff_iterator__max_files(git_diff_iterator *iterator); + #endif diff --git a/src/diff_output.c b/src/diff_output.c index 6ff880e95..f65d0057f 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -1115,7 +1115,6 @@ struct git_diff_iterator { diff_delta_context ctxt; size_t file_index; size_t next_index; - size_t file_count; git_pool hunks; size_t hunk_count; diffiter_hunk *hunk_head; @@ -1239,8 +1238,6 @@ int git_diff_iterator_new( git_diff_iterator **iterator_ptr, git_diff_list *diff) { - size_t i; - git_diff_delta *delta; git_diff_iterator *iter; assert(diff && iterator_ptr); @@ -1261,12 +1258,6 @@ int git_diff_iterator_new( git_pool_init(&iter->lines, sizeof(diffiter_line), 0) < 0) goto fail; - git_vector_foreach(&diff->deltas, i, delta) { - if (diff_delta_should_skip(iter->ctxt.opts, delta)) - continue; - iter->file_count++; - } - *iterator_ptr = iter; return 0; @@ -1284,9 +1275,14 @@ void git_diff_iterator_free(git_diff_iterator *iter) git__free(iter); } -int git_diff_iterator_num_files(git_diff_iterator *iter) +float git_diff_iterator_progress(git_diff_iterator *iter) { - return (int)iter->file_count; + return (float)iter->next_index / (float)iter->diff->deltas.length; +} + +int git_diff_iterator__max_files(git_diff_iterator *iter) +{ + return (int)iter->diff->deltas.length; } int git_diff_iterator_num_hunks_in_file(git_diff_iterator *iter) diff --git a/tests-clar/diff/diff_helpers.c b/tests-clar/diff/diff_helpers.c index 59e01802c..ef59b686f 100644 --- a/tests-clar/diff/diff_helpers.c +++ b/tests-clar/diff/diff_helpers.c @@ -111,23 +111,21 @@ int diff_foreach_via_iterator( git_diff_hunk_fn hunk_cb, git_diff_data_fn line_cb) { - int error, curr, total; + int error; git_diff_iterator *iter; git_diff_delta *delta; if ((error = git_diff_iterator_new(&iter, diff)) < 0) return error; - curr = 0; - total = git_diff_iterator_num_files(iter); - while (!(error = git_diff_iterator_next_file(&delta, iter))) { git_diff_range *range; const char *hdr; size_t hdr_len; + float progress = git_diff_iterator_progress(iter); /* call file_cb for this file */ - if (file_cb != NULL && file_cb(data, delta, (float)curr / total) != 0) + if (file_cb != NULL && file_cb(data, delta, progress) != 0) goto abort; if (!hunk_cb && !line_cb) From e597b1890ebd43e398d84b7bf4ca366365b24d27 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 10 Sep 2012 11:49:12 -0700 Subject: [PATCH 07/10] Move diff max_size to public API This commit adds a max_size value in the public `git_diff_options` structure so that the user can automatically flag blobs over a certain size as binary regardless of other properties. Also, and perhaps more importantly, this moves binary detection to be as early as possible in the diff traversal inner loop and makes sure that we stop loading objects as soon as we decide that they are binary. --- include/git2/diff.h | 9 ++- src/diff_output.c | 148 +++++++++++++++++++++++++------------------- 2 files changed, 91 insertions(+), 66 deletions(-) diff --git a/include/git2/diff.h b/include/git2/diff.h index 7a86d2463..4b4591a9e 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -56,7 +56,13 @@ enum { * values. Similarly, passing NULL for the options structure will * give the defaults. The default values are marked below. * - * @todo Most of the parameters here are not actually supported at this time. + * - flags: a combination of the GIT_DIFF_... values above + * - context_lines: number of lines of context to show around diffs + * - interhunk_lines: min lines between diff hunks to merge them + * - old_prefix: "directory" to prefix to old file names (default "a") + * - new_prefix: "directory" to prefix to new file names (default "b") + * - pathspec: array of paths / patterns to constrain diff + * - max_size: maximum blob size to diff, above this treated as binary */ typedef struct { uint32_t flags; /**< defaults to GIT_DIFF_NORMAL */ @@ -65,6 +71,7 @@ typedef struct { char *old_prefix; /**< defaults to "a" */ char *new_prefix; /**< defaults to "b" */ git_strarray pathspec; /**< defaults to show all paths */ + git_off_t max_size; /**< defaults to 512Mb */ } git_diff_options; /** diff --git a/src/diff_output.c b/src/diff_output.c index f65d0057f..8873a4dc7 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -172,7 +172,7 @@ static void update_delta_is_binary(git_diff_delta *delta) if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 || (delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) delta->binary = 1; - else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 || + else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 && (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0) delta->binary = 0; /* otherwise leave delta->binary value untouched */ @@ -219,34 +219,46 @@ static int diff_delta_is_binary_by_attr(diff_delta_context *ctxt) return error; } -static int diff_delta_is_binary_by_content(diff_delta_context *ctxt) +static int diff_delta_is_binary_by_content( + diff_delta_context *ctxt, git_diff_file *file, git_map *map) { - git_diff_delta *delta = ctxt->delta; git_buf search; - if ((delta->old_file.flags & BINARY_DIFF_FLAGS) == 0) { - search.ptr = ctxt->old_data.data; - search.size = min(ctxt->old_data.len, 4000); + if ((file->flags & BINARY_DIFF_FLAGS) == 0) { + search.ptr = map->data; + search.size = min(map->len, 4000); if (git_buf_is_binary(&search)) - delta->old_file.flags |= GIT_DIFF_FILE_BINARY; + file->flags |= GIT_DIFF_FILE_BINARY; else - delta->old_file.flags |= GIT_DIFF_FILE_NOT_BINARY; + file->flags |= GIT_DIFF_FILE_NOT_BINARY; } - if ((delta->new_file.flags & BINARY_DIFF_FLAGS) == 0) { - search.ptr = ctxt->new_data.data; - search.size = min(ctxt->new_data.len, 4000); + update_delta_is_binary(ctxt->delta); - if (git_buf_is_binary(&search)) - delta->new_file.flags |= GIT_DIFF_FILE_BINARY; - else - delta->new_file.flags |= GIT_DIFF_FILE_NOT_BINARY; + return 0; +} + +static int diff_delta_is_binary_by_size( + diff_delta_context *ctxt, git_diff_file *file) +{ + git_off_t threshold = MAX_DIFF_FILESIZE; + + if ((file->flags & BINARY_DIFF_FLAGS) != 0) + return 0; + + if (ctxt && ctxt->opts) { + if (ctxt->opts->max_size < 0) + return 0; + + if (ctxt->opts->max_size > 0) + threshold = ctxt->opts->max_size; } - update_delta_is_binary(delta); + if (file->size > threshold) + file->flags |= GIT_DIFF_FILE_BINARY; - /* TODO: if value != NULL, implement diff drivers */ + update_delta_is_binary(ctxt->delta); return 0; } @@ -274,53 +286,56 @@ static void setup_xdiff_options( } static int get_blob_content( - git_repository *repo, + diff_delta_context *ctxt, git_diff_file *file, git_map *map, git_blob **blob) { int error; - git_odb *odb; - size_t len; - git_otype type; if (git_oid_iszero(&file->oid)) return 0; - /* peek at object header to avoid loading if too large */ - if ((error = git_repository_odb__weakptr(&odb, repo)) < 0 || - (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) - return error; + if (!file->size) { + git_odb *odb; + size_t len; + git_otype type; - assert(type == GIT_OBJ_BLOB); + /* peek at object header to avoid loading if too large */ + if ((error = git_repository_odb__weakptr(&odb, ctxt->repo)) < 0 || + (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + return error; - /* if blob is too large to diff, mark as binary */ - if (len > MAX_DIFF_FILESIZE) { - file->flags |= GIT_DIFF_FILE_BINARY; - return 0; + assert(type == GIT_OBJ_BLOB); + + file->size = len; } - if (!file->size) - file->size = len; + /* if blob is too large to diff, mark as binary */ + if ((error = diff_delta_is_binary_by_size(ctxt, file)) < 0) + return error; + if (ctxt->delta->binary == 1) + return 0; - if ((error = git_blob_lookup(blob, repo, &file->oid)) < 0) + if ((error = git_blob_lookup(blob, ctxt->repo, &file->oid)) < 0) return error; map->data = (void *)git_blob_rawcontent(*blob); map->len = git_blob_rawsize(*blob); - return 0; + return diff_delta_is_binary_by_content(ctxt, file, map); } static int get_workdir_content( - git_repository *repo, + diff_delta_context *ctxt, git_diff_file *file, git_map *map) { int error = 0; git_buf path = GIT_BUF_INIT; + const char *wd = git_repository_workdir(ctxt->repo); - if (git_buf_joinpath(&path, git_repository_workdir(repo), file->path) < 0) + if (git_buf_joinpath(&path, wd, file->path) < 0) return -1; if (S_ISLNK(file->mode)) { @@ -358,13 +373,12 @@ static int get_workdir_content( if (!file->size) file->size = git_futils_filesize(fd); - /* if file is too large to diff, mark as binary */ - if (file->size > MAX_DIFF_FILESIZE) { - file->flags |= GIT_DIFF_FILE_BINARY; + if ((error = diff_delta_is_binary_by_size(ctxt, file)) < 0 || + ctxt->delta->binary == 1) goto close_and_cleanup; - } - error = git_filters_load(&filters, repo, file->path, GIT_FILTER_TO_ODB); + error = git_filters_load( + &filters, ctxt->repo, file->path, GIT_FILTER_TO_ODB); if (error < 0) goto close_and_cleanup; @@ -400,6 +414,9 @@ close_and_cleanup: file->flags |= GIT_DIFF_FILE_VALID_OID; } + if (!error) + error = diff_delta_is_binary_by_content(ctxt, file, map); + cleanup: git_buf_free(&path); return error; @@ -480,7 +497,6 @@ static int diff_delta_prep(diff_delta_context *ctxt) static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; - git_repository *repo = ctxt->repo; git_diff_delta *delta = ctxt->delta; bool load_old = false, load_new = false, check_if_unmodified = false; @@ -519,31 +535,35 @@ static int diff_delta_load(diff_delta_context *ctxt) if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( - repo, &delta->old_file, &ctxt->old_data)) < 0) + ctxt, &delta->old_file, &ctxt->old_data)) < 0) goto cleanup; - - if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0) + if (delta->binary == 1) goto cleanup; } if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( - repo, &delta->new_file, &ctxt->new_data)) < 0) + ctxt, &delta->new_file, &ctxt->new_data)) < 0) goto cleanup; - - if ((delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) + if (delta->binary == 1) goto cleanup; } - if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR && - (error = get_blob_content( - repo, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) - goto cleanup; + if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) { + if ((error = get_blob_content( + ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) + goto cleanup; + if (delta->binary == 1) + goto cleanup; + } - if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR && - (error = get_blob_content( - repo, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) - goto cleanup; + if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) { + if ((error = get_blob_content( + ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) + goto cleanup; + if (delta->binary == 1) + goto cleanup; + } /* if we did not previously have the definitive oid, we may have * incorrect status and need to switch this to UNMODIFIED. @@ -559,12 +579,6 @@ static int diff_delta_load(diff_delta_context *ctxt) } cleanup: - /* if we have not already decided whether file is binary, - * check the first 4K for nul bytes to decide... - */ - if (!error && delta->binary == -1) - error = diff_delta_is_binary_by_content(ctxt); - ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ @@ -1069,9 +1083,13 @@ int git_diff_blobs( if ((error = diff_delta_prep(&ctxt)) < 0) goto cleanup; - if (delta.binary == -1 && - (error = diff_delta_is_binary_by_content(&ctxt)) < 0) - goto cleanup; + if (delta.binary == -1) { + if ((error = diff_delta_is_binary_by_content( + &ctxt, &delta.old_file, &ctxt.old_data)) < 0 || + (error = diff_delta_is_binary_by_content( + &ctxt, &delta.new_file, &ctxt.new_data)) < 0) + goto cleanup; + } ctxt.loaded = 1; ctxt.diffable = (delta.binary != 1 && delta.status != GIT_DELTA_UNMODIFIED); From c6ac28fdc57d04a9a5eba129cfd267c7adde43b3 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 10 Sep 2012 12:24:05 -0700 Subject: [PATCH 08/10] Reorg internal odb read header and object lookup Often `git_odb_read_header` will "fail" and have to read the entire object into memory instead of just the header. When this happens, the object is loaded and then disposed of immediately, which makes it difficult to efficiently use the header information to decide if the object should be loaded (since attempting to do so will often result in loading the object twice). This commit takes the existing code and reorganizes it to have two new functions: - `git_odb__read_header_or_object` which acts just like the old read header function except that it returns the object, too, if it was forced to load the whole thing. It then becomes the callers responsibility to free the `git_odb_object`. - `git_object__from_odb_object` which was extracted from the old `git_object_lookup` and creates a subclass of `git_object` from an existing `git_odb_object` (separating the ODB lookup from the `git_object` creation). This allows you to use the first header reading function efficiently without instantiating the `git_odb_object` twice. There is no net change to the behavior of any of the existing functions, but this allows internal code to tap into the ODB lookup and object creation to be more efficient. --- src/diff_output.c | 13 ++++++- src/object.c | 98 ++++++++++++++++++++++++++--------------------- src/object.h | 39 +++++++++++++++++++ src/odb.c | 24 ++++++++++-- src/odb.h | 8 ++++ src/repository.h | 25 +----------- 6 files changed, 135 insertions(+), 72 deletions(-) create mode 100644 src/object.h diff --git a/src/diff_output.c b/src/diff_output.c index 8873a4dc7..dbef7ddc1 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -292,6 +292,7 @@ static int get_blob_content( git_blob **blob) { int error; + git_odb_object *odb_obj = NULL; if (git_oid_iszero(&file->oid)) return 0; @@ -303,7 +304,8 @@ static int get_blob_content( /* peek at object header to avoid loading if too large */ if ((error = git_repository_odb__weakptr(&odb, ctxt->repo)) < 0 || - (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + (error = git_odb__read_header_or_object( + &odb_obj, &len, &type, odb, &file->oid)) < 0) return error; assert(type == GIT_OBJ_BLOB); @@ -317,7 +319,14 @@ static int get_blob_content( if (ctxt->delta->binary == 1) return 0; - if ((error = git_blob_lookup(blob, ctxt->repo, &file->oid)) < 0) + if (odb_obj != NULL) { + error = git_object__from_odb_object( + (git_object **)blob, ctxt->repo, odb_obj, GIT_OBJ_BLOB); + git_odb_object_free(odb_obj); + } else + error = git_blob_lookup(blob, ctxt->repo, &file->oid); + + if (error) return error; map->data = (void *)git_blob_rawcontent(*blob); diff --git a/src/object.c b/src/object.c index 5130d97ac..2e45eb86a 100644 --- a/src/object.c +++ b/src/object.c @@ -77,6 +77,58 @@ static int create_object(git_object **object_out, git_otype type) return 0; } +int git_object__from_odb_object( + git_object **object_out, + git_repository *repo, + git_odb_object *odb_obj, + git_otype type) +{ + int error; + git_object *object = NULL; + + if (type != GIT_OBJ_ANY && type != odb_obj->raw.type) { + giterr_set(GITERR_ODB, "The requested type does not match the type in the ODB"); + return GIT_ENOTFOUND; + } + + type = odb_obj->raw.type; + + if ((error = create_object(&object, type)) < 0) + return error; + + /* Initialize parent object */ + git_oid_cpy(&object->cached.oid, &odb_obj->cached.oid); + object->repo = repo; + + switch (type) { + case GIT_OBJ_COMMIT: + error = git_commit__parse((git_commit *)object, odb_obj); + break; + + case GIT_OBJ_TREE: + error = git_tree__parse((git_tree *)object, odb_obj); + break; + + case GIT_OBJ_TAG: + error = git_tag__parse((git_tag *)object, odb_obj); + break; + + case GIT_OBJ_BLOB: + error = git_blob__parse((git_blob *)object, odb_obj); + break; + + default: + break; + } + + if (error < 0) + git_object__free(object); + else + *object_out = git_cache_try_store(&repo->objects, object); + + return error; +} + int git_object_lookup_prefix( git_object **object_out, git_repository *repo, @@ -148,53 +200,11 @@ int git_object_lookup_prefix( if (error < 0) return error; - if (type != GIT_OBJ_ANY && type != odb_obj->raw.type) { - git_odb_object_free(odb_obj); - giterr_set(GITERR_ODB, "The given type does not match the type on the ODB"); - return GIT_ENOTFOUND; - } - - type = odb_obj->raw.type; - - if (create_object(&object, type) < 0) { - git_odb_object_free(odb_obj); - return -1; - } - - /* Initialize parent object */ - git_oid_cpy(&object->cached.oid, &odb_obj->cached.oid); - object->repo = repo; - - switch (type) { - case GIT_OBJ_COMMIT: - error = git_commit__parse((git_commit *)object, odb_obj); - break; - - case GIT_OBJ_TREE: - error = git_tree__parse((git_tree *)object, odb_obj); - break; - - case GIT_OBJ_TAG: - error = git_tag__parse((git_tag *)object, odb_obj); - break; - - case GIT_OBJ_BLOB: - error = git_blob__parse((git_blob *)object, odb_obj); - break; - - default: - break; - } + error = git_object__from_odb_object(object_out, repo, odb_obj, type); git_odb_object_free(odb_obj); - if (error < 0) { - git_object__free(object); - return -1; - } - - *object_out = git_cache_try_store(&repo->objects, object); - return 0; + return error; } int git_object_lookup(git_object **object_out, git_repository *repo, const git_oid *id, git_otype type) { diff --git a/src/object.h b/src/object.h new file mode 100644 index 000000000..bc12aad04 --- /dev/null +++ b/src/object.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2009-2012 the libgit2 contributors + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#ifndef INCLUDE_object_h__ +#define INCLUDE_object_h__ + +/** Base git object for inheritance */ +struct git_object { + git_cached_obj cached; + git_repository *repo; + git_otype type; +}; + +/* fully free the object; internal method, DO NOT EXPORT */ +void git_object__free(void *object); + +GIT_INLINE(int) git_object__dup(git_object **dest, git_object *source) +{ + git_cached_obj_incref(source); + *dest = source; + return 0; +} + +int git_object__from_odb_object( + git_object **object_out, + git_repository *repo, + git_odb_object *odb_obj, + git_otype type); + +int git_object__resolve_to_type(git_object **obj, git_otype type); + +int git_oid__parse(git_oid *oid, const char **buffer_out, const char *buffer_end, const char *header); + +void git_oid__writebuf(git_buf *buf, const char *header, const git_oid *oid); + +#endif diff --git a/src/odb.c b/src/odb.c index 83c7a80fc..0e03e40ee 100644 --- a/src/odb.c +++ b/src/odb.c @@ -513,20 +513,37 @@ int git_odb_exists(git_odb *db, const git_oid *id) } int git_odb_read_header(size_t *len_p, git_otype *type_p, git_odb *db, const git_oid *id) +{ + int error; + git_odb_object *object; + + error = git_odb__read_header_or_object(&object, len_p, type_p, db, id); + + if (object) + git_odb_object_free(object); + + return error; +} + +int git_odb__read_header_or_object( + git_odb_object **out, size_t *len_p, git_otype *type_p, + git_odb *db, const git_oid *id) { unsigned int i; int error = GIT_ENOTFOUND; git_odb_object *object; - assert(db && id); + assert(db && id && out && len_p && type_p); if ((object = git_cache_get(&db->cache, id)) != NULL) { *len_p = object->raw.len; *type_p = object->raw.type; - git_odb_object_free(object); + *out = object; return 0; } + *out = NULL; + for (i = 0; i < db->backends.length && error < 0; ++i) { backend_internal *internal = git_vector_get(&db->backends, i); git_odb_backend *b = internal->backend; @@ -547,7 +564,8 @@ int git_odb_read_header(size_t *len_p, git_otype *type_p, git_odb *db, const git *len_p = object->raw.len; *type_p = object->raw.type; - git_odb_object_free(object); + *out = object; + return 0; } diff --git a/src/odb.h b/src/odb.h index 696e12943..e9e33dde8 100644 --- a/src/odb.h +++ b/src/odb.h @@ -84,4 +84,12 @@ int git_odb__error_notfound(const char *message, const git_oid *oid); */ int git_odb__error_ambiguous(const char *message); +/* + * Attempt to read object header or just return whole object if it could + * not be read. + */ +int git_odb__read_header_or_object( + git_odb_object **out, size_t *len_p, git_otype *type_p, + git_odb *db, const git_oid *id); + #endif diff --git a/src/repository.h b/src/repository.h index 4695edf3a..82988ba0a 100644 --- a/src/repository.h +++ b/src/repository.h @@ -18,6 +18,7 @@ #include "refs.h" #include "buffer.h" #include "odb.h" +#include "object.h" #include "attr.h" #include "strmap.h" @@ -75,13 +76,6 @@ enum { GIT_REPOSITORY_INIT__IS_REINIT = (1u << 18), }; -/** Base git object for inheritance */ -struct git_object { - git_cached_obj cached; - git_repository *repo; - git_otype type; -}; - /** Internal structure for repository object */ struct git_repository { git_odb *_odb; @@ -102,21 +96,6 @@ struct git_repository { git_cvar_value cvar_cache[GIT_CVAR_CACHE_MAX]; }; -/* fully free the object; internal method, DO NOT EXPORT */ -void git_object__free(void *object); - -GIT_INLINE(int) git_object__dup(git_object **dest, git_object *source) -{ - git_cached_obj_incref(source); - *dest = source; - return 0; -} - -int git_object__resolve_to_type(git_object **obj, git_otype type); - -int git_oid__parse(git_oid *oid, const char **buffer_out, const char *buffer_end, const char *header); -void git_oid__writebuf(git_buf *buf, const char *header, const git_oid *oid); - GIT_INLINE(git_attr_cache *) git_repository_attr_cache(git_repository *repo) { return &repo->attrcache; @@ -136,7 +115,7 @@ int git_repository_odb__weakptr(git_odb **out, git_repository *repo); int git_repository_index__weakptr(git_index **out, git_repository *repo); /* - * CVAR cache + * CVAR cache * * Efficient access to the most used config variables of a repository. * The cache is cleared everytime the config backend is replaced. From 1f35e89dbf6e0be8952cc4324a45fd600be5ca05 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Tue, 11 Sep 2012 12:03:33 -0700 Subject: [PATCH 09/10] Fix diff binary file detection In the process of adding tests for the max file size threshold (which treats files over a certain size as binary) there seem to be a number of problems in the new code with detecting binaries. This should fix those up, as well as add a test for the file size threshold stuff. Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I finally found a legitimate situation where it would be returned. --- include/git2/diff.h | 19 +++++++- src/diff_output.c | 56 +++++++++++++++------- src/fileops.c | 32 ++++--------- tests-clar/diff/diff_helpers.c | 3 +- tests-clar/diff/diffiter.c | 85 ++++++++++++++++++++++++++++++++++ 5 files changed, 153 insertions(+), 42 deletions(-) diff --git a/include/git2/diff.h b/include/git2/diff.h index 4b4591a9e..05825c50d 100644 --- a/include/git2/diff.h +++ b/include/git2/diff.h @@ -79,13 +79,28 @@ typedef struct { */ typedef struct git_diff_list git_diff_list; +/** + * Flags that can be set for the file on side of a diff. + * + * Most of the flags are just for internal consumption by libgit2, + * but some of them may be interesting to external users. They are: + * + * - VALID_OID - the `oid` value is computed and correct + * - FREE_PATH - the `path` string is separated allocated memory + * - BINARY - this file should be considered binary data + * - NOT_BINARY - this file should be considered text data + * - FREE_DATA - the internal file data is kept in allocated memory + * - UNMAP_DATA - the internal file data is kept in mmap'ed memory + * - NO_DATA - this side of the diff should not be loaded + */ enum { GIT_DIFF_FILE_VALID_OID = (1 << 0), GIT_DIFF_FILE_FREE_PATH = (1 << 1), GIT_DIFF_FILE_BINARY = (1 << 2), GIT_DIFF_FILE_NOT_BINARY = (1 << 3), GIT_DIFF_FILE_FREE_DATA = (1 << 4), - GIT_DIFF_FILE_UNMAP_DATA = (1 << 5) + GIT_DIFF_FILE_UNMAP_DATA = (1 << 5), + GIT_DIFF_FILE_NO_DATA = (1 << 6), }; /** @@ -176,7 +191,7 @@ enum { GIT_DIFF_LINE_CONTEXT = ' ', GIT_DIFF_LINE_ADDITION = '+', GIT_DIFF_LINE_DELETION = '-', - GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< DEPRECATED - will not be returned */ + GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< Removed line w/o LF & added one with */ GIT_DIFF_LINE_DEL_EOFNL = '\0', /**< LF was removed at end of file */ /* The following values will only be sent to a `git_diff_data_fn` when diff --git a/src/diff_output.c b/src/diff_output.c index dbef7ddc1..ea40c3355 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta) if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 || (delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) delta->binary = 1; - else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 && - (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0) + +#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA) + + else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 && + (delta->new_file.flags & NOT_BINARY_FLAGS) != 0) delta->binary = 0; + /* otherwise leave delta->binary value untouched */ } @@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; git_diff_delta *delta = ctxt->delta; - bool load_old = false, load_new = false, check_if_unmodified = false; + bool check_if_unmodified = false; if (ctxt->loaded || !ctxt->delta) return 0; @@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; switch (delta->status) { - case GIT_DELTA_ADDED: load_new = true; break; - case GIT_DELTA_DELETED: load_old = true; break; - case GIT_DELTA_MODIFIED: load_new = load_old = true; break; - default: break; + case GIT_DELTA_ADDED: + delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA; + break; + case GIT_DELTA_DELETED: + delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; + break; + case GIT_DELTA_MODIFIED: + break; + default: + delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; + delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA; + break; } +#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID) + check_if_unmodified = - (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) || - (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0); + (delta->old_file.flags & CHECK_UNMODIFIED) == 0 && + (delta->new_file.flags & CHECK_UNMODIFIED) == 0; /* Always try to load workdir content first, since it may need to be * filtered (and hence use 2x memory) and we want to minimize the max * memory footprint during diff. */ - if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { + if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->old_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( ctxt, &delta->old_file, &ctxt->old_data)) < 0) goto cleanup; @@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; } - if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { + if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->new_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( ctxt, &delta->new_file, &ctxt->new_data)) < 0) goto cleanup; @@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; } - if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) { + if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->old_src != GIT_ITERATOR_WORKDIR) { if ((error = get_blob_content( ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) goto cleanup; @@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; } - if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) { + if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->new_src != GIT_ITERATOR_WORKDIR) { if ((error = get_blob_content( ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) goto cleanup; @@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt) } cleanup: + /* last change to update binary flag */ + if (delta->binary == -1) + update_delta_is_binary(delta); + ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ @@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len) } if (len == 3 && !ctxt->cb_error) { - /* This should only happen if we are adding a line that does not - * have a newline at the end and the old code did. In that case, - * we have a ADD with a DEL_EOFNL as a pair. + /* If we have a '+' and a third buf, then we have added a line + * without a newline and the old code had one, so DEL_EOFNL. + * If we have a '-' and a third buf, then we have removed a line + * with out a newline but added a blank line, so ADD_EOFNL. */ char origin = (*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL : @@ -1036,6 +1059,7 @@ static void set_data_from_blob( } else { map->data = ""; file->size = map->len = 0; + file->flags |= GIT_DIFF_FILE_NO_DATA; } } diff --git a/src/fileops.c b/src/fileops.c index d4def1a9a..cbe3d4782 100644 --- a/src/fileops.c +++ b/src/fileops.c @@ -115,40 +115,26 @@ mode_t git_futils_canonical_mode(mode_t raw_mode) return 0; } -#define MAX_READ_STALLS 10 - int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len) { - int stalls = MAX_READ_STALLS; + ssize_t read_size; git_buf_clear(buf); if (git_buf_grow(buf, len + 1) < 0) return -1; - buf->ptr[len] = '\0'; + /* p_read loops internally to read len bytes */ + read_size = p_read(fd, buf->ptr, len); - while (len > 0) { - ssize_t read_size = p_read(fd, buf->ptr + buf->size, len); - - if (read_size < 0) { - giterr_set(GITERR_OS, "Failed to read descriptor"); - return -1; - } - - if (read_size == 0) { - stalls--; - - if (!stalls) { - giterr_set(GITERR_OS, "Too many stalls reading descriptor"); - return -1; - } - } - - len -= read_size; - buf->size += read_size; + if (read_size < 0) { + giterr_set(GITERR_OS, "Failed to read descriptor"); + return -1; } + buf->ptr[read_size] = '\0'; + buf->size = read_size; + return 0; } diff --git a/tests-clar/diff/diff_helpers.c b/tests-clar/diff/diff_helpers.c index ef59b686f..767b34392 100644 --- a/tests-clar/diff/diff_helpers.c +++ b/tests-clar/diff/diff_helpers.c @@ -89,7 +89,8 @@ int diff_line_fn( e->line_adds++; break; case GIT_DIFF_LINE_ADD_EOFNL: - assert(0); + /* technically not a line add, but we'll count it as such */ + e->line_adds++; break; case GIT_DIFF_LINE_DELETION: e->line_dels++; diff --git a/tests-clar/diff/diffiter.c b/tests-clar/diff/diffiter.c index 56c254741..23071e48b 100644 --- a/tests-clar/diff/diffiter.c +++ b/tests-clar/diff/diffiter.c @@ -114,3 +114,88 @@ void test_diff_diffiter__iterate_files_and_hunks(void) git_diff_iterator_free(iter); git_diff_list_free(diff); } + +void test_diff_diffiter__max_size_threshold(void) +{ + git_repository *repo = cl_git_sandbox_init("status"); + git_diff_options opts = {0}; + git_diff_list *diff = NULL; + git_diff_iterator *iter; + git_diff_delta *delta; + int error, file_count = 0, binary_count = 0, hunk_count = 0; + + opts.context_lines = 3; + opts.interhunk_lines = 1; + opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED; + + cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff)); + cl_git_pass(git_diff_iterator_new(&iter, diff)); + + while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) { + cl_assert_equal_i(0, error); + cl_assert(delta); + + file_count++; + + hunk_count += git_diff_iterator_num_hunks_in_file(iter); + + assert(delta->binary == 0 || delta->binary == 1); + + binary_count += delta->binary; + } + + cl_assert_equal_i(GIT_ITEROVER, error); + cl_assert(delta == NULL); + + cl_assert_equal_i(13, file_count); + cl_assert_equal_i(0, binary_count); + cl_assert_equal_i(8, hunk_count); + + git_diff_iterator_free(iter); + git_diff_list_free(diff); + + /* try again with low file size threshold */ + + file_count = 0; + binary_count = 0; + hunk_count = 0; + + opts.context_lines = 3; + opts.interhunk_lines = 1; + opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED; + opts.max_size = 50; /* treat anything over 50 bytes as binary! */ + + cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff)); + cl_git_pass(git_diff_iterator_new(&iter, diff)); + + while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) { + cl_assert_equal_i(0, error); + cl_assert(delta); + + file_count++; + + hunk_count += git_diff_iterator_num_hunks_in_file(iter); + + assert(delta->binary == 0 || delta->binary == 1); + + binary_count += delta->binary; + } + + cl_assert_equal_i(GIT_ITEROVER, error); + cl_assert(delta == NULL); + + cl_assert_equal_i(13, file_count); + + /* Three files are over the 50 byte threshold: + * - staged_changes_file_deleted + * - staged_changes_modified_file + * - staged_new_file_modified_file + */ + cl_assert_equal_i(3, binary_count); + + cl_assert_equal_i(5, hunk_count); + + git_diff_iterator_free(iter); + git_diff_list_free(diff); + +} From c859184bb459d9801a394dc44f5b0b0e55453263 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 11 Sep 2012 23:05:24 +0200 Subject: [PATCH 10/10] Properly handle p_reads --- src/amiga/map.c | 11 ++++------- src/blob.c | 19 +++++++++---------- src/filebuf.c | 9 +++++++-- src/fileops.c | 2 +- src/odb.c | 20 +++++++++++--------- 5 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/amiga/map.c b/src/amiga/map.c index 2fb065c8b..c601de724 100755 --- a/src/amiga/map.c +++ b/src/amiga/map.c @@ -24,18 +24,15 @@ int p_mmap(git_map *out, size_t len, int prot, int flags, int fd, git_off_t offs return -1; } - if((out->data = malloc(len))) { - p_lseek(fd, offset, SEEK_SET); - p_read(fd, out->data, len); - } + out->data = malloc(len); + GITERR_CHECK_ALLOC(out->data); - if (!out->data || (out->data == MAP_FAILED)) { - giterr_set(GITERR_OS, "Failed to mmap. Could not write data"); + if (p_lseek(fd, offset, SEEK_SET) < 0 || p_read(fd, out->data, len) != len) + giterr_set(GITERR_OS, "mmap emulation failed"); return -1; } out->len = len; - return 0; } diff --git a/src/blob.c b/src/blob.c index 699adec6b..6267ae7b2 100644 --- a/src/blob.c +++ b/src/blob.c @@ -68,6 +68,7 @@ static int write_file_stream( int fd, error; char buffer[4096]; git_odb_stream *stream = NULL; + ssize_t read_len, written = 0; if ((error = git_odb_open_wstream( &stream, odb, (size_t)file_size, GIT_OBJ_BLOB)) < 0) @@ -78,20 +79,18 @@ static int write_file_stream( return -1; } - while (!error && file_size > 0) { - ssize_t read_len = p_read(fd, buffer, sizeof(buffer)); - - if (read_len < 0) { - giterr_set( - GITERR_OS, "Failed to create blob. Can't read whole file"); - error = -1; - } - else if (!(error = stream->write(stream, buffer, read_len))) - file_size -= read_len; + while (!error && (read_len = p_read(fd, buffer, sizeof(buffer))) > 0) { + error = stream->write(stream, buffer, read_len); + written += read_len; } p_close(fd); + if (written != file_size || read_len < 0) { + giterr_set(GITERR_OS, "Failed to read file into stream"); + error = -1; + } + if (!error) error = stream->finalize_write(oid, stream); diff --git a/src/filebuf.c b/src/filebuf.c index cfc8528e6..b9b908c8d 100644 --- a/src/filebuf.c +++ b/src/filebuf.c @@ -73,7 +73,7 @@ static int lock_file(git_filebuf *file, int flags) if ((flags & GIT_FILEBUF_APPEND) && git_path_exists(file->path_original) == true) { git_file source; char buffer[2048]; - size_t read_bytes; + ssize_t read_bytes; source = p_open(file->path_original, O_RDONLY); if (source < 0) { @@ -83,13 +83,18 @@ static int lock_file(git_filebuf *file, int flags) return -1; } - while ((read_bytes = p_read(source, buffer, 2048)) > 0) { + while ((read_bytes = p_read(source, buffer, sizeof(buffer))) > 0) { p_write(file->fd, buffer, read_bytes); if (file->digest) git_hash_update(file->digest, buffer, read_bytes); } p_close(source); + + if (read_bytes < 0) { + giterr_set(GITERR_OS, "Failed to read file '%s'", file->path_original); + return -1; + } } return 0; diff --git a/src/fileops.c b/src/fileops.c index cbe3d4782..8ccf063d5 100644 --- a/src/fileops.c +++ b/src/fileops.c @@ -127,7 +127,7 @@ int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len) /* p_read loops internally to read len bytes */ read_size = p_read(fd, buf->ptr, len); - if (read_size < 0) { + if (read_size != (ssize_t)len) { giterr_set(GITERR_OS, "Failed to read descriptor"); return -1; } diff --git a/src/odb.c b/src/odb.c index 0e03e40ee..0d3d809f7 100644 --- a/src/odb.c +++ b/src/odb.c @@ -115,6 +115,7 @@ int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type) int hdr_len; char hdr[64], buffer[2048]; git_hash_ctx *ctx; + ssize_t read_len; hdr_len = format_object_header(hdr, sizeof(hdr), size, type); @@ -123,19 +124,20 @@ int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type) git_hash_update(ctx, hdr, hdr_len); - while (size > 0) { - ssize_t read_len = p_read(fd, buffer, sizeof(buffer)); - - if (read_len < 0) { - git_hash_free_ctx(ctx); - giterr_set(GITERR_OS, "Error reading file"); - return -1; - } - + while (size > 0 && (read_len = p_read(fd, buffer, sizeof(buffer))) > 0) { git_hash_update(ctx, buffer, read_len); size -= read_len; } + /* If p_read returned an error code, the read obviously failed. + * If size is not zero, the file was truncated after we originally + * stat'd it, so we consider this a read failure too */ + if (read_len < 0 || size > 0) { + git_hash_free_ctx(ctx); + giterr_set(GITERR_OS, "Error reading file for hashing"); + return -1; + } + git_hash_final(out, ctx); git_hash_free_ctx(ctx);