mirror of
https://git.proxmox.com/git/libgit2
synced 2025-05-08 19:51:31 +00:00
Fix diff binary file detection
In the process of adding tests for the max file size threshold (which treats files over a certain size as binary) there seem to be a number of problems in the new code with detecting binaries. This should fix those up, as well as add a test for the file size threshold stuff. Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I finally found a legitimate situation where it would be returned.
This commit is contained in:
parent
c6ac28fdc5
commit
1f35e89dbf
@ -79,13 +79,28 @@ typedef struct {
|
||||
*/
|
||||
typedef struct git_diff_list git_diff_list;
|
||||
|
||||
/**
|
||||
* Flags that can be set for the file on side of a diff.
|
||||
*
|
||||
* Most of the flags are just for internal consumption by libgit2,
|
||||
* but some of them may be interesting to external users. They are:
|
||||
*
|
||||
* - VALID_OID - the `oid` value is computed and correct
|
||||
* - FREE_PATH - the `path` string is separated allocated memory
|
||||
* - BINARY - this file should be considered binary data
|
||||
* - NOT_BINARY - this file should be considered text data
|
||||
* - FREE_DATA - the internal file data is kept in allocated memory
|
||||
* - UNMAP_DATA - the internal file data is kept in mmap'ed memory
|
||||
* - NO_DATA - this side of the diff should not be loaded
|
||||
*/
|
||||
enum {
|
||||
GIT_DIFF_FILE_VALID_OID = (1 << 0),
|
||||
GIT_DIFF_FILE_FREE_PATH = (1 << 1),
|
||||
GIT_DIFF_FILE_BINARY = (1 << 2),
|
||||
GIT_DIFF_FILE_NOT_BINARY = (1 << 3),
|
||||
GIT_DIFF_FILE_FREE_DATA = (1 << 4),
|
||||
GIT_DIFF_FILE_UNMAP_DATA = (1 << 5)
|
||||
GIT_DIFF_FILE_UNMAP_DATA = (1 << 5),
|
||||
GIT_DIFF_FILE_NO_DATA = (1 << 6),
|
||||
};
|
||||
|
||||
/**
|
||||
@ -176,7 +191,7 @@ enum {
|
||||
GIT_DIFF_LINE_CONTEXT = ' ',
|
||||
GIT_DIFF_LINE_ADDITION = '+',
|
||||
GIT_DIFF_LINE_DELETION = '-',
|
||||
GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< DEPRECATED - will not be returned */
|
||||
GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< Removed line w/o LF & added one with */
|
||||
GIT_DIFF_LINE_DEL_EOFNL = '\0', /**< LF was removed at end of file */
|
||||
|
||||
/* The following values will only be sent to a `git_diff_data_fn` when
|
||||
|
@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta)
|
||||
if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 ||
|
||||
(delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0)
|
||||
delta->binary = 1;
|
||||
else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 &&
|
||||
(delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0)
|
||||
|
||||
#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA)
|
||||
|
||||
else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 &&
|
||||
(delta->new_file.flags & NOT_BINARY_FLAGS) != 0)
|
||||
delta->binary = 0;
|
||||
|
||||
/* otherwise leave delta->binary value untouched */
|
||||
}
|
||||
|
||||
@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt)
|
||||
{
|
||||
int error = 0;
|
||||
git_diff_delta *delta = ctxt->delta;
|
||||
bool load_old = false, load_new = false, check_if_unmodified = false;
|
||||
bool check_if_unmodified = false;
|
||||
|
||||
if (ctxt->loaded || !ctxt->delta)
|
||||
return 0;
|
||||
@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt)
|
||||
goto cleanup;
|
||||
|
||||
switch (delta->status) {
|
||||
case GIT_DELTA_ADDED: load_new = true; break;
|
||||
case GIT_DELTA_DELETED: load_old = true; break;
|
||||
case GIT_DELTA_MODIFIED: load_new = load_old = true; break;
|
||||
default: break;
|
||||
case GIT_DELTA_ADDED:
|
||||
delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
|
||||
break;
|
||||
case GIT_DELTA_DELETED:
|
||||
delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
|
||||
break;
|
||||
case GIT_DELTA_MODIFIED:
|
||||
break;
|
||||
default:
|
||||
delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
|
||||
delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
|
||||
break;
|
||||
}
|
||||
|
||||
#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID)
|
||||
|
||||
check_if_unmodified =
|
||||
(load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) ||
|
||||
(load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0);
|
||||
(delta->old_file.flags & CHECK_UNMODIFIED) == 0 &&
|
||||
(delta->new_file.flags & CHECK_UNMODIFIED) == 0;
|
||||
|
||||
/* Always try to load workdir content first, since it may need to be
|
||||
* filtered (and hence use 2x memory) and we want to minimize the max
|
||||
* memory footprint during diff.
|
||||
*/
|
||||
|
||||
if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) {
|
||||
if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
|
||||
ctxt->old_src == GIT_ITERATOR_WORKDIR) {
|
||||
if ((error = get_workdir_content(
|
||||
ctxt, &delta->old_file, &ctxt->old_data)) < 0)
|
||||
goto cleanup;
|
||||
@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) {
|
||||
if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
|
||||
ctxt->new_src == GIT_ITERATOR_WORKDIR) {
|
||||
if ((error = get_workdir_content(
|
||||
ctxt, &delta->new_file, &ctxt->new_data)) < 0)
|
||||
goto cleanup;
|
||||
@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) {
|
||||
if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
|
||||
ctxt->old_src != GIT_ITERATOR_WORKDIR) {
|
||||
if ((error = get_blob_content(
|
||||
ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0)
|
||||
goto cleanup;
|
||||
@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) {
|
||||
if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
|
||||
ctxt->new_src != GIT_ITERATOR_WORKDIR) {
|
||||
if ((error = get_blob_content(
|
||||
ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0)
|
||||
goto cleanup;
|
||||
@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt)
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* last change to update binary flag */
|
||||
if (delta->binary == -1)
|
||||
update_delta_is_binary(delta);
|
||||
|
||||
ctxt->loaded = !error;
|
||||
|
||||
/* flag if we would want to diff the contents of these files */
|
||||
@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len)
|
||||
}
|
||||
|
||||
if (len == 3 && !ctxt->cb_error) {
|
||||
/* This should only happen if we are adding a line that does not
|
||||
* have a newline at the end and the old code did. In that case,
|
||||
* we have a ADD with a DEL_EOFNL as a pair.
|
||||
/* If we have a '+' and a third buf, then we have added a line
|
||||
* without a newline and the old code had one, so DEL_EOFNL.
|
||||
* If we have a '-' and a third buf, then we have removed a line
|
||||
* with out a newline but added a blank line, so ADD_EOFNL.
|
||||
*/
|
||||
char origin =
|
||||
(*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL :
|
||||
@ -1036,6 +1059,7 @@ static void set_data_from_blob(
|
||||
} else {
|
||||
map->data = "";
|
||||
file->size = map->len = 0;
|
||||
file->flags |= GIT_DIFF_FILE_NO_DATA;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -115,40 +115,26 @@ mode_t git_futils_canonical_mode(mode_t raw_mode)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define MAX_READ_STALLS 10
|
||||
|
||||
int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len)
|
||||
{
|
||||
int stalls = MAX_READ_STALLS;
|
||||
ssize_t read_size;
|
||||
|
||||
git_buf_clear(buf);
|
||||
|
||||
if (git_buf_grow(buf, len + 1) < 0)
|
||||
return -1;
|
||||
|
||||
buf->ptr[len] = '\0';
|
||||
/* p_read loops internally to read len bytes */
|
||||
read_size = p_read(fd, buf->ptr, len);
|
||||
|
||||
while (len > 0) {
|
||||
ssize_t read_size = p_read(fd, buf->ptr + buf->size, len);
|
||||
|
||||
if (read_size < 0) {
|
||||
giterr_set(GITERR_OS, "Failed to read descriptor");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (read_size == 0) {
|
||||
stalls--;
|
||||
|
||||
if (!stalls) {
|
||||
giterr_set(GITERR_OS, "Too many stalls reading descriptor");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
len -= read_size;
|
||||
buf->size += read_size;
|
||||
if (read_size < 0) {
|
||||
giterr_set(GITERR_OS, "Failed to read descriptor");
|
||||
return -1;
|
||||
}
|
||||
|
||||
buf->ptr[read_size] = '\0';
|
||||
buf->size = read_size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -89,7 +89,8 @@ int diff_line_fn(
|
||||
e->line_adds++;
|
||||
break;
|
||||
case GIT_DIFF_LINE_ADD_EOFNL:
|
||||
assert(0);
|
||||
/* technically not a line add, but we'll count it as such */
|
||||
e->line_adds++;
|
||||
break;
|
||||
case GIT_DIFF_LINE_DELETION:
|
||||
e->line_dels++;
|
||||
|
@ -114,3 +114,88 @@ void test_diff_diffiter__iterate_files_and_hunks(void)
|
||||
git_diff_iterator_free(iter);
|
||||
git_diff_list_free(diff);
|
||||
}
|
||||
|
||||
void test_diff_diffiter__max_size_threshold(void)
|
||||
{
|
||||
git_repository *repo = cl_git_sandbox_init("status");
|
||||
git_diff_options opts = {0};
|
||||
git_diff_list *diff = NULL;
|
||||
git_diff_iterator *iter;
|
||||
git_diff_delta *delta;
|
||||
int error, file_count = 0, binary_count = 0, hunk_count = 0;
|
||||
|
||||
opts.context_lines = 3;
|
||||
opts.interhunk_lines = 1;
|
||||
opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
|
||||
|
||||
cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
|
||||
cl_git_pass(git_diff_iterator_new(&iter, diff));
|
||||
|
||||
while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
|
||||
cl_assert_equal_i(0, error);
|
||||
cl_assert(delta);
|
||||
|
||||
file_count++;
|
||||
|
||||
hunk_count += git_diff_iterator_num_hunks_in_file(iter);
|
||||
|
||||
assert(delta->binary == 0 || delta->binary == 1);
|
||||
|
||||
binary_count += delta->binary;
|
||||
}
|
||||
|
||||
cl_assert_equal_i(GIT_ITEROVER, error);
|
||||
cl_assert(delta == NULL);
|
||||
|
||||
cl_assert_equal_i(13, file_count);
|
||||
cl_assert_equal_i(0, binary_count);
|
||||
cl_assert_equal_i(8, hunk_count);
|
||||
|
||||
git_diff_iterator_free(iter);
|
||||
git_diff_list_free(diff);
|
||||
|
||||
/* try again with low file size threshold */
|
||||
|
||||
file_count = 0;
|
||||
binary_count = 0;
|
||||
hunk_count = 0;
|
||||
|
||||
opts.context_lines = 3;
|
||||
opts.interhunk_lines = 1;
|
||||
opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
|
||||
opts.max_size = 50; /* treat anything over 50 bytes as binary! */
|
||||
|
||||
cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
|
||||
cl_git_pass(git_diff_iterator_new(&iter, diff));
|
||||
|
||||
while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
|
||||
cl_assert_equal_i(0, error);
|
||||
cl_assert(delta);
|
||||
|
||||
file_count++;
|
||||
|
||||
hunk_count += git_diff_iterator_num_hunks_in_file(iter);
|
||||
|
||||
assert(delta->binary == 0 || delta->binary == 1);
|
||||
|
||||
binary_count += delta->binary;
|
||||
}
|
||||
|
||||
cl_assert_equal_i(GIT_ITEROVER, error);
|
||||
cl_assert(delta == NULL);
|
||||
|
||||
cl_assert_equal_i(13, file_count);
|
||||
|
||||
/* Three files are over the 50 byte threshold:
|
||||
* - staged_changes_file_deleted
|
||||
* - staged_changes_modified_file
|
||||
* - staged_new_file_modified_file
|
||||
*/
|
||||
cl_assert_equal_i(3, binary_count);
|
||||
|
||||
cl_assert_equal_i(5, hunk_count);
|
||||
|
||||
git_diff_iterator_free(iter);
|
||||
git_diff_list_free(diff);
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user