From d0f00de4d8e2173a3132f0024e74f5049638ce2f Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Fri, 16 May 2014 11:08:19 -0700 Subject: [PATCH 1/2] Increase binary detection len to 8k --- src/blob.c | 3 ++- src/diff_driver.c | 6 +++++- src/filter.h | 4 ++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/blob.c b/src/blob.c index ab7dec67f..30d5b705b 100644 --- a/src/blob.c +++ b/src/blob.c @@ -334,7 +334,8 @@ int git_blob_is_binary(const git_blob *blob) assert(blob); content.ptr = blob->odb_object->buffer; - content.size = min(blob->odb_object->cached.size, 4000); + content.size = + min(blob->odb_object->cached.size, GIT_FILTER_BYTES_TO_CHECK_NUL); content.asize = 0; return git_buf_text_is_binary(&content); diff --git a/src/diff_driver.c b/src/diff_driver.c index dc8e79e25..c3c5f365b 100644 --- a/src/diff_driver.c +++ b/src/diff_driver.c @@ -397,7 +397,11 @@ void git_diff_driver_update_options( int git_diff_driver_content_is_binary( git_diff_driver *driver, const char *content, size_t content_len) { - const git_buf search = { (char *)content, 0, min(content_len, 4000) }; + git_buf search; + + search.ptr = (char *)content; + search.size = min(content_len, GIT_FILTER_BYTES_TO_CHECK_NUL); + search.asize = 0; GIT_UNUSED(driver); diff --git a/src/filter.h b/src/filter.h index d0ace0f9a..5a366108b 100644 --- a/src/filter.h +++ b/src/filter.h @@ -10,6 +10,10 @@ #include "common.h" #include "git2/filter.h" +/* Amount of file to examine for NUL byte when checking binary-ness */ +#define GIT_FILTER_BYTES_TO_CHECK_NUL 8000 + +/* Possible CRLF values */ typedef enum { GIT_CRLF_GUESS = -1, GIT_CRLF_BINARY = 0, From 8af4966db15ed35832235627e2d01068d4734dea Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Fri, 16 May 2014 16:30:58 -0700 Subject: [PATCH 2/2] Git binary check compat tests A variety of data patterns for diffs verified to match the behavior of binary detection with Git on the command line. 
---
 tests/diff/workdir.c | 131 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)

diff --git a/tests/diff/workdir.c b/tests/diff/workdir.c
index a6d48abc6..f82bb00e8 100644
--- a/tests/diff/workdir.c
+++ b/tests/diff/workdir.c
@@ -1580,3 +1580,134 @@ void test_diff_workdir__can_update_index(void)
 
 	git_diff_free(diff);
 }
+
+/*
+ * ASCII filler strings for the binary detection test.  The number in each
+ * name is how many digit characters the string holds; a trailing "Z" means
+ * one explicit NUL byte follows them (so STR999Z is 1000 bytes in total).
+ */
+#define STR7    "0123456"
+#define STR8    "01234567"
+#define STR40   STR8 STR8 STR8 STR8 STR8
+#define STR200  STR40 STR40 STR40 STR40 STR40
+#define STR999Z STR200 STR200 STR200 STR200 STR40 STR40 STR40 STR40 \
+	STR8 STR8 STR8 STR8 STR7 "\0"
+#define STR1000 STR200 STR200 STR200 STR200 STR200
+#define STR3999Z STR1000 STR1000 STR1000 STR999Z
+#define STR4000 STR1000 STR1000 STR1000 STR1000
+
+/*
+ * Check that the GIT_DIFF_FLAG_BINARY bit of delta `idx` in `diff` agrees
+ * with `is_binary`.  The delta is read from the patch generated for that
+ * index rather than straight off the diff.
+ */
+static void assert_delta_binary(git_diff *diff, size_t idx, int is_binary)
+{
+	git_patch *patch;
+	const git_diff_delta *delta;
+
+	cl_git_pass(git_patch_from_diff(&patch, diff, idx));
+	delta = git_patch_get_delta(patch);
+	cl_assert_equal_b((delta->flags & GIT_DIFF_FLAG_BINARY), is_binary);
+	git_patch_free(patch);
+}
+
+/*
+ * Diff ten files ("0".."9") between index and workdir, first with the sample
+ * data in the workdir and plain ASCII in the index, then the other way
+ * around, and check which deltas are flagged binary.  Samples 5 and 6 probe
+ * the cutoff: a NUL as byte 8000 must be detected, a NUL as byte 8001 not.
+ */
+void test_diff_workdir__binary_detection(void)
+{
+	git_index *idx;
+	git_diff *diff = NULL;
+	git_buf b = GIT_BUF_INIT;
+	int i;
+	/* entries are { ptr, asize, size }; size 0 is replaced by strlen(ptr) */
+	git_buf data[10] = {
+		{ "1234567890", 0, 0 }, /* 0 - all ascii text control */
+		{ "Åü†HøπΩ", 0, 0 }, /* 1 - UTF-8 multibyte text */
+		{ "\xEF\xBB\xBFÜ⤒ƒ8£€", 0, 0 }, /* 2 - UTF-8 with BOM */
+		{ STR999Z, 0, 1000 }, /* 3 - ASCII with NUL at 1000 */
+		{ STR3999Z, 0, 4000 }, /* 4 - ASCII with NUL at 4000 */
+		{ STR4000 STR3999Z "x", 0, 8001 }, /* 5 - ASCII with NUL at 8000 */
+		{ STR4000 STR4000 "\0", 0, 8001 }, /* 6 - ASCII with NUL at 8001 */
+		{ "\x00\xDC\x00\x6E\x21\x39\xFE\x0E\x00\x63\x00\xF8"
+		  "\x00\x64\x00\x65\x20\x48", 0, 18 }, /* 7 - UTF-16 text */
+		{ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d"
+		  "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d",
+		  0, 26 }, /* 8 - All non-printable characters (no NUL) */
+		{ "Hello \x01\x02\x03\x04\x05\x06 World!\x01\x02\x03\x04"
+		  "\x05\x06\x07", 0, 26 }, /* 9 - 50-50 non-printable (no NUL) */
+	};
+
+	g_repo = cl_git_sandbox_init("empty_standard_repo");
+	cl_git_pass(git_repository_index(&idx, g_repo));
+
+	/* We start with ASCII in index and test data in workdir,
+	 * then we will try with test data in index and ASCII in workdir.
+	 */
+
+	cl_git_pass(git_buf_sets(&b, "empty_standard_repo/0"));
+	for (i = 0; i < 10; ++i) {
+		b.ptr[b.size - 1] = '0' + i;
+		cl_git_mkfile(b.ptr, "baseline");
+		cl_git_pass(git_index_add_bypath(idx, &b.ptr[b.size - 1]));
+
+		if (data[i].size == 0)
+			data[i].size = strlen(data[i].ptr);
+		cl_git_write2file(
+			b.ptr, data[i].ptr, data[i].size, O_WRONLY|O_TRUNC, 0664);
+	}
+	cl_git_pass(git_index_write(idx));
+
+	cl_git_pass(git_diff_index_to_workdir(&diff, g_repo, NULL, NULL));
+
+	cl_assert_equal_i(10, git_diff_num_deltas(diff));
+
+	/* using diff binary detection (i.e. looking for NUL byte) */
+	assert_delta_binary(diff, 0, false);
+	assert_delta_binary(diff, 1, false);
+	assert_delta_binary(diff, 2, false);
+	assert_delta_binary(diff, 3, true);
+	assert_delta_binary(diff, 4, true);
+	assert_delta_binary(diff, 5, true);
+	assert_delta_binary(diff, 6, false);
+	assert_delta_binary(diff, 7, true);
+	assert_delta_binary(diff, 8, false);
+	assert_delta_binary(diff, 9, false);
+	/* The above have been checked to match command-line Git */
+
+	git_diff_free(diff);
+
+	cl_git_pass(git_buf_sets(&b, "empty_standard_repo/0"));
+	for (i = 0; i < 10; ++i) {
+		b.ptr[b.size - 1] = '0' + i;
+		cl_git_pass(git_index_add_bypath(idx, &b.ptr[b.size - 1]));
+
+		cl_git_write2file(b.ptr, "baseline\n", 9, O_WRONLY|O_TRUNC, 0664);
+	}
+	cl_git_pass(git_index_write(idx));
+
+	cl_git_pass(git_diff_index_to_workdir(&diff, g_repo, NULL, NULL));
+
+	cl_assert_equal_i(10, git_diff_num_deltas(diff));
+
+	/* using diff binary detection (i.e. looking for NUL byte) */
+	assert_delta_binary(diff, 0, false);
+	assert_delta_binary(diff, 1, false);
+	assert_delta_binary(diff, 2, false);
+	assert_delta_binary(diff, 3, true);
+	assert_delta_binary(diff, 4, true);
+	assert_delta_binary(diff, 5, true);
+	assert_delta_binary(diff, 6, false);
+	assert_delta_binary(diff, 7, true);
+	assert_delta_binary(diff, 8, false);
+	assert_delta_binary(diff, 9, false);
+
+	git_diff_free(diff);
+
+	git_index_free(idx);
+	git_buf_free(&b);
+}