From c0b01b7572232bd6c0cd848e9b92ed8a8d678bcf Mon Sep 17 00:00:00 2001 From: Edward Thomson Date: Mon, 19 Aug 2013 18:46:26 -0500 Subject: [PATCH] Skip UTF-8 BOM in binary detection When a git_buf contains a UTF-8 BOM, the three bytes comprising that BOM are treated as unprintable characters. For a small git_buf, the three BOM characters overwhelm the printable characters. This is problematic when trying to check out a small file as the CR/LF filtering will not apply. --- src/buf_text.c | 6 ++++++ tests-clar/core/buffer.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/buf_text.c b/src/buf_text.c index 472339def..ecf592b51 100644 --- a/src/buf_text.c +++ b/src/buf_text.c @@ -170,8 +170,14 @@ int git_buf_text_common_prefix(git_buf *buf, const git_strarray *strings) bool git_buf_text_is_binary(const git_buf *buf) { const char *scan = buf->ptr, *end = buf->ptr + buf->size; + git_bom_t bom; int printable = 0, nonprintable = 0; + scan += git_buf_text_detect_bom(&bom, buf, 0); + + if (bom > GIT_BOM_UTF8) + return 1; + while (scan < end) { unsigned char c = *scan++; diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c index 9d9628cfd..8a0b6711f 100644 --- a/tests-clar/core/buffer.c +++ b/tests-clar/core/buffer.c @@ -718,6 +718,8 @@ void test_core_buffer__classify_with_utf8(void) size_t data1len = 31; char *data2 = "Internal NUL!!!\000\n\nI see you!\n"; size_t data2len = 29; + char *data3 = "\xef\xbb\xbfThis is UTF-8 with a BOM.\n"; + size_t data3len = 20; git_buf b; b.ptr = data0; b.size = b.asize = data0len; @@ -731,6 +733,10 @@ void test_core_buffer__classify_with_utf8(void) b.ptr = data2; b.size = b.asize = data2len; cl_assert(git_buf_text_is_binary(&b)); cl_assert(git_buf_text_contains_nul(&b)); + + b.ptr = data3; b.size = b.asize = data3len; + cl_assert(!git_buf_text_is_binary(&b)); + cl_assert(!git_buf_text_contains_nul(&b)); } #define SIMILARITY_TEST_DATA_1 \