diff --git a/src/path.c b/src/path.c
index 724d9ede2..b9c9729c1 100644
--- a/src/path.c
+++ b/src/path.c
@@ -1282,93 +1282,75 @@ GIT_INLINE(bool) verify_dospath(
 		component[last] != ':');
 }
 
-GIT_INLINE(bool) verify_dotgit_hfs(const char *component, size_t len)
+/*
+ * Read the next significant code point from a UTF-8 path component,
+ * skipping the ignorable code points listed below and folding ASCII
+ * uppercase letters to lowercase. `*in` and `*len` are advanced past
+ * every byte consumed.
+ *
+ * Returns 0 once the input is exhausted, -1 on invalid UTF-8.
+ */
+static int32_t next_hfs_char(const char **in, size_t *len)
 {
-	const unsigned char *c;
-	int git = 0, ign = 0;
-	unsigned char one, two;
+	while (*len) {
+		int32_t codepoint;
+		/* a sequence is at most 4 bytes long, so capping the length
+		 * keeps the size_t -> int conversion from truncating */
+		int avail = (*len < 4) ? (int)(*len) : 4;
+		int cp_len = git__utf8_iterate((const uint8_t *)(*in), avail, &codepoint);
+		if (cp_len < 0)
+			return -1;
 
-	while (len) {
-		switch (*(c = (const unsigned char *)component++)) {
-		case '.':
-			if (ign || git++ != 0)
-				return true;
-			break;
-		case 'g':
-		case 'G':
-			if (ign || git++ != 1)
-				return true;
-			break;
-		case 'i':
-		case 'I':
-			if (ign || git++ != 2)
-				return true;
-			break;
-		case 't':
-		case 'T':
-			if (ign || git++ != 3)
-				return true;
-			break;
+		(*in) += cp_len;
+		(*len) -= cp_len;
 
-		case 0xe2:
-		case 0xef:
-			if (ign++ != 0)
-				return true;
-			one = *c;
-			break;
-
-		case 0x80:
-		case 0x81:
-			if (ign++ != 1 || one != 0xe2)
-				return true;
-			two = *c;
-			break;
-
-		case 0xbb:
-			if (ign++ != 1 || one != 0xef)
-				return true;
-			two = *c;
-			break;
-
-		case 0x8c:
-		case 0x8d:
-		case 0x8e:
-		case 0x8f:
-			if (ign != 2 || two != 0x80)
-				return true;
-			ign = 0;
-			break;
-
-		case 0xaa:
-		case 0xab:
-		case 0xac:
-		case 0xad:
-		case 0xae:
-			if (ign != 2 || (two != 0x80 && two != 0x81))
-				return true;
-			ign = 0;
-			break;
-
-		case 0xaf:
-			if (ign != 2 || two != 0x81)
-				return true;
-			ign = 0;
-			break;
-
-		case 0xbf:
-			if (ign != 2 || two != 0xbb)
-				return true;
-			ign = 0;
-			break;
-
-		default:
-			return true;
+		/* these code points are ignored completely */
+		switch (codepoint) {
+		case 0x200c: /* ZERO WIDTH NON-JOINER */
+		case 0x200d: /* ZERO WIDTH JOINER */
+		case 0x200e: /* LEFT-TO-RIGHT MARK */
+		case 0x200f: /* RIGHT-TO-LEFT MARK */
+		case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */
+		case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */
+		case 0x202c: /* POP DIRECTIONAL FORMATTING */
+		case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */
+		case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */
+		case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */
+		case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */
+		case 0x206c: /* INHIBIT ARABIC FORM SHAPING */
+		case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */
+		case 0x206e: /* NATIONAL DIGIT SHAPES */
+		case 0x206f: /* NOMINAL DIGIT SHAPES */
+		case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */
+			continue;
 		}
 
-		len--;
+		/* fold into lowercase -- only the ASCII range is folded, and
+		 * explicitly so: tolower() is undefined for values that do not
+		 * fit in an unsigned char and depends on the locale, while the
+		 * git folder name can only be composed of ASCII characters
+		 */
+		if (codepoint >= 'A' && codepoint <= 'Z')
+			codepoint += 'a' - 'A';
+		return codepoint;
 	}
+	return 0; /* end of string */
+}
 
-	return (ign || git != 4);
+/*
+ * Returns true unless the component reads as ".git" once ignorable
+ * code points are skipped and ASCII case is folded.
+ */
+static bool verify_dotgit_hfs(const char *path, size_t len)
+{
+	if (next_hfs_char(&path, &len) != '.' ||
+		next_hfs_char(&path, &len) != 'g' ||
+		next_hfs_char(&path, &len) != 'i' ||
+		next_hfs_char(&path, &len) != 't' ||
+		next_hfs_char(&path, &len) != 0)
+		return true;
+
+	return false;
 }
 
 GIT_INLINE(bool) verify_char(unsigned char c, unsigned int flags)
diff --git a/src/util.c b/src/util.c
index 6b0efbea5..7ee3e2ff9 100644
--- a/src/util.c
+++ b/src/util.c
@@ -664,3 +664,102 @@ void git__insertsort_r(
 	if (freeswap)
 		git__free(swapel);
 }
+
+/*
+ * Sequence length implied by each possible lead byte: 1 for ASCII,
+ * 2-4 for multi-byte lead bytes, 0 for bytes that cannot start a
+ * sequence (continuation bytes and 0xF8-0xFF).
+ */
+static const int8_t utf8proc_utf8class[256] = {
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * Length in bytes of the UTF-8 sequence starting at `str`.
+ *
+ * `str_len` is the number of bytes available; a negative value means
+ * the string is NUL-terminated. Returns the sequence length, or a
+ * value <= 0 on failure: -1 for a byte that cannot start a sequence,
+ * -str_len when the buffer ends before the sequence does, and -i
+ * when byte `i` is not a continuation byte.
+ */
+int git__utf8_charlen(const uint8_t *str, int str_len)
+{
+	int length, i;
+
+	/* an empty buffer has no lead byte to inspect */
+	if (str_len == 0)
+		return 0;
+
+	length = utf8proc_utf8class[str[0]];
+	if (!length)
+		return -1;
+
+	if (str_len >= 0 && length > str_len)
+		return -str_len;
+
+	for (i = 1; i < length; i++) {
+		if ((str[i] & 0xC0) != 0x80)
+			return -i;
+	}
+
+	return length;
+}
+
+/*
+ * Decode one code point; see util.h for the contract. Overlong
+ * encodings, surrogates, noncharacters and values above 0x10FFFF
+ * are rejected.
+ */
+int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
+{
+	int length;
+	int32_t uc = -1;
+
+	*dst = -1;
+	length = git__utf8_charlen(str, str_len);
+	if (length < 0)
+		return -1;
+
+	switch (length) {
+	case 1:
+		uc = str[0];
+		break;
+	case 2:
+		uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
+		if (uc < 0x80) uc = -1;
+		break;
+	case 3:
+		uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
+			+ (str[2] & 0x3F);
+		if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
+			(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
+		break;
+	case 4:
+		uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
+			+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
+		if (uc < 0x10000 || uc >= 0x110000) uc = -1;
+		break;
+	}
+
+	if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
+		return -1;
+
+	*dst = uc;
+	return length;
+}
diff --git a/src/util.h b/src/util.h
index 17cc08987..7cfc0d644 100644
--- a/src/util.h
+++ b/src/util.h
@@ -367,6 +367,17 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
  */
 extern size_t git__unescape(char *str);
 
+/*
+ * Iterate through a UTF-8 string, yielding one
+ * codepoint at a time.
+ *
+ * @param str current position in the string
+ * @param str_len size left in the string; -1 if the string is NUL-terminated
+ * @param dst pointer where to store the current codepoint (set to -1 on failure)
+ * @return length in bytes of the read codepoint; -1 if the codepoint was invalid
+ */
+extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
+
 /*
  * Safely zero-out memory, making sure that the compiler
  * doesn't optimize away the operation.