Pass unconverted data when iconv doesn't like it

When using Iconv to convert unicode data and iconv doesn't like
the source data (because it thinks that it's not actual UTF-8),
instead of stopping the operation, just use the unconverted data.
This will generally do the right thing on the filesystem, since
that is the source of the non-UTF-8 path data anyhow.

This adds some tests for creating and looking up branches with
messy Unicode names.  Also, this takes the helper function that
was previously internal to `git_repository_init` and makes it
into `git_path_does_fs_decompose_unicode` which is useful in
tests to understand what the expected results should be.
This commit is contained in:
Russell Belfer 2014-05-08 13:52:46 -07:00
parent ed476c236b
commit 43a0413524
4 changed files with 122 additions and 56 deletions

View File

@ -799,8 +799,11 @@ int git_path_iconv(git_path_iconv_t *ic, char **in, size_t *inlen)
if (rv != (size_t)-1)
break;
/* if we cannot convert the data (probably because iconv thinks
* it is not valid UTF-8 source data), then use original data
*/
if (errno != E2BIG)
goto fail;
return 0;
/* make space for 2x the remaining data to be converted
* (with per retry overhead to avoid infinite loops)
@ -823,6 +826,64 @@ fail:
return -1;
}
/* mkstemp-style template whose name part is "Åström" in precomposed
 * UTF-8: U+00C5, "str", U+00F6, "m". The trailing "XXXXXX" is the
 * placeholder that p_mkstemp() fills in.
 */
static const char *nfc_file = "\xC3\x85\x73\x74\x72\xC3\xB6\x6D.XXXXXX";
/* The same name in decomposed UTF-8: "A" + U+030A (combining ring above),
 * "str", "o" + U+0308 (combining diaeresis), "m". The "XXXXXX" suffix is
 * overwritten with the characters generated for the precomposed file.
 */
static const char *nfd_file = "\x41\xCC\x8A\x73\x74\x72\x6F\xCC\x88\x6D.XXXXXX";
/* Check if the platform is decomposing unicode data for us.  We will
 * emulate core Git and prefer to use precomposed unicode data internally
 * on these platforms, composing the decomposed unicode on the fly.
 *
 * This mainly happens on the Mac where HFS+ stores filenames as
 * decomposed unicode.  Even on VFAT and SAMBA file systems, the Mac will
 * return decomposed unicode from readdir() even when the actual
 * filesystem is storing precomposed unicode.
 *
 * `root` is the directory in which a temporary probe file is created.
 * Returns true when a file created under a precomposed name can also be
 * found through the decomposed spelling of that name; returns false
 * otherwise, including when the probe file cannot be created.
 */
bool git_path_does_fs_decompose_unicode(const char *root)
{
	git_buf nfc_path = GIT_BUF_INIT, nfd_path = GIT_BUF_INIT;
	int fd;
	bool found_decomposed = false;
	char suffix[6]; /* same length as the "XXXXXX" template tail */

	/* Create a file using a precomposed path and then try to find it
	 * using the decomposed name.  If the lookup succeeds, the caller
	 * should precompose unicode for this location.
	 */
	if (git_buf_joinpath(&nfc_path, root, nfc_file) < 0 ||
		(fd = p_mkstemp(nfc_path.ptr)) < 0)
		goto done;
	p_close(fd);

	/* record trailing characters generated by mkstemp */
	memcpy(suffix, nfc_path.ptr + nfc_path.size - sizeof(suffix),
		sizeof(suffix));

	/* Try to look up as NFD path.  A separate buffer is used so that the
	 * name of the file we created stays available for cleanup even if
	 * building the decomposed path fails.
	 */
	if (git_buf_joinpath(&nfd_path, root, nfd_file) == 0) {
		memcpy(nfd_path.ptr + nfd_path.size - sizeof(suffix), suffix,
			sizeof(suffix));
		found_decomposed = git_path_exists(nfd_path.ptr);
	}

	/* always remove temporary file (using original precomposed path) */
	(void)p_unlink(nfc_path.ptr);

done:
	git_buf_free(&nfd_path);
	git_buf_free(&nfc_path);
	return found_decomposed;
}
#else
/* Fallback variant compiled in the #else branch (presumably when
 * GIT_USE_ICONV is not defined -- the opening conditional is outside
 * this view).  No probing is done: `root` is ignored and the filesystem
 * is always reported as not decomposing unicode.
 */
bool git_path_does_fs_decompose_unicode(const char *root)
{
GIT_UNUSED(root);
return false;
}
#endif
#if defined(__sun) || defined(__GNU__)

View File

@ -436,4 +436,6 @@ extern int git_path_iconv(git_path_iconv_t *ic, char **in, size_t *inlen);
#endif /* GIT_USE_ICONV */
/**
 * Probe whether the filesystem under `root` decomposes unicode names.
 *
 * The probing implementation creates a temporary file with a precomposed
 * name inside `root` and checks whether it is also reachable through the
 * decomposed spelling of that name.
 *
 * @param root directory in which the temporary probe file is created
 * @return true if the decomposed name finds the file; false otherwise
 *         (also when the probe file cannot be created, and always in the
 *         fallback build variant)
 */
extern bool git_path_does_fs_decompose_unicode(const char *root);
#endif

View File

@ -880,60 +880,6 @@ static bool are_symlinks_supported(const char *wd_path)
return symlinks_supported;
}
#ifdef GIT_USE_ICONV
/* mkstemp-style template whose name part is "Åström" in precomposed
 * UTF-8: U+00C5, "str", U+00F6, "m". The trailing "XXXXXX" is the
 * placeholder that p_mkstemp() fills in.
 */
static const char *nfc_file = "\xC3\x85\x73\x74\x72\xC3\xB6\x6D.XXXXXX";
/* The same name in decomposed UTF-8: "A" + U+030A (combining ring above),
 * "str", "o" + U+0308 (combining diaeresis), "m". The "XXXXXX" suffix is
 * overwritten with the characters generated for the precomposed file.
 */
static const char *nfd_file = "\x41\xCC\x8A\x73\x74\x72\x6F\xCC\x88\x6D.XXXXXX";
/* Check if the platform is decomposing unicode data for us.  We will
 * emulate core Git and prefer to use precomposed unicode data internally
 * on these platforms, composing the decomposed unicode on the fly.
 *
 * This mainly happens on the Mac where HFS+ stores filenames as
 * decomposed unicode.  Even on VFAT and SAMBA file systems, the Mac will
 * return decomposed unicode from readdir() even when the actual
 * filesystem is storing precomposed unicode.
 *
 * `wd_path` is the directory in which a temporary probe file is created.
 * Returns true when a file created under a precomposed name can also be
 * found through the decomposed spelling of that name; returns false
 * otherwise, including when the probe file cannot be created.
 */
static bool does_fs_decompose_unicode_paths(const char *wd_path)
{
	git_buf nfc_path = GIT_BUF_INIT, nfd_path = GIT_BUF_INIT;
	int fd;
	bool found_decomposed = false;
	char suffix[6]; /* same length as the "XXXXXX" template tail */

	/* Create a file using a precomposed path and then try to find it
	 * using the decomposed name.  If the lookup succeeds, then we will
	 * mark that we should precompose unicode for this repository.
	 */
	if (git_buf_joinpath(&nfc_path, wd_path, nfc_file) < 0 ||
		(fd = p_mkstemp(nfc_path.ptr)) < 0)
		goto done;
	p_close(fd);

	/* record trailing characters generated by mkstemp */
	memcpy(suffix, nfc_path.ptr + nfc_path.size - sizeof(suffix),
		sizeof(suffix));

	/* Try to look up as NFD path.  A separate buffer is used so that the
	 * name of the file we created stays available for cleanup even if
	 * building the decomposed path fails.
	 */
	if (git_buf_joinpath(&nfd_path, wd_path, nfd_file) == 0) {
		memcpy(nfd_path.ptr + nfd_path.size - sizeof(suffix), suffix,
			sizeof(suffix));
		found_decomposed = git_path_exists(nfd_path.ptr);
	}

	/* always remove temporary file (using original precomposed path) */
	(void)p_unlink(nfc_path.ptr);

done:
	git_buf_free(&nfd_path);
	git_buf_free(&nfc_path);
	return found_decomposed;
}
#endif
static int create_empty_file(const char *path, mode_t mode)
{
int fd;
@ -1024,8 +970,9 @@ static int repo_init_fs_configs(
#ifdef GIT_USE_ICONV
if ((error = git_config_set_bool(
cfg, "core.precomposeunicode",
does_fs_decompose_unicode_paths(work_dir))) < 0)
git_path_does_fs_decompose_unicode(work_dir))) < 0)
return error;
/* on non-iconv platforms, don't even set core.precomposeunicode */
#endif
return 0;

View File

@ -1,5 +1,6 @@
#include "clar_libgit2.h"
#include "refs.h"
#include "path.h"
/* Repository the tests in this file operate on (initialised outside this view). */
static git_repository *repo;
/* Commit that created branches point at; filled in via retrieve_known_commit(). */
static git_commit *target;
@ -137,3 +138,58 @@ void test_refs_branches_create__default_reflog_message(void)
git_reflog_free(log);
git_signature_free(sig);
}
/* Look up the local branch `lookup_as` and verify two things about the
 * reference that comes back: its full name is "refs/heads/" followed by
 * `expected`, and its target is the id of the shared `target` commit.
 */
static void assert_branch_matches_name(
	const char *expected, const char *lookup_as)
{
	git_buf full_name = GIT_BUF_INIT;
	git_reference *found;

	cl_git_pass(git_branch_lookup(&found, repo, lookup_as, GIT_BRANCH_LOCAL));

	/* build the fully-qualified reference name we expect to see */
	cl_git_pass(git_buf_sets(&full_name, "refs/heads/"));
	cl_git_pass(git_buf_puts(&full_name, expected));

	cl_assert_equal_s(full_name.ptr, git_reference_name(found));

	/* git_oid_cmp() yields 0 when both ids are equal */
	cl_git_pass(
		git_oid_cmp(git_reference_target(found), git_commit_id(target)));

	git_reference_free(found);
	git_buf_free(&full_name);
}
/* Create, look up and delete branches whose names are "Åström" in
 * precomposed (NFC) and decomposed (NFD) UTF-8, plus a 4-byte sequence
 * (U+1F377).
 *
 * The name a branch is expected to come back under depends on the
 * environment: with core.precomposeunicode set, the NFD name is expected
 * to be reported as NFC; otherwise, on a Mac filesystem that decomposes
 * unicode, the NFC name is expected to be reported as NFD.
 *
 * Looking a branch up through the *other* normalization form can only
 * succeed when the filesystem treats both spellings as the same file, so
 * that extra check is limited to filesystems where the probe reports
 * decomposition.  Elsewhere NFC and NFD names are distinct files and the
 * alternate lookup would fail with "not found".
 */
void test_refs_branches_create__can_create_branch_with_unicode(void)
{
	const char *nfc = "\xC3\x85\x73\x74\x72\xC3\xB6\x6D";
	const char *nfd = "\x41\xCC\x8A\x73\x74\x72\x6F\xCC\x88\x6D";
	const char *emoji = "\xF0\x9F\x8D\xB7";
	const char *names[] = { nfc, nfd, emoji };
	/* the same name in the other normalization form, if there is one */
	const char *alt[] = { nfd, nfc, NULL };
	const char *expected[] = { nfc, nfd, emoji };
	unsigned int i;
	bool fs_decomposes;

	retrieve_known_commit(&target, repo);

	fs_decomposes =
		git_path_does_fs_decompose_unicode(git_repository_path(repo));

	if (cl_repo_get_bool(repo, "core.precomposeunicode"))
		expected[1] = nfc;
#ifdef __APPLE__
	/* test decomp. because not all Mac filesystems decompose unicode */
	else if (fs_decomposes)
		expected[0] = nfd;
#endif

	for (i = 0; i < ARRAY_SIZE(names); ++i) {
		cl_git_pass(git_branch_create(
			&branch, repo, names[i], target, 0, NULL, NULL));
		cl_git_pass(git_oid_cmp(
			git_reference_target(branch), git_commit_id(target)));

		assert_branch_matches_name(expected[i], names[i]);

		/* only a normalizing filesystem resolves the alternate form */
		if (fs_decomposes && alt[i])
			assert_branch_matches_name(expected[i], alt[i]);

		cl_git_pass(git_branch_delete(branch));
		git_reference_free(branch);
		branch = NULL;
	}
}