From 3fa764edd2a11691876153fef6523375b6e4553d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 4 Nov 2015 09:20:14 -0800 Subject: [PATCH 1/5] filebuf: allow using a custom buffer size Allow setting the buffer size on open in order to use this data structure more generally as a spill buffer, with larger buffer sizes for specific use-cases. --- src/filebuf.c | 7 ++++++- src/filebuf.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/filebuf.c b/src/filebuf.c index 101d5082a..6eee530ee 100644 --- a/src/filebuf.c +++ b/src/filebuf.c @@ -272,6 +272,11 @@ cleanup: } int git_filebuf_open(git_filebuf *file, const char *path, int flags, mode_t mode) +{ + return git_filebuf_open_withsize(file, path, flags, mode, WRITE_BUFFER_SIZE); +} + +int git_filebuf_open_withsize(git_filebuf *file, const char *path, int flags, mode_t mode, size_t size) { int compression, error = -1; size_t path_len, alloc_len; @@ -286,7 +291,7 @@ int git_filebuf_open(git_filebuf *file, const char *path, int flags, mode_t mode if (flags & GIT_FILEBUF_DO_NOT_BUFFER) file->do_not_buffer = true; - file->buf_size = WRITE_BUFFER_SIZE; + file->buf_size = size; file->buf_pos = 0; file->fd = -1; file->last_error = BUFERR_OK; diff --git a/src/filebuf.h b/src/filebuf.h index f4d255b0a..467708d45 100644 --- a/src/filebuf.h +++ b/src/filebuf.h @@ -79,6 +79,7 @@ int git_filebuf_reserve(git_filebuf *file, void **buff, size_t len); int git_filebuf_printf(git_filebuf *file, const char *format, ...) 
GIT_FORMAT_PRINTF(2, 3); int git_filebuf_open(git_filebuf *lock, const char *path, int flags, mode_t mode); +int git_filebuf_open_withsize(git_filebuf *file, const char *path, int flags, mode_t mode, size_t size); int git_filebuf_commit(git_filebuf *lock); int git_filebuf_commit_at(git_filebuf *lock, const char *path); void git_filebuf_cleanup(git_filebuf *lock); From 0a5c6028898e637544962c2c6b1ef8eeeb9c1d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 4 Nov 2015 10:30:48 -0800 Subject: [PATCH 2/5] blob: introduce creating a blob by writing into a stream The pair of `git_blob_create_fromstream()` and `git_blob_create_fromstream_commit()` is meant to replace `git_blob_create_fromchunks()` by providing a way for a user to write a new blob when they want filtering or they do not know the size. This approach allows the caller to retain control over when to add data to this buffer and is a more natural fit for higher-level languages' own stream abstractions instead of having to handle IO wait in the callback. The in-memory buffer size of 2MB is chosen somewhat arbitrarily to be a round multiple of usual page sizes and a value where most blobs seem likely to be either way below or way over that size. It's also a round number of pages. This implementation re-uses the helper we have from `_fromchunks()` so we end up writing everything to disk, but hopefully more efficiently than with a default filebuf. A later optimisation can be to avoid writing the in-memory contents to disk, with some extra complexity. 
--- include/git2/blob.h | 43 ++++++++++++++ src/blob.c | 92 +++++++++++++++++++++++++++++ tests/object/blob/fromstream.c | 103 +++++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+) create mode 100644 tests/object/blob/fromstream.c diff --git a/include/git2/blob.h b/include/git2/blob.h index 9a57c37f5..f451593cd 100644 --- a/include/git2/blob.h +++ b/include/git2/blob.h @@ -191,6 +191,49 @@ GIT_EXTERN(int) git_blob_create_fromchunks( git_blob_chunk_cb callback, void *payload); +/** + * Create a stream to write a new blob into the object db + * + * This function may need to buffer the data on disk and will in + * general not be the right choice if you know the size of the data + * to write. If you have data in memory, use + * `git_blob_create_frombuffer()`. If you do not, but know the size of + * the contents (and don't want/need to perform filtering), use + * `git_odb_open_wstream()`. + * + * Don't close this stream yourself but pass it to + * `git_blob_create_fromstream_commit()` to commit the write to the + * object db and get the object id. + * + * If the `hintpath` parameter is filled, it will be used to determine + * what git filters should be applied to the object before it is written + * to the object database. + * + * @param out the stream into which to write + * @param repo Repository where the blob will be written. + * This repository can be bare or not. + * @param hintpath If not NULL, will be used to select data filters + * to apply onto the content of the blob to be created. + * @return 0 or error code + */ +GIT_EXTERN(int) git_blob_create_fromstream( + git_writestream **out, + git_repository *repo, + const char *hintpath); + +/** + * Close the stream and write the blob to the object db + * + * The stream will be closed and freed. 
+ * + * @param out the id of the new blob + * @param stream the stream to close + * @return 0 or an error code + */ +GIT_EXTERN(int) git_blob_create_fromstream_commit( + git_oid *out, + git_writestream *stream); + /** * Write an in-memory buffer to the ODB as a blob * diff --git a/src/blob.c b/src/blob.c index ad0f4ac62..a1ef2479e 100644 --- a/src/blob.c +++ b/src/blob.c @@ -334,6 +334,98 @@ cleanup: return error; } +typedef struct { + git_writestream parent; + git_filebuf fbuf; + git_repository *repo; + char *hintpath; +} blob_writestream; + +static int blob_writestream_close(git_writestream *_stream) +{ + blob_writestream *stream = (blob_writestream *) _stream; + + git_filebuf_cleanup(&stream->fbuf); + return 0; +} + +static void blob_writestream_free(git_writestream *_stream) +{ + blob_writestream *stream = (blob_writestream *) _stream; + + git_filebuf_cleanup(&stream->fbuf); + git__free(stream->hintpath); + git__free(stream); +} + +static int blob_writestream_write(git_writestream *_stream, const char *buffer, size_t len) +{ + blob_writestream *stream = (blob_writestream *) _stream; + + return git_filebuf_write(&stream->fbuf, buffer, len); +} + +int git_blob_create_fromstream(git_writestream **out, git_repository *repo, const char *hintpath) +{ + int error; + git_buf path = GIT_BUF_INIT; + blob_writestream *stream; + + assert(out && repo); + + stream = git__calloc(1, sizeof(blob_writestream)); + GITERR_CHECK_ALLOC(stream); + + if (hintpath) { + stream->hintpath = git__strdup(hintpath); + GITERR_CHECK_ALLOC(stream->hintpath); + } + + stream->repo = repo; + stream->parent.write = blob_writestream_write; + stream->parent.close = blob_writestream_close; + stream->parent.free = blob_writestream_free; + + if ((error = git_buf_joinpath(&path, + git_repository_path(repo), GIT_OBJECTS_DIR "streamed")) < 0) + goto cleanup; + + if ((error = git_filebuf_open_withsize(&stream->fbuf, git_buf_cstr(&path), GIT_FILEBUF_TEMPORARY, + 0666, 2 * 1024 * 1024)) < 0) + goto cleanup; + 
+ *out = (git_writestream *) stream; + +cleanup: + if (error < 0) + blob_writestream_free((git_writestream *) stream); + + git_buf_free(&path); + return error; +} + +int git_blob_create_fromstream_commit(git_oid *out, git_writestream *_stream) +{ + int error; + blob_writestream *stream = (blob_writestream *) _stream; + + /* + * We can make this more efficient by avoiding writing to + * disk, but for now let's re-use the helper functions we + * have. + */ + if ((error = git_filebuf_flush(&stream->fbuf)) < 0) + goto cleanup; + + error = git_blob__create_from_paths(out, NULL, stream->repo, stream->fbuf.path_lock, + stream->hintpath, 0, !!stream->hintpath); + +cleanup: + blob_writestream_free(_stream); + return error; + +} + int git_blob_is_binary(const git_blob *blob) { git_buf content = GIT_BUF_INIT; diff --git a/tests/object/blob/fromstream.c b/tests/object/blob/fromstream.c new file mode 100644 index 000000000..10f2d8b31 --- /dev/null +++ b/tests/object/blob/fromstream.c @@ -0,0 +1,103 @@ +#include "clar_libgit2.h" +#include "buffer.h" +#include "posix.h" +#include "path.h" +#include "fileops.h" + +static git_repository *repo; +static char textual_content[] = "libgit2\n\r\n\0"; + +void test_object_blob_fromstream__initialize(void) +{ + repo = cl_git_sandbox_init("testrepo.git"); +} + +void test_object_blob_fromstream__cleanup(void) +{ + cl_git_sandbox_cleanup(); +} + +static int text_chunked_source_cb(char *content, size_t max_length, void *payload) +{ + int *count; + + GIT_UNUSED(max_length); + + count = (int *)payload; + (*count)--; + + if (*count == 0) + return 0; + + strcpy(content, textual_content); + return (int)strlen(textual_content); +} + +void test_object_blob_fromstream__multiple_write(void) +{ + git_oid expected_id, id; + git_object *blob; + git_writestream *stream; + int i, howmany = 6; + + cl_git_pass(git_oid_fromstr(&expected_id, "321cbdf08803c744082332332838df6bd160f8f9")); + + cl_git_fail_with(GIT_ENOTFOUND, + git_object_lookup(&blob, repo, 
&expected_id, GIT_OBJ_ANY)); + + cl_git_pass(git_blob_create_fromstream(&stream, repo, NULL)); + + for (i = 0; i < howmany; i++) + cl_git_pass(stream->write(stream, textual_content, strlen(textual_content))); + + cl_git_pass(git_blob_create_fromstream_end(&id, stream)); + cl_assert_equal_oid(&expected_id, &id); + + cl_git_pass(git_object_lookup(&blob, repo, &expected_id, GIT_OBJ_BLOB)); + + git_object_free(blob); +} + +#define GITATTR "* text=auto\n" \ + "*.txt text\n" \ + "*.data binary\n" + +static void write_attributes(git_repository *repo) +{ + git_buf buf = GIT_BUF_INIT; + + cl_git_pass(git_buf_joinpath(&buf, git_repository_path(repo), "info")); + cl_git_pass(git_buf_joinpath(&buf, git_buf_cstr(&buf), "attributes")); + + cl_git_pass(git_futils_mkpath2file(git_buf_cstr(&buf), 0777)); + cl_git_rewritefile(git_buf_cstr(&buf), GITATTR); + + git_buf_free(&buf); +} + +static void assert_named_chunked_blob(const char *expected_sha, const char *fake_name) +{ + git_oid expected_id, id; + git_writestream *stream; + int i, howmany = 6; + + cl_git_pass(git_oid_fromstr(&expected_id, expected_sha)); + + cl_git_pass(git_blob_create_fromstream(&stream, repo, fake_name)); + + for (i = 0; i < howmany; i++) + cl_git_pass(stream->write(stream, textual_content, strlen(textual_content))); + + cl_git_pass(git_blob_create_fromstream_end(&id, stream)); + + cl_assert_equal_oid(&expected_id, &id); +} + +void test_object_blob_fromstream__creating_a_blob_from_chunks_honors_the_attributes_directives(void) +{ + write_attributes(repo); + + assert_named_chunked_blob("321cbdf08803c744082332332838df6bd160f8f9", "dummy.data"); + assert_named_chunked_blob("e9671e138a780833cb689753570fd10a55be84fb", "dummy.txt"); + assert_named_chunked_blob("e9671e138a780833cb689753570fd10a55be84fb", "dummy.dunno"); +} From 35e68606da5978f8e9ccdbd01194354583ddf021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 4 Nov 2015 10:36:50 -0800 Subject: [PATCH 3/5] blob: fix fromchunks 
iteration counter By returning when the count goes to zero rather than below it, setting `howmany` to 7 in fact writes out the string 6 times. Correct the termination condition to write out the string the amount of times we specify. --- tests/object/blob/fromchunks.c | 8 ++++---- tests/object/blob/fromstream.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/object/blob/fromchunks.c b/tests/object/blob/fromchunks.c index b61cabfe1..5a7d8f722 100644 --- a/tests/object/blob/fromchunks.c +++ b/tests/object/blob/fromchunks.c @@ -26,7 +26,7 @@ static int text_chunked_source_cb(char *content, size_t max_length, void *payloa count = (int *)payload; (*count)--; - if (*count == 0) + if (*count < 0) return 0; strcpy(content, textual_content); @@ -37,7 +37,7 @@ void test_object_blob_fromchunks__can_create_a_blob_from_a_in_memory_chunk_provi { git_oid expected_oid, oid; git_object *blob; - int howmany = 7; + int howmany = 6; cl_git_pass(git_oid_fromstr(&expected_oid, "321cbdf08803c744082332332838df6bd160f8f9")); @@ -58,7 +58,7 @@ void test_object_blob_fromchunks__doesnot_overwrite_an_already_existing_object(v git_buf path = GIT_BUF_INIT; git_buf content = GIT_BUF_INIT; git_oid expected_oid, oid; - int howmany = 7; + int howmany = 6; cl_git_pass(git_oid_fromstr(&expected_oid, "321cbdf08803c744082332332838df6bd160f8f9")); @@ -101,7 +101,7 @@ static void write_attributes(git_repository *repo) static void assert_named_chunked_blob(const char *expected_sha, const char *fake_name) { git_oid expected_oid, oid; - int howmany = 7; + int howmany = 6; cl_git_pass(git_oid_fromstr(&expected_oid, expected_sha)); diff --git a/tests/object/blob/fromstream.c b/tests/object/blob/fromstream.c index 10f2d8b31..fb6b0784c 100644 --- a/tests/object/blob/fromstream.c +++ b/tests/object/blob/fromstream.c @@ -50,7 +50,7 @@ void test_object_blob_fromstream__multiple_write(void) for (i = 0; i < howmany; i++) cl_git_pass(stream->write(stream, textual_content, 
strlen(textual_content))); - cl_git_pass(git_blob_create_fromstream_end(&id, stream)); + cl_git_pass(git_blob_create_fromstream_commit(&id, stream)); cl_assert_equal_oid(&expected_id, &id); cl_git_pass(git_object_lookup(&blob, repo, &expected_id, GIT_OBJ_BLOB)); @@ -88,7 +88,7 @@ static void assert_named_chunked_blob(const char *expected_sha, const char *fake for (i = 0; i < howmany; i++) cl_git_pass(stream->write(stream, textual_content, strlen(textual_content))); - cl_git_pass(git_blob_create_fromstream_end(&id, stream)); + cl_git_pass(git_blob_create_fromstream_commit(&id, stream)); cl_assert_equal_oid(&expected_id, &id); } From e2bb9ed3715ad10ccee7da2c9d09006bd8b8db7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Wed, 4 Nov 2015 10:39:55 -0800 Subject: [PATCH 4/5] CHANGELOG: add a note about _fromstream() and _fromstream_commit() --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60e58403f..35e926482 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ v0.24 + 1 * `git_commit_create_buffer()` creates a commit and writes it into a user-provided buffer instead of writing it into the object db. +* `git_blob_create_fromstream()` and + `git_blob_create_fromstream_commit()` allow you to create a blob by + writing into a stream. Useful when you do not know the final size or + want to copy the contents from another stream. + ### API removals ### Breaking API changes From 6669e3e83900f76721603ed8a7ad9f7435042674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Sun, 8 Nov 2015 04:28:08 +0100 Subject: [PATCH 5/5] blob: remove _fromchunks() The callback mechanism makes it awkward to write data from an IO source; move to `_fromstream()` which lets the caller remain in control, in the same vein as we prefer iterators over foreach callbacks. 
--- CHANGELOG.md | 4 + src/blob.c | 60 ------------- tests/object/blob/fromchunks.c | 156 --------------------------------- 3 files changed, 4 insertions(+), 216 deletions(-) delete mode 100644 tests/object/blob/fromchunks.c diff --git a/CHANGELOG.md b/CHANGELOG.md index 35e926482..924cfa187 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ v0.24 + 1 ### API removals +* `git_blob_create_fromchunks()` has been removed in favour of + `git_blob_create_fromstream()`. + + ### Breaking API changes v0.24 diff --git a/src/blob.c b/src/blob.c index a1ef2479e..1926c9e58 100644 --- a/src/blob.c +++ b/src/blob.c @@ -274,66 +274,6 @@ int git_blob_create_fromdisk( return error; } -#define BUFFER_SIZE 4096 - -int git_blob_create_fromchunks( - git_oid *id, - git_repository *repo, - const char *hintpath, - int (*source_cb)(char *content, size_t max_length, void *payload), - void *payload) -{ - int error; - char *content = NULL; - git_filebuf file = GIT_FILEBUF_INIT; - git_buf path = GIT_BUF_INIT; - - assert(id && repo && source_cb); - - if ((error = git_buf_joinpath( - &path, git_repository_path(repo), GIT_OBJECTS_DIR "streamed")) < 0) - goto cleanup; - - content = git__malloc(BUFFER_SIZE); - GITERR_CHECK_ALLOC(content); - - if ((error = git_filebuf_open( - &file, git_buf_cstr(&path), GIT_FILEBUF_TEMPORARY, 0666)) < 0) - goto cleanup; - - while (1) { - int read_bytes = source_cb(content, BUFFER_SIZE, payload); - - if (!read_bytes) - break; - - if (read_bytes > BUFFER_SIZE) { - giterr_set(GITERR_OBJECT, "Invalid chunk size while creating blob"); - error = GIT_EBUFS; - } else if (read_bytes < 0) { - error = giterr_set_after_callback(read_bytes); - } else { - error = git_filebuf_write(&file, content, read_bytes); - } - - if (error < 0) - goto cleanup; - } - - if ((error = git_filebuf_flush(&file)) < 0) - goto cleanup; - - error = git_blob__create_from_paths( - id, NULL, repo, file.path_lock, hintpath, 0, hintpath != NULL); - -cleanup: - git_buf_free(&path); - 
git_filebuf_cleanup(&file); - git__free(content); - - return error; -} - typedef struct { git_writestream parent; git_filebuf fbuf; diff --git a/tests/object/blob/fromchunks.c b/tests/object/blob/fromchunks.c deleted file mode 100644 index 5a7d8f722..000000000 --- a/tests/object/blob/fromchunks.c +++ /dev/null @@ -1,156 +0,0 @@ -#include "clar_libgit2.h" -#include "buffer.h" -#include "posix.h" -#include "path.h" -#include "fileops.h" - -static git_repository *repo; -static char textual_content[] = "libgit2\n\r\n\0"; - -void test_object_blob_fromchunks__initialize(void) -{ - repo = cl_git_sandbox_init("testrepo.git"); -} - -void test_object_blob_fromchunks__cleanup(void) -{ - cl_git_sandbox_cleanup(); -} - -static int text_chunked_source_cb(char *content, size_t max_length, void *payload) -{ - int *count; - - GIT_UNUSED(max_length); - - count = (int *)payload; - (*count)--; - - if (*count < 0) - return 0; - - strcpy(content, textual_content); - return (int)strlen(textual_content); -} - -void test_object_blob_fromchunks__can_create_a_blob_from_a_in_memory_chunk_provider(void) -{ - git_oid expected_oid, oid; - git_object *blob; - int howmany = 6; - - cl_git_pass(git_oid_fromstr(&expected_oid, "321cbdf08803c744082332332838df6bd160f8f9")); - - cl_git_fail_with( - git_object_lookup(&blob, repo, &expected_oid, GIT_OBJ_ANY), - GIT_ENOTFOUND); - - cl_git_pass(git_blob_create_fromchunks(&oid, repo, NULL, text_chunked_source_cb, &howmany)); - - cl_git_pass(git_object_lookup(&blob, repo, &expected_oid, GIT_OBJ_ANY)); - cl_assert(git_oid_cmp(&expected_oid, git_object_id(blob)) == 0); - - git_object_free(blob); -} - -void test_object_blob_fromchunks__doesnot_overwrite_an_already_existing_object(void) -{ - git_buf path = GIT_BUF_INIT; - git_buf content = GIT_BUF_INIT; - git_oid expected_oid, oid; - int howmany = 6; - - cl_git_pass(git_oid_fromstr(&expected_oid, "321cbdf08803c744082332332838df6bd160f8f9")); - - cl_git_pass(git_blob_create_fromchunks(&oid, repo, NULL, 
text_chunked_source_cb, &howmany)); - - /* Let's replace the content of the blob file storage with something else... */ - cl_git_pass(git_buf_joinpath(&path, git_repository_path(repo), "objects/32/1cbdf08803c744082332332838df6bd160f8f9")); - cl_git_pass(p_unlink(git_buf_cstr(&path))); - cl_git_mkfile(git_buf_cstr(&path), "boom"); - - /* ...request a creation of the same blob... */ - howmany = 7; - cl_git_pass(git_blob_create_fromchunks(&oid, repo, NULL, text_chunked_source_cb, &howmany)); - - /* ...and ensure the content of the faked blob file hasn't been altered */ - cl_git_pass(git_futils_readbuffer(&content, git_buf_cstr(&path))); - cl_assert(!git__strcmp("boom", git_buf_cstr(&content))); - - git_buf_free(&path); - git_buf_free(&content); -} - -#define GITATTR "* text=auto\n" \ - "*.txt text\n" \ - "*.data binary\n" - -static void write_attributes(git_repository *repo) -{ - git_buf buf = GIT_BUF_INIT; - - cl_git_pass(git_buf_joinpath(&buf, git_repository_path(repo), "info")); - cl_git_pass(git_buf_joinpath(&buf, git_buf_cstr(&buf), "attributes")); - - cl_git_pass(git_futils_mkpath2file(git_buf_cstr(&buf), 0777)); - cl_git_rewritefile(git_buf_cstr(&buf), GITATTR); - - git_buf_free(&buf); -} - -static void assert_named_chunked_blob(const char *expected_sha, const char *fake_name) -{ - git_oid expected_oid, oid; - int howmany = 6; - - cl_git_pass(git_oid_fromstr(&expected_oid, expected_sha)); - - cl_git_pass(git_blob_create_fromchunks(&oid, repo, fake_name, text_chunked_source_cb, &howmany)); - cl_assert(git_oid_cmp(&expected_oid, &oid) == 0); -} - -void test_object_blob_fromchunks__creating_a_blob_from_chunks_honors_the_attributes_directives(void) -{ - write_attributes(repo); - - assert_named_chunked_blob("321cbdf08803c744082332332838df6bd160f8f9", "dummy.data"); - assert_named_chunked_blob("e9671e138a780833cb689753570fd10a55be84fb", "dummy.txt"); - assert_named_chunked_blob("e9671e138a780833cb689753570fd10a55be84fb", "dummy.dunno"); -} - -static int 
failing_chunked_source_cb( - char *content, size_t max_length, void *payload) -{ - int *count = (int *)payload; - - GIT_UNUSED(max_length); - - (*count)--; - if (*count == 0) - return -1234; - - strcpy(content, textual_content); - return (int)strlen(textual_content); -} - -void test_object_blob_fromchunks__can_stop_with_error(void) -{ - git_oid expected_oid, oid; - git_object *blob; - int howmany = 7; - - cl_git_pass(git_oid_fromstr( - &expected_oid, "321cbdf08803c744082332332838df6bd160f8f9")); - - cl_git_fail_with( - git_object_lookup(&blob, repo, &expected_oid, GIT_OBJ_ANY), - GIT_ENOTFOUND); - - cl_git_fail_with(git_blob_create_fromchunks( - &oid, repo, NULL, failing_chunked_source_cb, &howmany), -1234); - - cl_git_fail_with( - git_object_lookup(&blob, repo, &expected_oid, GIT_OBJ_ANY), - GIT_ENOTFOUND); -} -