From 15711565f66de53c22c3a9faee04fc2092409ce4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 27 Mar 2013 17:36:27 +0100 Subject: [PATCH 1/6] qemu-file: drop socket_put_buffer It is enough to implement one of socket_writev_buffer and socket_put_buffer. Reviewed-by: Juan Quintela Reviewed-by: Orit Wassermann Signed-off-by: Paolo Bonzini --- savevm.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/savevm.c b/savevm.c index 53515cb794..ffabbff12c 100644 --- a/savevm.c +++ b/savevm.c @@ -219,18 +219,6 @@ static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) return len; } -static int socket_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - - len = qemu_send_full(s->fd, buf, size, 0); - if (len < size) { - len = -socket_error(); - } - return len; -} - static int socket_close(void *opaque) { QEMUFileSocket *s = opaque; @@ -404,7 +392,6 @@ static const QEMUFileOps socket_read_ops = { static const QEMUFileOps socket_write_ops = { .get_fd = socket_get_fd, - .put_buffer = socket_put_buffer, .writev_buffer = socket_writev_buffer, .close = socket_close }; From cb6247a7e3e07ead908d2e7fbc8848cc2e135056 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 27 Mar 2013 17:36:28 +0100 Subject: [PATCH 2/6] iov: reorganize iov_send_recv, part 1 Once the initial part of the iov is dropped, it is not used anymore. Modify iov/iovcnt directly instead of adjusting them with the "si" variable. Reviewed-by: Juan Quintela Reviewed-by: Orit Wassermann Signed-off-by: Paolo Bonzini --- util/iov.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/util/iov.c b/util/iov.c index 9dae318197..adb9a70fcd 100644 --- a/util/iov.c +++ b/util/iov.c @@ -159,16 +159,22 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, for (si = 0; si < iov_cnt && offset >= iov[si].iov_len; ++si) { offset -= iov[si].iov_len; } + + /* si == iov_cnt would only be valid if bytes == 0, which + * we already ruled out above. */ + assert(si < iov_cnt); + iov += si; + iov_cnt -= si; + if (offset) { - assert(si < iov_cnt); /* second, skip `offset' bytes from the (now) first element, * undo it on exit */ - iov[si].iov_base += offset; - iov[si].iov_len -= offset; + iov[0].iov_base += offset; + iov[0].iov_len -= offset; } /* Find the end position skipping `bytes' bytes: */ /* first, skip all full-sized elements */ - for (ei = si; ei < iov_cnt && iov[ei].iov_len <= bytes; ++ei) { + for (ei = 0; ei < iov_cnt && iov[ei].iov_len <= bytes; ++ei) { bytes -= iov[ei].iov_len; } if (bytes) { @@ -183,12 +189,12 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, ++ei; } - ret = do_send_recv(sockfd, iov + si, ei - si, do_send); + ret = do_send_recv(sockfd, iov, ei, do_send); /* Undo the changes above */ if (offset) { - iov[si].iov_base -= offset; - iov[si].iov_len += offset; + iov[0].iov_base -= offset; + iov[0].iov_len += offset; } if (bytes) { iov[ei-1].iov_len += bytes; From 5209d6753c90a3d6411abd0729a9cca3775dce3f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 27 Mar 2013 17:36:29 +0100 Subject: [PATCH 3/6] iov: reorganize iov_send_recv, part 2 Do not touch the "bytes" argument anymore. Instead, remember the original length of the last iovec if we touch it, and restore it afterwards. This requires undoing the changes in opposite order. The previous algorithm didn't care. Reviewed-by: Juan Quintela Reviewed-by: Orit Wassermann Signed-off-by: Paolo Bonzini --- util/iov.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/util/iov.c b/util/iov.c index adb9a70fcd..110d18eba6 100644 --- a/util/iov.c +++ b/util/iov.c @@ -145,7 +145,9 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, bool do_send) { ssize_t ret; + size_t orig_len, tail; unsigned si, ei; /* start and end indexes */ + if (bytes == 0) { /* Catch the do-nothing case early, as otherwise we will pass an * empty iovec to sendmsg/recvmsg(), and not all implementations @@ -174,31 +176,29 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, } /* Find the end position skipping `bytes' bytes: */ /* first, skip all full-sized elements */ - for (ei = 0; ei < iov_cnt && iov[ei].iov_len <= bytes; ++ei) { - bytes -= iov[ei].iov_len; + tail = bytes; + for (ei = 0; ei < iov_cnt && iov[ei].iov_len <= tail; ++ei) { + tail -= iov[ei].iov_len; } - if (bytes) { - /* second, fixup the last element, and remember - * the length we've cut from the end of it in `bytes' */ - size_t tail; + if (tail) { + /* second, fixup the last element, and remember the original + * length */ assert(ei < iov_cnt); - assert(iov[ei].iov_len > bytes); - tail = iov[ei].iov_len - bytes; - iov[ei].iov_len = bytes; - bytes = tail; /* bytes is now equal to the tail size */ - ++ei; + assert(iov[ei].iov_len > tail); + orig_len = iov[ei].iov_len; + iov[ei++].iov_len = tail; } ret = do_send_recv(sockfd, iov, ei, do_send); /* Undo the changes above */ + if (tail) { + iov[ei-1].iov_len = orig_len; + } if (offset) { iov[0].iov_base -= offset; iov[0].iov_len += offset; } - if (bytes) { - iov[ei-1].iov_len += bytes; - } return ret; } From f48869ad2825b640911666bb091cedb1e1d6ad5e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 27 Mar 2013 17:36:30 +0100 Subject: [PATCH 4/6] iov: reorganize iov_send_recv, part 3 "si" and "ei" are merged in a single variable. Reviewed-by: Juan Quintela Reviewed-by: Orit Wassermann Signed-off-by: Paolo Bonzini --- util/iov.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/util/iov.c b/util/iov.c index 110d18eba6..f14ff0b475 100644 --- a/util/iov.c +++ b/util/iov.c @@ -146,7 +146,7 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, { ssize_t ret; size_t orig_len, tail; - unsigned si, ei; /* start and end indexes */ + unsigned niov; if (bytes == 0) { /* Catch the do-nothing case early, as otherwise we will pass an @@ -158,15 +158,15 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, /* Find the start position, skipping `offset' bytes: * first, skip all full-sized vector elements, */ - for (si = 0; si < iov_cnt && offset >= iov[si].iov_len; ++si) { - offset -= iov[si].iov_len; + for (niov = 0; niov < iov_cnt && offset >= iov[niov].iov_len; ++niov) { + offset -= iov[niov].iov_len; } - /* si == iov_cnt would only be valid if bytes == 0, which + /* niov == iov_cnt would only be valid if bytes == 0, which * we already ruled out above. */ - assert(si < iov_cnt); - iov += si; - iov_cnt -= si; + assert(niov < iov_cnt); + iov += niov; + iov_cnt -= niov; if (offset) { /* second, skip `offset' bytes from the (now) first element, @@ -177,23 +177,23 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, /* Find the end position skipping `bytes' bytes: */ /* first, skip all full-sized elements */ tail = bytes; - for (ei = 0; ei < iov_cnt && iov[ei].iov_len <= tail; ++ei) { - tail -= iov[ei].iov_len; + for (niov = 0; niov < iov_cnt && iov[niov].iov_len <= tail; ++niov) { + tail -= iov[niov].iov_len; } if (tail) { /* second, fixup the last element, and remember the original * length */ - assert(ei < iov_cnt); - assert(iov[ei].iov_len > tail); - orig_len = iov[ei].iov_len; - iov[ei++].iov_len = tail; + assert(niov < iov_cnt); + assert(iov[niov].iov_len > tail); + orig_len = iov[niov].iov_len; + iov[niov++].iov_len = tail; } - ret = do_send_recv(sockfd, iov, ei, do_send); + ret = do_send_recv(sockfd, iov, niov, do_send); /* Undo the changes above */ if (tail) { - iov[ei-1].iov_len = orig_len; + iov[niov-1].iov_len = orig_len; } if (offset) { iov[0].iov_base -= offset; From 83f75c26e8e791311900d0e2a4cc9379abedb85c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 27 Mar 2013 17:36:31 +0100 Subject: [PATCH 5/6] iov: handle partial writes from sendmsg and recvmsg Partial writes can still happen in sendmsg and recvmsg, if a signal is received in the middle of a write. To handle this, retry the operation with a new offset/bytes pair. Reviewed-by: Juan Quintela Reviewed-by: Orit Wassermann Signed-off-by: Paolo Bonzini --- util/iov.c | 106 ++++++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 49 deletions(-) diff --git a/util/iov.c b/util/iov.c index f14ff0b475..d32226d644 100644 --- a/util/iov.c +++ b/util/iov.c @@ -144,63 +144,71 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, size_t offset, size_t bytes, bool do_send) { + ssize_t total = 0; ssize_t ret; size_t orig_len, tail; unsigned niov; - if (bytes == 0) { - /* Catch the do-nothing case early, as otherwise we will pass an - * empty iovec to sendmsg/recvmsg(), and not all implementations - * accept this. - */ - return 0; - } + while (bytes > 0) { + /* Find the start position, skipping `offset' bytes: + * first, skip all full-sized vector elements, */ + for (niov = 0; niov < iov_cnt && offset >= iov[niov].iov_len; ++niov) { + offset -= iov[niov].iov_len; + } - /* Find the start position, skipping `offset' bytes: - * first, skip all full-sized vector elements, */ - for (niov = 0; niov < iov_cnt && offset >= iov[niov].iov_len; ++niov) { - offset -= iov[niov].iov_len; - } - - /* niov == iov_cnt would only be valid if bytes == 0, which - * we already ruled out above. */ - assert(niov < iov_cnt); - iov += niov; - iov_cnt -= niov; - - if (offset) { - /* second, skip `offset' bytes from the (now) first element, - * undo it on exit */ - iov[0].iov_base += offset; - iov[0].iov_len -= offset; - } - /* Find the end position skipping `bytes' bytes: */ - /* first, skip all full-sized elements */ - tail = bytes; - for (niov = 0; niov < iov_cnt && iov[niov].iov_len <= tail; ++niov) { - tail -= iov[niov].iov_len; - } - if (tail) { - /* second, fixup the last element, and remember the original - * length */ + /* niov == iov_cnt would only be valid if bytes == 0, which + * we already ruled out in the loop condition. */ assert(niov < iov_cnt); - assert(iov[niov].iov_len > tail); - orig_len = iov[niov].iov_len; - iov[niov++].iov_len = tail; + iov += niov; + iov_cnt -= niov; + + if (offset) { + /* second, skip `offset' bytes from the (now) first element, + * undo it on exit */ + iov[0].iov_base += offset; + iov[0].iov_len -= offset; + } + /* Find the end position skipping `bytes' bytes: */ + /* first, skip all full-sized elements */ + tail = bytes; + for (niov = 0; niov < iov_cnt && iov[niov].iov_len <= tail; ++niov) { + tail -= iov[niov].iov_len; + } + if (tail) { + /* second, fixup the last element, and remember the original + * length */ + assert(niov < iov_cnt); + assert(iov[niov].iov_len > tail); + orig_len = iov[niov].iov_len; + iov[niov++].iov_len = tail; + } + + ret = do_send_recv(sockfd, iov, niov, do_send); + + /* Undo the changes above before checking for errors */ + if (tail) { + iov[niov-1].iov_len = orig_len; + } + if (offset) { + iov[0].iov_base -= offset; + iov[0].iov_len += offset; + } + + if (ret < 0) { + assert(errno != EINTR); + if (errno == EAGAIN && total > 0) { + return total; + } + return -1; + } + + /* Prepare for the next iteration */ + offset += ret; + total += ret; + bytes -= ret; } - ret = do_send_recv(sockfd, iov, niov, do_send); - - /* Undo the changes above */ - if (tail) { - iov[niov-1].iov_len = orig_len; - } - if (offset) { - iov[0].iov_base -= offset; - iov[0].iov_len += offset; - } - - return ret; + return total; } From e9d8fbf53a33983c81d67d18e1baa914eb16cdea Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 27 Mar 2013 17:36:32 +0100 Subject: [PATCH 6/6] qemu-file: do not use stdio for qemu_fdopen This uses system calls directly for Unix file descriptors, so that the efficient writev_buffer can be used. Pay attention to the possibility of partial writes in writev. Reviewed-by: Juan Quintela Reviewed-by: Orit Wassermann Signed-off-by: Paolo Bonzini --- savevm.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 11 deletions(-) diff --git a/savevm.c b/savevm.c index ffabbff12c..31dcce975e 100644 --- a/savevm.c +++ b/savevm.c @@ -356,9 +356,94 @@ static const QEMUFileOps stdio_file_write_ops = { .close = stdio_fclose }; +static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, + int64_t pos) +{ + QEMUFileSocket *s = opaque; + ssize_t len, offset; + ssize_t size = iov_size(iov, iovcnt); + ssize_t total = 0; + + assert(iovcnt > 0); + offset = 0; + while (size > 0) { + /* Find the next start position; skip all full-sized vector elements */ + while (offset >= iov[0].iov_len) { + offset -= iov[0].iov_len; + iov++, iovcnt--; + } + + /* skip `offset' bytes from the (now) first element, undo it on exit */ + assert(iovcnt > 0); + iov[0].iov_base += offset; + iov[0].iov_len -= offset; + + do { + len = writev(s->fd, iov, iovcnt); + } while (len == -1 && errno == EINTR); + if (len == -1) { + return -errno; + } + + /* Undo the changes above */ + iov[0].iov_base -= offset; + iov[0].iov_len += offset; + + /* Prepare for the next iteration */ + offset += len; + total += len; + size -= len; + } + + return total; +} + +static int unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ + QEMUFileSocket *s = opaque; + ssize_t len; + + for (;;) { + len = read(s->fd, buf, size); + if (len != -1) { + break; + } + if (errno == EAGAIN) { + yield_until_fd_readable(s->fd); + } else if (errno != EINTR) { + break; + } + } + + if (len == -1) { + len = -errno; + } + return len; +} + +static int unix_close(void *opaque) +{ + QEMUFileSocket *s = opaque; + close(s->fd); + g_free(s); + return 0; +} + +static const QEMUFileOps unix_read_ops = { + .get_fd = socket_get_fd, + .get_buffer = unix_get_buffer, + .close = unix_close +}; + +static const QEMUFileOps unix_write_ops = { + .get_fd = socket_get_fd, + .writev_buffer = unix_writev_buffer, + .close = unix_close +}; + QEMUFile *qemu_fdopen(int fd, const char *mode) { - QEMUFileStdio *s; + QEMUFileSocket *s; if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || @@ -367,21 +452,15 @@ QEMUFile *qemu_fdopen(int fd, const char *mode) return NULL; } - s = g_malloc0(sizeof(QEMUFileStdio)); - s->stdio_file = fdopen(fd, mode); - if (!s->stdio_file) - goto fail; + s = g_malloc0(sizeof(QEMUFileSocket)); + s->fd = fd; if(mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &stdio_file_read_ops); + s->file = qemu_fopen_ops(s, &unix_read_ops); } else { - s->file = qemu_fopen_ops(s, &stdio_file_write_ops); + s->file = qemu_fopen_ops(s, &unix_write_ops); } return s->file; - -fail: - g_free(s); - return NULL; } static const QEMUFileOps socket_read_ops = {