From 282c6a2f292705f823554447ca0b7731b6f81a97 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Mon, 14 Jan 2013 18:26:53 +0000
Subject: [PATCH 1/3] xen_disk: fix memory leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On ioreq_release the full ioreq was memset to 0, losing all the data
and memory allocations inside the QEMUIOVector, which leads to a
memory leak. Create a new function to specifically reset ioreq.

Reported-by: Maik Wessler
Signed-off-by: Roger Pau Monné
Signed-off-by: Stefano Stabellini
---
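Note on the leak, with an illustrative sketch (field names follow the
QEMUIOVector of this period; this is a simplified rendering, not the
exact QEMU source):

    #include <sys/uio.h>    /* struct iovec */
    #include <stddef.h>

    /* QEMUIOVector owns a heap-allocated iovec array: */
    typedef struct QEMUIOVector {
        struct iovec *iov;   /* g_malloc'd array, grown on demand */
        int niov;            /* entries currently in use */
        int nalloc;          /* entries allocated */
        size_t size;         /* total bytes described by iov */
    } QEMUIOVector;

    /* memset(ioreq, 0, sizeof(*ioreq)) also wiped ioreq->v.iov, so the
     * array allocated for the previous request could never be freed or
     * reused: one leaked allocation per recycled request.
     * qemu_iovec_reset() instead keeps the allocation and rewinds it: */
    void qemu_iovec_reset(QEMUIOVector *qiov)
    {
        qiov->niov = 0;
        qiov->size = 0;
    }

Hence ioreq_reset() below clears the remaining fields by hand and
calls qemu_iovec_reset() rather than zeroing the whole struct.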
 hw/xen_disk.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index a6a64a245..b7c797787 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -112,6 +112,31 @@ struct XenBlkDev {

 /* ------------------------------------------------------------- */

+static void ioreq_reset(struct ioreq *ioreq)
+{
+    memset(&ioreq->req, 0, sizeof(ioreq->req));
+    ioreq->status = 0;
+    ioreq->start = 0;
+    ioreq->presync = 0;
+    ioreq->postsync = 0;
+    ioreq->mapped = 0;
+
+    memset(ioreq->domids, 0, sizeof(ioreq->domids));
+    memset(ioreq->refs, 0, sizeof(ioreq->refs));
+    ioreq->prot = 0;
+    memset(ioreq->page, 0, sizeof(ioreq->page));
+    ioreq->pages = NULL;
+
+    ioreq->aio_inflight = 0;
+    ioreq->aio_errors = 0;
+
+    ioreq->blkdev = NULL;
+    memset(&ioreq->list, 0, sizeof(ioreq->list));
+    memset(&ioreq->acct, 0, sizeof(ioreq->acct));
+
+    qemu_iovec_reset(&ioreq->v);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
     struct ioreq *ioreq = NULL;
@@ -129,7 +154,6 @@ static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
         /* get one from freelist */
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);
-        qemu_iovec_reset(&ioreq->v);
     }
     QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
     blkdev->requests_inflight++;
@@ -153,7 +177,7 @@ static void ioreq_release(struct ioreq *ioreq, bool finish)
     struct XenBlkDev *blkdev = ioreq->blkdev;

     QLIST_REMOVE(ioreq, list);
-    memset(ioreq, 0, sizeof(*ioreq));
+    ioreq_reset(ioreq);
     ioreq->blkdev = blkdev;
     QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
     if (finish) {

From 9e496d7458bb01b717afe22db10a724db57d53fd Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Mon, 14 Jan 2013 18:28:19 +0000
Subject: [PATCH 2/3] xen_disk: add persistent grant support to xen_disk
 backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This protocol extension reuses the same set of grant pages for all
transactions between the front/back drivers, avoiding expensive TLB
flushes, grant table lock contention and switches between userspace
and kernel space. The full description of the protocol can be found
in the public blkif.h header:

http://xenbits.xen.org/gitweb/?p=xen.git;a=blob_plain;f=xen/include/public/io/blkif.h

Speed improvement with 15 guests performing I/O is ~450%.

Signed-off-by: Roger Pau Monné
Signed-off-by: Stefano Stabellini
---
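Sizing note: with max_requests = 32 and BLKIF_MAX_SEGMENTS_PER_REQUEST
= 11, max_grants caps the cache at 352 grants, i.e. at most 352
persistently mapped 4 KiB pages (~1.4 MiB) per backend. The snippet
below is a self-contained sketch of the GTree pattern the patch relies
on (grant ref -> PersistentGrant); ref_cmp and destroy_grant_cb are
hypothetical stand-ins for the patch's int_cmp and destroy_grant:

    #include <glib.h>

    /* Keys are grant references packed into pointers, so compare them
     * as unsigned integers. (ua > ub) - (ua < ub) yields -1/0/1 and
     * avoids the overflow a plain 'return ua - ub' could hit. */
    static gint ref_cmp(gconstpointer a, gconstpointer b, gpointer data)
    {
        guint ua = GPOINTER_TO_UINT(a);
        guint ub = GPOINTER_TO_UINT(b);
        return (ua > ub) - (ua < ub);
    }

    static void destroy_grant_cb(gpointer value)
    {
        /* the real destroy_grant() also munmaps the grant page here */
        g_free(value);
    }

    int main(void)
    {
        GTree *gnts = g_tree_new_full(ref_cmp, NULL, NULL, destroy_grant_cb);
        guint ref = 42;                               /* example grant ref */

        g_tree_insert(gnts, GUINT_TO_POINTER(ref), g_malloc0(32));
        if (g_tree_lookup(gnts, GUINT_TO_POINTER(ref))) {
            /* hit: reuse the mapping, no grant-map hypercall needed */
        }
        g_tree_destroy(gnts);   /* destroy notifier runs for every entry */
        return 0;
    }

This is also why blk_free() only needs g_tree_destroy(): every cached
grant is unmapped through the destroy notifier, with no extra
bookkeeping in the backend.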
 hw/xen_disk.c | 171 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 154 insertions(+), 17 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index b7c797787..9ee72a0de 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -51,6 +51,13 @@ static int max_requests = 32;
 #define BLOCK_SIZE  512
 #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)

+struct PersistentGrant {
+    void *page;
+    struct XenBlkDev *blkdev;
+};
+
+typedef struct PersistentGrant PersistentGrant;
+
 struct ioreq {
     blkif_request_t     req;
     int16_t             status;
@@ -68,6 +75,7 @@ struct ioreq {
     int                 prot;
     void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     void                *pages;
+    int                 num_unmap;

     /* aio status */
     int                 aio_inflight;
@@ -104,6 +112,12 @@ struct XenBlkDev {
     int                 requests_inflight;
     int                 requests_finished;

+    /* Persistent grants extension */
+    gboolean            feature_persistent;
+    GTree               *persistent_gnts;
+    unsigned int        persistent_gnt_count;
+    unsigned int        max_grants;
+
    /* qemu block driver */
     DriveInfo           *dinfo;
     BlockDriverState    *bs;
@@ -137,6 +151,29 @@ static void ioreq_reset(struct ioreq *ioreq)
     qemu_iovec_reset(&ioreq->v);
 }

+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    uint ua = GPOINTER_TO_UINT(a);
+    uint ub = GPOINTER_TO_UINT(b);
+    return (ua > ub) - (ua < ub);
+}
+
+static void destroy_grant(gpointer pgnt)
+{
+    PersistentGrant *grant = pgnt;
+    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
+
+    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
+        xen_be_printf(&grant->blkdev->xendev, 0,
+                      "xc_gnttab_munmap failed: %s\n",
+                      strerror(errno));
+    }
+    grant->blkdev->persistent_gnt_count--;
+    xen_be_printf(&grant->blkdev->xendev, 3,
+                  "unmapped grant %p\n", grant->page);
+    g_free(grant);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
     struct ioreq *ioreq = NULL;
@@ -265,21 +302,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
     int i;

-    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
+    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
         return;
     }
     if (batch_maps) {
         if (!ioreq->pages) {
             return;
         }
-        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
+        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
             xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                           strerror(errno));
         }
-        ioreq->blkdev->cnt_map -= ioreq->v.niov;
+        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
         ioreq->pages = NULL;
     } else {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        for (i = 0; i < ioreq->num_unmap; i++) {
             if (!ioreq->page[i]) {
                 continue;
             }
@@ -297,41 +334,120 @@ static void ioreq_unmap(struct ioreq *ioreq)
 static int ioreq_map(struct ioreq *ioreq)
 {
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
-    int i;
+    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int i, j, new_maps = 0;
+    PersistentGrant *grant;
+    /* domids and refs variables will contain the information necessary
+     * to map the grants that are needed to fulfill this request.
+     *
+     * After mapping the needed grants, the page array will contain the
+     * memory address of each granted page in the order specified in ioreq
+     * (disregarding if it's a persistent grant or not).
+     */
     if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
         return 0;
     }
-    if (batch_maps) {
+    if (ioreq->blkdev->feature_persistent) {
+        for (i = 0; i < ioreq->v.niov; i++) {
+            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
+                                  GUINT_TO_POINTER(ioreq->refs[i]));
+
+            if (grant != NULL) {
+                page[i] = grant->page;
+                xen_be_printf(&ioreq->blkdev->xendev, 3,
+                              "using persistent-grant %" PRIu32 "\n",
+                              ioreq->refs[i]);
+            } else {
+                /* Add the grant to the list of grants that
+                 * should be mapped
+                 */
+                domids[new_maps] = ioreq->domids[i];
+                refs[new_maps] = ioreq->refs[i];
+                page[i] = NULL;
+                new_maps++;
+            }
+        }
+        /* Set the protection to RW, since grants may be reused later
+         * with a different protection than the one needed for this request
+         */
+        ioreq->prot = PROT_WRITE | PROT_READ;
+    } else {
+        /* All grants in the request should be mapped */
+        memcpy(refs, ioreq->refs, sizeof(refs));
+        memcpy(domids, ioreq->domids, sizeof(domids));
+        memset(page, 0, sizeof(page));
+        new_maps = ioreq->v.niov;
+    }
+
+    if (batch_maps && new_maps) {
         ioreq->pages = xc_gnttab_map_grant_refs
-            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
+            (gnt, new_maps, domids, refs, ioreq->prot);
         if (ioreq->pages == NULL) {
             xen_be_printf(&ioreq->blkdev->xendev, 0,
                           "can't map %d grant refs (%s, %d maps)\n",
-                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
+                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
             return -1;
         }
-        for (i = 0; i < ioreq->v.niov; i++) {
-            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
-                (uintptr_t)ioreq->v.iov[i].iov_base;
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL) {
+                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
+            }
         }
-        ioreq->blkdev->cnt_map += ioreq->v.niov;
-    } else {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->blkdev->cnt_map += new_maps;
+    } else if (new_maps) {
+        for (i = 0; i < new_maps; i++) {
             ioreq->page[i] = xc_gnttab_map_grant_ref
-                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
+                (gnt, domids[i], refs[i], ioreq->prot);
             if (ioreq->page[i] == NULL) {
                 xen_be_printf(&ioreq->blkdev->xendev, 0,
                               "can't map grant ref %d (%s, %d maps)\n",
-                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
+                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                 ioreq_unmap(ioreq);
                 return -1;
             }
-            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
             ioreq->blkdev->cnt_map++;
         }
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL) {
+                page[i] = ioreq->page[j++];
+            }
+        }
+    }
+    if (ioreq->blkdev->feature_persistent) {
+        while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
+              && new_maps) {
+            /* Go through the list of newly mapped grants and add as many
+             * as possible to the list of persistently mapped grants.
+             *
+             * Since we start at the end of ioreq->page(s), we only need
+             * to decrease new_maps to prevent these granted pages from
+             * being unmapped in ioreq_unmap.
+             */
+            grant = g_malloc0(sizeof(*grant));
+            new_maps--;
+            if (batch_maps) {
+                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
+            } else {
+                grant->page = ioreq->page[new_maps];
+            }
+            grant->blkdev = ioreq->blkdev;
+            xen_be_printf(&ioreq->blkdev->xendev, 3,
+                          "adding grant %" PRIu32 " page: %p\n",
+                          refs[new_maps], grant->page);
+            g_tree_insert(ioreq->blkdev->persistent_gnts,
+                          GUINT_TO_POINTER(refs[new_maps]),
+                          grant);
+            ioreq->blkdev->persistent_gnt_count++;
+        }
+    }
+    for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
+    }
     ioreq->mapped = 1;
+    ioreq->num_unmap = new_maps;
     return 0;
 }

@@ -679,6 +795,7 @@ static int blk_init(struct XenDevice *xendev)

     /* fill info */
     xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
+    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
     xenstore_write_be_int(&blkdev->xendev, "info", info);
     xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
     xenstore_write_be_int(&blkdev->xendev, "sectors",
@@ -702,6 +819,7 @@ out_error:
 static int blk_connect(struct XenDevice *xendev)
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+    int pers;

     if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
         return -1;
@@ -710,6 +828,11 @@ static int blk_connect(struct XenDevice *xendev)
                              &blkdev->xendev.remote_port) == -1) {
         return -1;
     }
+    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
+        blkdev->feature_persistent = FALSE;
+    } else {
+        blkdev->feature_persistent = !!pers;
+    }

     blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
     if (blkdev->xendev.protocol) {
@@ -753,6 +876,15 @@ static int blk_connect(struct XenDevice *xendev)
         }
     }

+    if (blkdev->feature_persistent) {
+        /* Init persistent grants */
+        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
+                                                  NULL, NULL,
+                                                  (GDestroyNotify)destroy_grant);
+        blkdev->persistent_gnt_count = 0;
+    }
+
     xen_be_bind_evtchn(&blkdev->xendev);
     xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
                   "remote port %d, local port %d\n",
@@ -793,6 +925,11 @@ static int blk_free(struct XenDevice *xendev)
         blk_disconnect(xendev);
     }

+    /* Free persistent grants */
+    if (blkdev->feature_persistent) {
+        g_tree_destroy(blkdev->persistent_gnts);
+    }
+
     while (!QLIST_EMPTY(&blkdev->freelist)) {
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);

From 7e7b7cba16faa7b721b822fa9ed8bebafa35700f Mon Sep 17 00:00:00 2001
From: Stefano Stabellini
Date: Mon, 14 Jan 2013 18:30:30 +0000
Subject: [PATCH 3/3] xen_disk: implement BLKIF_OP_FLUSH_DISKCACHE, remove
 BLKIF_OP_WRITE_BARRIER

Signed-off-by: Stefano Stabellini
---
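Note: BLKIF_OP_WRITE_BARRIER demanded ordering around the write, which
is why the old code set both presync and postsync for a barrier that
carried data. BLKIF_OP_FLUSH_DISKCACHE only asks that the disk cache
be drained before the request is serviced, so a single presync is
enough. For context, presync is consumed in ioreq_runio_qemu_aio()
roughly as sketched below (a simplified sketch; that part of the
function is not in these hunks):

    /* before issuing the request's own I/O */
    if (ioreq->presync) {
        bdrv_flush(blkdev->bs);   /* drain the image's write cache */
    }

The matching xenstore advertisement switches from "feature-barrier"
to "feature-flush-cache" in blk_init() below.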
 hw/xen_disk.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index 9ee72a0de..7fea87156 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -243,12 +243,11 @@ static int ioreq_parse(struct ioreq *ioreq)
     case BLKIF_OP_READ:
         ioreq->prot = PROT_WRITE; /* to memory */
         break;
-    case BLKIF_OP_WRITE_BARRIER:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        ioreq->presync = 1;
         if (!ioreq->req.nr_segments) {
-            ioreq->presync = 1;
             return 0;
         }
-        ioreq->presync = ioreq->postsync = 1;
         /* fall through */
     case BLKIF_OP_WRITE:
         ioreq->prot = PROT_READ; /* from memory */
@@ -509,7 +508,7 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
                        qemu_aio_complete, ioreq);
         break;
     case BLKIF_OP_WRITE:
-    case BLKIF_OP_WRITE_BARRIER:
+    case BLKIF_OP_FLUSH_DISKCACHE:
         if (!ioreq->req.nr_segments) {
             break;
         }
@@ -794,7 +793,7 @@ static int blk_init(struct XenDevice *xendev)
                   blkdev->file_size, blkdev->file_size >> 20);

     /* fill info */
-    xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
+    xenstore_write_be_int(&blkdev->xendev, "feature-flush-cache", 1);
     xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
     xenstore_write_be_int(&blkdev->xendev, "info", info);
     xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);