Merge remote-tracking branch 'kwolf/for-anthony' into staging

Anthony Liguori 2011-12-05 09:39:25 -06:00
commit eb5d5beaeb
40 changed files with 1333 additions and 347 deletions

block-migration.c

@@ -387,7 +387,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
 
     for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
         if (bmds_aio_inflight(bmds, sector)) {
-            qemu_aio_flush();
+            bdrv_drain_all();
         }
         if (bdrv_get_dirty(bmds->bs, sector)) {

block.c

@@ -30,6 +30,7 @@
 #include "qjson.h"
 #include "qemu-coroutine.h"
 #include "qmp-commands.h"
+#include "qemu-timer.h"
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
@@ -73,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                                bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -105,6 +113,79 @@ int is_windows_drive(const char *filename)
 }
 #endif
 
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+
+    while (qemu_co_queue_next(&bs->throttled_reqs));
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
+
+    bs->slice_start = 0;
+    bs->slice_end   = 0;
+    bs->slice_time  = 0;
+    memset(&bs->io_base, 0, sizeof(bs->io_base));
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+
+    qemu_co_queue_next(&bs->throttled_reqs);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+    qemu_co_queue_init(&bs->throttled_reqs);
+    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
+    bs->slice_start = qemu_get_clock_ns(vm_clock);
+    bs->slice_end   = bs->slice_start + bs->slice_time;
+    memset(&bs->io_base, 0, sizeof(bs->io_base));
+    bs->io_limits_enabled = true;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+    BlockIOLimit *io_limits = &bs->io_limits;
+    return io_limits->bps[BLOCK_IO_LIMIT_READ]
+         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
+         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
+         || io_limits->iops[BLOCK_IO_LIMIT_READ]
+         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
+         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+                                     bool is_write, int nb_sectors)
+{
+    int64_t wait_time = -1;
+
+    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+        qemu_co_queue_wait(&bs->throttled_reqs);
+    }
+
+    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
+     * throttled requests will not be dequeued until the current request is
+     * allowed to be serviced. So if the current request still exceeds the
+     * limits, it will be inserted at the head. All requests that follow it
+     * will still be in the throttled_reqs queue.
+     */
+    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+        qemu_mod_timer(bs->block_timer,
+                       wait_time + qemu_get_clock_ns(vm_clock));
+        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+    }
+
+    qemu_co_queue_next(&bs->throttled_reqs);
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
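bdrv_io_limits_intercept() above preserves FIFO ordering by re-inserting a still-throttled request at the head of the queue, so later requests cannot overtake it. A minimal standalone sketch of that queue discipline (plain C, no QEMU APIs; all names are illustrative):

    #include <stdio.h>

    #define QLEN 8

    static int q[QLEN];
    static int head, count;

    static void push_tail(int v) { q[(head + count++) % QLEN] = v; }
    static void push_head(int v) { head = (head + QLEN - 1) % QLEN; q[head] = v; count++; }
    static int  pop_head(void)   { int v = q[head]; head = (head + 1) % QLEN; count--; return v; }

    int main(void)
    {
        /* Requests 1..3 arrive while the device is throttled */
        push_tail(1); push_tail(2); push_tail(3);

        /* Request 1 is dequeued but still exceeds the limits, so it is
         * re-inserted at the head; 2 and 3 stay queued behind it. */
        int cur = pop_head();
        push_head(cur);

        while (count) {
            printf("%d ", pop_head());   /* prints: 1 2 3 (FIFO preserved) */
        }
        printf("\n");
        return 0;
    }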
@@ -457,6 +538,22 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
     return 0;
 }
 
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+    bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+    assert(bs->copy_on_read > 0);
+    bs->copy_on_read--;
+}
+
 /*
  * Common part for opening disk images and files
  */
@@ -478,6 +575,11 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename,
     bs->growable = 0;
     bs->buffer_alignment = 512;
 
+    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
+    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
+        bdrv_enable_copy_on_read(bs);
+    }
+
     pstrcpy(bs->filename, sizeof(bs->filename), filename);
     bs->backing_file[0] = '\0';
 
@@ -687,6 +789,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
         bdrv_dev_change_media_cb(bs, true);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -715,6 +822,7 @@ void bdrv_close(BlockDriverState *bs)
 #endif
         bs->opaque = NULL;
         bs->drv = NULL;
+        bs->copy_on_read = 0;
 
         if (bs->file != NULL) {
             bdrv_close(bs->file);
@@ -722,6 +830,11 @@ void bdrv_close(BlockDriverState *bs)
 
         bdrv_dev_change_media_cb(bs, false);
     }
+
+    /*throttling disk I/O limits*/
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_disable(bs);
+    }
 }
 
 void bdrv_close_all(void)
@@ -733,6 +846,25 @@ void bdrv_close_all(void)
     }
 }
 
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ */
+void bdrv_drain_all(void)
+{
+    BlockDriverState *bs;
+
+    qemu_aio_flush();
+
+    /* If requests are still pending there is a bug somewhere */
+    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+        assert(QLIST_EMPTY(&bs->tracked_requests));
+        assert(qemu_co_queue_empty(&bs->throttled_reqs));
+    }
+}
+
 /* make a BlockDriverState anonymous by removing from bdrv_state list.
    Also, NULL terminate the device_name to prevent double remove */
 void bdrv_make_anon(BlockDriverState *bs)
@@ -922,7 +1054,7 @@ int bdrv_commit(BlockDriverState *bs)
     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
 
     for (sector = 0; sector < total_sectors; sector += n) {
-        if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
+        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
 
             if (bdrv_read(bs, sector, buf, n) != 0) {
                 ret = -EIO;
@@ -980,6 +1112,118 @@ void bdrv_commit_all(void)
     }
 }
 
+struct BdrvTrackedRequest {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    bool is_write;
+    QLIST_ENTRY(BdrvTrackedRequest) list;
+    Coroutine *co; /* owner, used for deadlock detection */
+    CoQueue wait_queue; /* coroutines blocked on this request */
+};
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+    QLIST_REMOVE(req, list);
+    qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+                                  BlockDriverState *bs,
+                                  int64_t sector_num,
+                                  int nb_sectors, bool is_write)
+{
+    *req = (BdrvTrackedRequest){
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .is_write = is_write,
+        .co = qemu_coroutine_self(),
+    };
+
+    qemu_co_queue_init(&req->wait_queue);
+
+    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+static void round_to_clusters(BlockDriverState *bs,
+                              int64_t sector_num, int nb_sectors,
+                              int64_t *cluster_sector_num,
+                              int *cluster_nb_sectors)
+{
+    BlockDriverInfo bdi;
+
+    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+        *cluster_sector_num = sector_num;
+        *cluster_nb_sectors = nb_sectors;
+    } else {
+        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+                                            nb_sectors, c);
+    }
+}
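round_to_clusters() widens [sector_num, sector_num + nb_sectors) outward to cluster boundaries. A minimal standalone sketch of the same arithmetic, assuming 512-byte sectors and a 64 KB cluster size (the local macros mirror QEMU's QEMU_ALIGN_DOWN/QEMU_ALIGN_UP):

    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN_DOWN(n, m) ((n) / (m) * (m))
    #define ALIGN_UP(n, m)   ALIGN_DOWN((n) + (m) - 1, (m))

    int main(void)
    {
        int64_t c = 65536 / 512;            /* 128 sectors per cluster */
        int64_t sector_num = 130;           /* request starts mid-cluster */
        int     nb_sectors = 300;

        int64_t cluster_sector_num = ALIGN_DOWN(sector_num, c);
        int64_t cluster_nb_sectors = ALIGN_UP(sector_num - cluster_sector_num +
                                              nb_sectors, c);

        /* prints 128 and 384: three whole clusters cover sectors 130..429 */
        printf("%lld %lld\n", (long long)cluster_sector_num,
               (long long)cluster_nb_sectors);
        return 0;
    }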
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+                                     int64_t sector_num, int nb_sectors) {
+    /*        aaaa   bbbb */
+    if (sector_num >= req->sector_num + req->nb_sectors) {
+        return false;
+    }
+    /* bbbb   aaaa        */
+    if (req->sector_num >= sector_num + nb_sectors) {
+        return false;
+    }
+    return true;
+}
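The two early returns cover the only ways half-open sector ranges can miss each other: the queried region lies entirely after, or entirely before, the tracked request. A quick standalone check of the predicate (illustrative values only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool overlaps(int64_t a_start, int a_len, int64_t b_start, int b_len)
    {
        if (b_start >= a_start + a_len) {   /* b entirely after a */
            return false;
        }
        if (a_start >= b_start + b_len) {   /* b entirely before a */
            return false;
        }
        return true;
    }

    int main(void)
    {
        printf("%d\n", overlaps(0, 128, 128, 128));  /* 0: touching, no overlap */
        printf("%d\n", overlaps(0, 129, 128, 128));  /* 1: one shared sector */
        return 0;
    }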
+static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors)
+{
+    BdrvTrackedRequest *req;
+    int64_t cluster_sector_num;
+    int cluster_nb_sectors;
+    bool retry;
+
+    /* If we touch the same cluster it counts as an overlap. This guarantees
+     * that allocating writes will be serialized and not race with each other
+     * for the same cluster. For example, in copy-on-read it ensures that the
+     * CoR read and write operations are atomic and guest writes cannot
+     * interleave between them.
+     */
+    round_to_clusters(bs, sector_num, nb_sectors,
+                      &cluster_sector_num, &cluster_nb_sectors);
+
+    do {
+        retry = false;
+        QLIST_FOREACH(req, &bs->tracked_requests, list) {
+            if (tracked_request_overlaps(req, cluster_sector_num,
+                                         cluster_nb_sectors)) {
+                /* Hitting this means there was a reentrant request, for
+                 * example, a block driver issuing nested requests. This must
+                 * never happen since it means deadlock.
+                 */
+                assert(qemu_coroutine_self() != req->co);
+
+                qemu_co_queue_wait(&req->wait_queue);
+                retry = true;
+                break;
+            }
+        }
+    } while (retry);
+}
+
 /*
  * Return values:
  * 0        - success
@@ -1252,6 +1496,61 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
+static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    /* Perform I/O through a temporary buffer so that users who scribble over
+     * their read buffer while the operation is in progress do not end up
+     * modifying the image file. This is critical for zero-copy guest I/O
+     * where anything might happen inside guest memory.
+     */
+    void *bounce_buffer;
+
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    int64_t cluster_sector_num;
+    int cluster_nb_sectors;
+    size_t skip_bytes;
+    int ret;
+
+    /* Cover entire cluster so no additional backing file I/O is required when
+     * allocating cluster in the image file.
+     */
+    round_to_clusters(bs, sector_num, nb_sectors,
+                      &cluster_sector_num, &cluster_nb_sectors);
+
+    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
+                                cluster_sector_num, cluster_nb_sectors);
+
+    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+                                 &bounce_qiov);
+    if (ret < 0) {
+        goto err;
+    }
+
+    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+                                  &bounce_qiov);
+    if (ret < 0) {
+        /* It might be okay to ignore write errors for guest requests. If this
+         * is a deliberate copy-on-read then we don't want to ignore the error.
+         * Simply report it in all cases.
+         */
+        goto err;
+    }
+
+    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
+                           nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+    qemu_vfree(bounce_buffer);
+    return ret;
+}
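The copy-on-read path reads the whole covering cluster into a private bounce buffer, writes that buffer into the image, and only then copies the requested slice out to the caller. A minimal in-memory sketch of that flow, assuming 512-byte sectors and 4-sector clusters (the arrays and helper name are illustrative, not QEMU APIs):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define SECTOR 512
    #define CLUSTER_SECTORS 4

    static char backing[16 * SECTOR];   /* source of unallocated data */
    static char image[16 * SECTOR];     /* image file being populated */

    static void copy_on_read(long sector_num, int nb_sectors, char *dest)
    {
        /* Round down to the covering cluster */
        long cs = sector_num / CLUSTER_SECTORS * CLUSTER_SECTORS;
        int  cn = CLUSTER_SECTORS;      /* request fits in one cluster here */

        /* 1. read the whole cluster into a bounce buffer */
        char *bounce = malloc(cn * SECTOR);
        memcpy(bounce, backing + cs * SECTOR, cn * SECTOR);

        /* 2. populate the image with the full cluster */
        memcpy(image + cs * SECTOR, bounce, cn * SECTOR);

        /* 3. hand only the requested slice back to the caller */
        size_t skip_bytes = (sector_num - cs) * SECTOR;
        memcpy(dest, bounce + skip_bytes, nb_sectors * SECTOR);
        free(bounce);
    }

    int main(void)
    {
        char buf[SECTOR];
        memset(backing, 'x', sizeof(backing));
        copy_on_read(5, 1, buf);        /* cluster of sectors 4..7 is copied */
        printf("%c %c\n", buf[0], image[4 * SECTOR]);  /* prints: x x */
        return 0;
    }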
 /*
  * Handle a read request in coroutine context
  */
@@ -1259,6 +1558,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
+    int ret;
 
     if (!drv) {
         return -ENOMEDIUM;
@@ -1267,7 +1568,36 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         return -EIO;
     }
 
-    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, false, nb_sectors);
+    }
+
+    if (bs->copy_on_read) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    }
+
+    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+
+    if (bs->copy_on_read) {
+        int pnum;
+
+        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
+        if (ret < 0) {
+            goto out;
+        }
+
+        if (!ret || pnum != nb_sectors) {
+            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+            goto out;
+        }
+    }
+
+    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+
+out:
+    tracked_request_end(&req);
+    return ret;
 }
 
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
@@ -1285,6 +1615,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
     int ret;
 
     if (!bs->drv) {
@@ -1297,6 +1628,17 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         return -EIO;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, true, nb_sectors);
+    }
+
+    if (bs->copy_on_read) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    }
+
+    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+
     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 
     if (bs->dirty_bitmap) {
@@ -1307,6 +1649,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 
+    tracked_request_end(&req);
+
     return ret;
 }
 
@@ -1526,6 +1870,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
     *psecs = bs->secs;
 }
 
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+                        BlockIOLimit *io_limits)
+{
+    bs->io_limits = *io_limits;
+    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
+}
+
 /* Recognize floppy formats */
 typedef struct FDFormat {
     FDriveType drive;
@@ -1778,31 +2130,87 @@ int bdrv_has_zero_init(BlockDriverState *bs)
     return 1;
 }
 
+typedef struct BdrvCoIsAllocatedData {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    int *pnum;
+    int ret;
+    bool done;
+} BdrvCoIsAllocatedData;
+
 /*
  * Returns true iff the specified sector is present in the disk image. Drivers
  * not implementing the functionality are assumed to not support backing files,
  * hence all their sectors are reported as allocated.
  *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
  * 'pnum' is set to the number of sectors (including and immediately following
  * the specified sector) that are known to be in the same
  * allocated/unallocated state.
  *
- * 'nb_sectors' is the max value 'pnum' should be set to.
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
  */
-int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                      int *pnum)
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, int *pnum)
 {
     int64_t n;
-    if (!bs->drv->bdrv_is_allocated) {
-        if (sector_num >= bs->total_sectors) {
-            *pnum = 0;
-            return 0;
-        }
-        n = bs->total_sectors - sector_num;
-        *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
+
+    if (sector_num >= bs->total_sectors) {
+        *pnum = 0;
+        return 0;
+    }
+
+    n = bs->total_sectors - sector_num;
+    if (n < nb_sectors) {
+        nb_sectors = n;
+    }
+
+    if (!bs->drv->bdrv_co_is_allocated) {
+        *pnum = nb_sectors;
         return 1;
     }
-    return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
+
+    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
+}
+
+/* Coroutine wrapper for bdrv_is_allocated() */
+static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
+{
+    BdrvCoIsAllocatedData *data = opaque;
+    BlockDriverState *bs = data->bs;
+
+    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
+                                     data->pnum);
+    data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_is_allocated().
+ *
+ * See bdrv_co_is_allocated() for details.
+ */
+int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                      int *pnum)
+{
+    Coroutine *co;
+    BdrvCoIsAllocatedData data = {
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .pnum = pnum,
+        .done = false,
+    };
+
+    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
+    qemu_coroutine_enter(co, &data);
+    while (!data.done) {
+        qemu_aio_wait();
+    }
+    return data.ret;
+}
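The synchronous wrapper follows a general pattern: start the coroutine, then pump the event loop until a done flag flips. A toy standalone sketch of that pattern, with a fake event loop standing in for qemu_aio_wait() (all names here are illustrative, not QEMU APIs; a real coroutine may yield and resume several times before completing):

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct {
        int ret;
        bool done;
    } CoData;

    static CoData *pending;

    /* Completion that would normally run later, driven by the event loop */
    static void co_entry(CoData *data)
    {
        data->ret = 1;              /* e.g. "sector is allocated" */
        data->done = true;
    }

    static void fake_aio_wait(void) /* stands in for qemu_aio_wait() */
    {
        if (pending) {
            co_entry(pending);      /* complete one pending operation */
            pending = NULL;
        }
    }

    static int is_allocated_sync(void)
    {
        CoData data = { .done = false };
        pending = &data;            /* "enter the coroutine" */
        while (!data.done) {
            fake_aio_wait();        /* pump events until it completes */
        }
        return data.ret;
    }

    int main(void)
    {
        printf("%d\n", is_allocated_sync());   /* prints: 1 */
        return 0;
    }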
 void bdrv_mon_event(const BlockDriverState *bdrv,
@@ -1869,6 +2277,21 @@ BlockInfoList *qmp_query_block(Error **errp)
             info->value->inserted->has_backing_file = true;
             info->value->inserted->backing_file = g_strdup(bs->backing_file);
         }
+
+        if (bs->io_limits_enabled) {
+            info->value->inserted->bps =
+                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+            info->value->inserted->bps_rd =
+                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
+            info->value->inserted->bps_wr =
+                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
+            info->value->inserted->iops =
+                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+            info->value->inserted->iops_rd =
+                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
+            info->value->inserted->iops_wr =
+                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
+        }
     }
 
     /* XXX: waiting for the qapi to support GSList */
@@ -2480,6 +2903,170 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait)
+{
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_base, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
+    }
+
+    /* bytes_base: the bytes of data which have been read/written; and
+     *             it is obtained from the history statistic info.
+     * bytes_res: the remaining bytes of data which need to be read/written.
+     * (bytes_base + bytes_res) / bps_limit: used to calculate
+     *             the total time for completing reading/writing all data.
+     */
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_base + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+    /* When the I/O rate at runtime exceeds the limits,
+     * bs->slice_end need to be extended in order that the current statistic
+     * info can be kept until the timer fires, so it is increased and tuned
+     * based on the result of experiment.
+     */
+    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait)
+{
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_base;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit  = iops_limit * slice_time;
+    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
+    }
+
+    if (ios_base + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_base + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, int64_t *wait)
+{
+    int64_t  now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + bs->slice_time;
+    } else {
+        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
+        bs->slice_start = now;
+        bs->slice_end   = now + bs->slice_time;
+
+        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
+        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
+
+        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
+        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
+    }
+
+    elapsed_time  = now - bs->slice_start;
+    elapsed_time  /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                       elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
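The bps check converts the current slice into a byte budget and, when the budget is exceeded, estimates how long the request must wait. A standalone sketch of that arithmetic with concrete numbers (1 MB/s limit, 0.5 s slice, assumed 512-byte sectors; the values are illustrative):

    #include <stdio.h>

    int main(void)
    {
        double bps_limit    = 1000000.0;   /* 1 MB/s */
        double slice_time   = 0.5;         /* seconds in this slice */
        double elapsed_time = 0.4;         /* seconds since slice start */
        double bytes_base   = 480000.0;    /* bytes already done in slice */
        double bytes_res    = 64 * 512.0;  /* this request: 64 sectors */

        double bytes_limit = bps_limit * slice_time;   /* 500000 byte budget */

        if (bytes_base + bytes_res > bytes_limit) {
            /* total time needed for all bytes, minus time already spent */
            double wait_time = (bytes_base + bytes_res) / bps_limit
                               - elapsed_time;
            printf("throttle for %.3f s\n", wait_time);  /* ~0.113 s */
        } else {
            printf("dispatch immediately\n");
        }
        return 0;
    }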
 /**************************************************************/
 /* async block device emulation */

block.h

@@ -70,6 +70,7 @@ typedef struct BlockDevOps {
 #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
 
@@ -98,6 +99,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
 BlockDriver *bdrv_find_protocol(const char *filename);
@@ -138,6 +144,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, int *pnum);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
@@ -206,6 +214,7 @@ int bdrv_flush(BlockDriverState *bs);
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
 void bdrv_flush_all(void);
 void bdrv_close_all(void);
+void bdrv_drain_all(void);
 
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
@@ -305,6 +314,9 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
     int nr_sectors);
 int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 
+void bdrv_enable_copy_on_read(BlockDriverState *bs);
+void bdrv_disable_copy_on_read(BlockDriverState *bs);
+
 void bdrv_set_in_use(BlockDriverState *bs, int in_use);
 int bdrv_in_use(BlockDriverState *bs);
block/cow.c

@@ -132,8 +132,8 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
 /* Return true if first block has been changed (ie. current version is
  * in COW file). Set the number of continuous blocks for which that
  * is true. */
-static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
-        int nb_sectors, int *num_same)
+static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
 {
     int changed;
 
@@ -171,14 +171,14 @@ static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
     return error;
 }
 
-static int cow_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
+                                 uint8_t *buf, int nb_sectors)
 {
     BDRVCowState *s = bs->opaque;
     int ret, n;
 
     while (nb_sectors > 0) {
-        if (cow_is_allocated(bs, sector_num, nb_sectors, &n)) {
+        if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) {
             ret = bdrv_pread(bs->file,
                         s->cow_sectors_offset + sector_num * 512,
                         buf, n * 512);
@@ -243,12 +243,12 @@ static void cow_close(BlockDriverState *bs)
 
 static int cow_create(const char *filename, QEMUOptionParameter *options)
 {
-    int fd, cow_fd;
     struct cow_header_v2 cow_header;
     struct stat st;
     int64_t image_sectors = 0;
     const char *image_filename = NULL;
     int ret;
+    BlockDriverState *cow_bs;
 
     /* Read out options */
     while (options && options->name) {
@@ -260,10 +260,16 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
         options++;
     }
 
-    cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
-                  0644);
-    if (cow_fd < 0)
-        return -errno;
+    ret = bdrv_create_file(filename, options);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
 
     memset(&cow_header, 0, sizeof(cow_header));
     cow_header.magic = cpu_to_be32(COW_MAGIC);
     cow_header.version = cpu_to_be32(COW_VERSION);
@@ -271,16 +277,9 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
         /* Note: if no file, we put a dummy mtime */
         cow_header.mtime = cpu_to_be32(0);
 
-        fd = open(image_filename, O_RDONLY | O_BINARY);
-        if (fd < 0) {
-            close(cow_fd);
+        if (stat(image_filename, &st) != 0) {
             goto mtime_fail;
         }
-        if (fstat(fd, &st) != 0) {
-            close(fd);
-            goto mtime_fail;
-        }
-        close(fd);
+
         cow_header.mtime = cpu_to_be32(st.st_mtime);
     mtime_fail:
         pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
@@ -288,21 +287,20 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
     }
     cow_header.sectorsize = cpu_to_be32(512);
     cow_header.size = cpu_to_be64(image_sectors * 512);
-    ret = qemu_write_full(cow_fd, &cow_header, sizeof(cow_header));
+    ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header));
     if (ret != sizeof(cow_header)) {
-        ret = -errno;
         goto exit;
     }
 
     /* resize to include at least all the bitmap */
-    ret = ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3));
+    ret = bdrv_truncate(cow_bs,
+                        sizeof(cow_header) + ((image_sectors + 7) >> 3));
     if (ret) {
-        ret = -errno;
         goto exit;
     }
 
 exit:
-    close(cow_fd);
+    bdrv_delete(cow_bs);
     return ret;
 }
 
@@ -337,7 +335,7 @@ static BlockDriver bdrv_cow = {
     .bdrv_read              = cow_co_read,
     .bdrv_write             = cow_co_write,
     .bdrv_co_flush_to_disk  = cow_co_flush,
-    .bdrv_is_allocated      = cow_is_allocated,
+    .bdrv_co_is_allocated   = cow_co_is_allocated,
 
     .create_options = cow_create_options,
 };

block/qcow.c

@@ -368,14 +368,16 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
     return cluster_offset;
 }
 
-static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum)
+static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster, n;
     uint64_t cluster_offset;
 
+    qemu_co_mutex_lock(&s->lock);
     cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+    qemu_co_mutex_unlock(&s->lock);
     index_in_cluster = sector_num & (s->cluster_sectors - 1);
     n = s->cluster_sectors - index_in_cluster;
     if (n > nb_sectors)
@@ -433,7 +435,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
     return 0;
 }
 
-static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
@@ -531,7 +533,7 @@ fail:
     goto done;
 }
 
-static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
@@ -844,7 +846,7 @@ static BlockDriver bdrv_qcow = {
     .bdrv_co_readv          = qcow_co_readv,
     .bdrv_co_writev         = qcow_co_writev,
     .bdrv_co_flush_to_disk  = qcow_co_flush,
-    .bdrv_is_allocated      = qcow_is_allocated,
+    .bdrv_co_is_allocated   = qcow_co_is_allocated,
 
     .bdrv_set_key           = qcow_set_key,
     .bdrv_make_empty        = qcow_make_empty,
block/qcow2-cluster.c

@@ -289,89 +289,62 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
         }
     }
 }
 
-static int qcow2_read(BlockDriverState *bs, int64_t sector_num,
-                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+                                     uint64_t start_sect,
+                                     uint64_t cluster_offset,
+                                     int n_start, int n_end)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret, index_in_cluster, n, n1;
-    uint64_t cluster_offset;
-    struct iovec iov;
     QEMUIOVector qiov;
-
-    while (nb_sectors > 0) {
-        n = nb_sectors;
-
-        ret = qcow2_get_cluster_offset(bs, sector_num << 9, &n,
-            &cluster_offset);
-        if (ret < 0) {
-            return ret;
-        }
-
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        if (!cluster_offset) {
-            if (bs->backing_hd) {
-                /* read from the base image */
-                iov.iov_base = buf;
-                iov.iov_len = n * 512;
-                qemu_iovec_init_external(&qiov, &iov, 1);
-
-                n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n);
-                if (n1 > 0) {
-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING);
-                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
-                    if (ret < 0)
-                        return -1;
-                }
-            } else {
-                memset(buf, 0, 512 * n);
-            }
-        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
-            if (qcow2_decompress_cluster(bs, cluster_offset) < 0)
-                return -1;
-            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
-        } else {
-            BLKDBG_EVENT(bs->file, BLKDBG_READ);
-            ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512);
-            if (ret != n * 512)
-                return -1;
-            if (s->crypt_method) {
-                qcow2_encrypt_sectors(s, sector_num, buf, buf, n, 0,
-                                &s->aes_decrypt_key);
-            }
-        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
-    }
-    return 0;
-}
-
-static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
-                        uint64_t cluster_offset, int n_start, int n_end)
-{
-    BDRVQcowState *s = bs->opaque;
+    struct iovec iov;
     int n, ret;
 
+    /*
+     * If this is the last cluster and it is only partially used, we must only
+     * copy until the end of the image, or bdrv_check_request will fail for the
+     * bdrv_read/write calls below.
+     */
+    if (start_sect + n_end > bs->total_sectors) {
+        n_end = bs->total_sectors - start_sect;
+    }
+
     n = n_end - n_start;
-    if (n <= 0)
+    if (n <= 0) {
         return 0;
+    }
+
+    iov.iov_len = n * BDRV_SECTOR_SIZE;
+    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
 
     BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
-    ret = qcow2_read(bs, start_sect + n_start, s->cluster_data, n);
-    if (ret < 0)
-        return ret;
+
+    /* Call .bdrv_co_readv() directly instead of using the public block-layer
+     * interface. This avoids double I/O throttling and request tracking,
+     * which can lead to deadlock when block layer copy-on-read is enabled.
+     */
+    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+
     if (s->crypt_method) {
         qcow2_encrypt_sectors(s, start_sect + n_start,
-                        s->cluster_data,
-                        s->cluster_data, n, 1,
-                        &s->aes_encrypt_key);
+                              iov.iov_base, iov.iov_base, n, 1,
+                              &s->aes_encrypt_key);
     }
+
     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start,
-        s->cluster_data, n);
-    if (ret < 0)
-        return ret;
-    return 0;
+    ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = 0;
+out:
+    qemu_vfree(iov.iov_base);
+    return ret;
 }
 
@@ -620,7 +593,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
     if (m->n_start) {
         cow = true;
+        qemu_co_mutex_unlock(&s->lock);
         ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
+        qemu_co_mutex_lock(&s->lock);
         if (ret < 0)
             goto err;
     }
@@ -628,8 +603,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     if (m->nb_available & (s->cluster_sectors - 1)) {
         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
         cow = true;
+        qemu_co_mutex_unlock(&s->lock);
         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                            m->nb_available - end, s->cluster_sectors);
+        qemu_co_mutex_lock(&s->lock);
         if (ret < 0)
             goto err;
     }

block/qcow2-refcount.c

@@ -700,6 +700,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     l2_table = NULL;
     l1_table = NULL;
     l1_size2 = l1_size * sizeof(uint64_t);
+
+    /* WARNING: qcow2_snapshot_goto relies on this function not using the
+     * l1_table_offset when it is the current s->l1_table_offset! Be careful
+     * when changing this! */
+
     if (l1_table_offset != s->l1_table_offset) {
         if (l1_size2 != 0) {
             l1_table = g_malloc0(align_offset(l1_size2, 512));
@@ -819,7 +823,8 @@ fail:
     qcow2_cache_set_writethrough(bs, s->refcount_block_cache,
         old_refcount_writethrough);
 
-    if (l1_modified) {
+    /* Update L1 only if it isn't deleted anyway (addend = -1) */
+    if (addend >= 0 && l1_modified) {
         for(i = 0; i < l1_size; i++)
             cpu_to_be64s(&l1_table[i]);
         if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
block/qcow2-snapshot.c

@@ -68,6 +68,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
     int i, id_str_size, name_size;
     int64_t offset;
     uint32_t extra_data_size;
+    int ret;
 
     if (!s->nb_snapshots) {
         s->snapshots = NULL;
@@ -77,10 +78,15 @@ int qcow2_read_snapshots(BlockDriverState *bs)
     offset = s->snapshots_offset;
     s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+
     for(i = 0; i < s->nb_snapshots; i++) {
+        /* Read statically sized part of the snapshot header */
         offset = align_offset(offset, 8);
-        if (bdrv_pread(bs->file, offset, &h, sizeof(h)) != sizeof(h))
+        ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
+        if (ret < 0) {
             goto fail;
+        }
+
         offset += sizeof(h);
         sn = s->snapshots + i;
         sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
@@ -94,25 +100,34 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         id_str_size = be16_to_cpu(h.id_str_size);
         name_size = be16_to_cpu(h.name_size);
 
+        /* Skip extra data */
         offset += extra_data_size;
 
+        /* Read snapshot ID */
         sn->id_str = g_malloc(id_str_size + 1);
-        if (bdrv_pread(bs->file, offset, sn->id_str, id_str_size) != id_str_size)
+        ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
+        if (ret < 0) {
             goto fail;
+        }
         offset += id_str_size;
         sn->id_str[id_str_size] = '\0';
 
+        /* Read snapshot name */
         sn->name = g_malloc(name_size + 1);
-        if (bdrv_pread(bs->file, offset, sn->name, name_size) != name_size)
+        ret = bdrv_pread(bs->file, offset, sn->name, name_size);
+        if (ret < 0) {
             goto fail;
+        }
         offset += name_size;
         sn->name[name_size] = '\0';
     }
+
     s->snapshots_size = offset - s->snapshots_offset;
     return 0;
-fail:
+
+fail:
     qcow2_free_snapshots(bs);
-    return -1;
+    return ret;
 }
 
 /* add at the end of the file a new list of snapshots */
@@ -122,9 +137,12 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     QCowSnapshot *sn;
     QCowSnapshotHeader h;
     int i, name_size, id_str_size, snapshots_size;
-    uint64_t data64;
-    uint32_t data32;
+    struct {
+        uint32_t nb_snapshots;
+        uint64_t snapshots_offset;
+    } QEMU_PACKED header_data;
     int64_t offset, snapshots_offset;
+    int ret;
 
     /* compute the size of the snapshots */
     offset = 0;
@@ -137,6 +155,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     }
     snapshots_size = offset;
 
+    /* Allocate space for the new snapshot list */
     snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
     bdrv_flush(bs->file);
     offset = snapshots_offset;
@@ -144,6 +163,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         return offset;
     }
 
+    /* Write all snapshots to the new list */
     for(i = 0; i < s->nb_snapshots; i++) {
         sn = s->snapshots + i;
         memset(&h, 0, sizeof(h));
@@ -159,34 +179,55 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         h.id_str_size = cpu_to_be16(id_str_size);
         h.name_size = cpu_to_be16(name_size);
         offset = align_offset(offset, 8);
-        if (bdrv_pwrite_sync(bs->file, offset, &h, sizeof(h)) < 0)
+
+        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
+        if (ret < 0) {
             goto fail;
+        }
+
         offset += sizeof(h);
-        if (bdrv_pwrite_sync(bs->file, offset, sn->id_str, id_str_size) < 0)
+        ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
+        if (ret < 0) {
             goto fail;
+        }
+
         offset += id_str_size;
-        if (bdrv_pwrite_sync(bs->file, offset, sn->name, name_size) < 0)
+        ret = bdrv_pwrite(bs->file, offset, sn->name, name_size);
+        if (ret < 0) {
             goto fail;
+        }
+
         offset += name_size;
     }
 
-    /* update the various header fields */
-    data64 = cpu_to_be64(snapshots_offset);
-    if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, snapshots_offset),
-                    &data64, sizeof(data64)) < 0)
+    /*
+     * Update the header to point to the new snapshot table. This requires the
+     * new table and its refcounts to be stable on disk.
+     */
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
         goto fail;
-    data32 = cpu_to_be32(s->nb_snapshots);
-    if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
-                    &data32, sizeof(data32)) < 0)
+    }
+
+    QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
+        offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+    header_data.nb_snapshots     = cpu_to_be32(s->nb_snapshots);
+    header_data.snapshots_offset = cpu_to_be64(snapshots_offset);
+
+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+                           &header_data, sizeof(header_data));
+    if (ret < 0) {
         goto fail;
+    }
 
     /* free the old snapshot table */
     qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size);
     s->snapshots_offset = snapshots_offset;
     s->snapshots_size = snapshots_size;
     return 0;
-fail:
-    return -1;
+
+fail:
+    return ret;
 }
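The header update packs nb_snapshots and snapshots_offset into one buffer so both fields hit the disk in a single write, and a compile-time check proves the two fields really are adjacent on disk. A standalone sketch of the same trick (SimpleHeader is an illustrative stand-in for QCowHeader):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct __attribute__((packed)) {
        uint32_t magic;
        uint32_t nb_snapshots;
        uint64_t snapshots_offset;
    } SimpleHeader;

    /* Compile-time assertion in the spirit of QEMU_BUILD_BUG_ON: the array
     * size is negative (a compile error) if the fields are not adjacent. */
    typedef char fields_adjacent[
        (offsetof(SimpleHeader, snapshots_offset) ==
         offsetof(SimpleHeader, nb_snapshots) + sizeof(uint32_t)) ? 1 : -1];

    int main(void)
    {
        struct __attribute__((packed)) {
            uint32_t nb_snapshots;
            uint64_t snapshots_offset;
        } header_data = { 3, 0x10000 };

        /* One pwrite at offsetof(SimpleHeader, nb_snapshots) would then
         * update both fields together. */
        printf("%zu bytes at offset %zu\n", sizeof(header_data),
               offsetof(SimpleHeader, nb_snapshots));
        return 0;
    }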
 static void find_new_snapshot_id(BlockDriverState *bs,
@@ -236,72 +277,92 @@ static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 {
     BDRVQcowState *s = bs->opaque;
-    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
+    QCowSnapshot *new_snapshot_list = NULL;
+    QCowSnapshot *old_snapshot_list = NULL;
+    QCowSnapshot sn1, *sn = &sn1;
     int i, ret;
     uint64_t *l1_table = NULL;
     int64_t l1_table_offset;
 
     memset(sn, 0, sizeof(*sn));
 
+    /* Generate an ID if it wasn't passed */
     if (sn_info->id_str[0] == '\0') {
-        /* compute a new id */
         find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
     }
 
-    /* check that the ID is unique */
-    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
+    /* Check that the ID is unique */
+    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) {
         return -ENOENT;
+    }
 
+    /* Populate sn with passed data */
     sn->id_str = g_strdup(sn_info->id_str);
-    if (!sn->id_str)
-        goto fail;
     sn->name = g_strdup(sn_info->name);
-    if (!sn->name)
-        goto fail;
+
     sn->vm_state_size = sn_info->vm_state_size;
     sn->date_sec = sn_info->date_sec;
     sn->date_nsec = sn_info->date_nsec;
     sn->vm_clock_nsec = sn_info->vm_clock_nsec;
 
-    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
-    if (ret < 0)
-        goto fail;
-
-    /* create the L1 table of the snapshot */
+    /* Allocate the L1 table of the snapshot and copy the current one there. */
     l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
     if (l1_table_offset < 0) {
+        ret = l1_table_offset;
         goto fail;
     }
-    bdrv_flush(bs->file);
 
     sn->l1_table_offset = l1_table_offset;
     sn->l1_size = s->l1_size;
 
-    if (s->l1_size != 0) {
-        l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
-    } else {
-        l1_table = NULL;
-    }
-
+    l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
     for(i = 0; i < s->l1_size; i++) {
         l1_table[i] = cpu_to_be64(s->l1_table[i]);
     }
-    if (bdrv_pwrite_sync(bs->file, sn->l1_table_offset,
-                    l1_table, s->l1_size * sizeof(uint64_t)) < 0)
+
+    ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
+                      s->l1_size * sizeof(uint64_t));
+    if (ret < 0) {
         goto fail;
+    }
+
     g_free(l1_table);
     l1_table = NULL;
 
-    snapshots1 = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
-    if (s->snapshots) {
-        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
-        g_free(s->snapshots);
+    /*
+     * Increase the refcounts of all clusters and make sure everything is
+     * stable on disk before updating the snapshot table to contain a pointer
+     * to the new L1 table.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0) {
+        goto fail;
     }
-    s->snapshots = snapshots1;
+
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Append the new snapshot to the snapshot list */
+    new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    if (s->snapshots) {
+        memcpy(new_snapshot_list, s->snapshots,
+               s->nb_snapshots * sizeof(QCowSnapshot));
+        old_snapshot_list = s->snapshots;
+    }
+    s->snapshots = new_snapshot_list;
     s->snapshots[s->nb_snapshots++] = *sn;
 
-    if (qcow2_write_snapshots(bs) < 0)
+    ret = qcow2_write_snapshots(bs);
+    if (ret < 0) {
+        g_free(s->snapshots);
+        s->snapshots = old_snapshot_list;
        goto fail;
+    }
+
+    g_free(old_snapshot_list);
+
 #ifdef DEBUG_ALLOC
     {
       BdrvCheckResult result = {0};
@@ -309,10 +370,13 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     }
 #endif
     return 0;
-fail:
+
+fail:
+    g_free(sn->id_str);
     g_free(sn->name);
     g_free(l1_table);
-    return -1;
+
+    return ret;
 }
 
 /* copy the snapshot 'snapshot_name' into the current disk image */
@@ -322,38 +386,92 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
     QCowSnapshot *sn;
     int i, snapshot_index;
     int cur_l1_bytes, sn_l1_bytes;
+    int ret;
+    uint64_t *sn_l1_table = NULL;
 
+    /* Search the snapshot */
     snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
-    if (snapshot_index < 0)
+    if (snapshot_index < 0) {
         return -ENOENT;
+    }
     sn = &s->snapshots[snapshot_index];
 
-    if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
-        goto fail;
-
-    if (qcow2_grow_l1_table(bs, sn->l1_size, true) < 0)
+    /*
+     * Make sure that the current L1 table is big enough to contain the whole
+     * L1 table of the snapshot. If the snapshot L1 table is smaller, the
+     * current one must be padded with zeros.
+     */
+    ret = qcow2_grow_l1_table(bs, sn->l1_size, true);
+    if (ret < 0) {
         goto fail;
+    }
 
     cur_l1_bytes = s->l1_size * sizeof(uint64_t);
     sn_l1_bytes = sn->l1_size * sizeof(uint64_t);
 
-    if (cur_l1_bytes > sn_l1_bytes) {
-        memset(s->l1_table + sn->l1_size, 0, cur_l1_bytes - sn_l1_bytes);
+    /*
+     * Copy the snapshot L1 table to the current L1 table.
+     *
+     * Before overwriting the old current L1 table on disk, make sure to
+     * increase all refcounts for the clusters referenced by the new one.
+     * Decrease the refcount referenced by the old one only when the L1
+     * table is overwritten.
+     */
+    sn_l1_table = g_malloc0(cur_l1_bytes);
+    ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
+    if (ret < 0) {
+        goto fail;
     }
 
-    /* copy the snapshot l1 table to the current l1 table */
-    if (bdrv_pread(bs->file, sn->l1_table_offset,
-                   s->l1_table, sn_l1_bytes) < 0)
+    ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset,
+                                         sn->l1_size, 1);
+    if (ret < 0) {
         goto fail;
-    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset,
-                    s->l1_table, cur_l1_bytes) < 0)
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
+                           cur_l1_bytes);
+    if (ret < 0) {
         goto fail;
+    }
+
+    /*
+     * Decrease refcount of clusters of current L1 table.
+     *
+     * At this point, the in-memory s->l1_table points to the old L1 table,
+     * whereas on disk we already have the new one.
+     *
+     * qcow2_update_snapshot_refcount special cases the current L1 table to use
+     * the in-memory data instead of really using the offset to load a new one,
+     * which is why this works.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+                                         s->l1_size, -1);
+
+    /*
+     * Now update the in-memory L1 table to be in sync with the on-disk one. We
+     * need to do this even if updating refcounts failed.
+     */
     for(i = 0;i < s->l1_size; i++) {
-        be64_to_cpus(&s->l1_table[i]);
+        s->l1_table[i] = be64_to_cpu(sn_l1_table[i]);
     }
 
-    if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
+    if (ret < 0) {
         goto fail;
+    }
+
+    g_free(sn_l1_table);
+    sn_l1_table = NULL;
+
+    /*
+     * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed
+     * when we decreased the refcount of the old snapshot).
+     */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+    if (ret < 0) {
+        goto fail;
+    }
 
 #ifdef DEBUG_ALLOC
     {
@@ -362,39 +480,59 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
     }
 #endif
     return 0;
-fail:
-    return -EIO;
+
+fail:
+    g_free(sn_l1_table);
+    return ret;
 }
 
 int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
 {
     BDRVQcowState *s = bs->opaque;
-    QCowSnapshot *sn;
+    QCowSnapshot sn;
     int snapshot_index, ret;
 
+    /* Search the snapshot */
     snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
-    if (snapshot_index < 0)
+    if (snapshot_index < 0) {
         return -ENOENT;
-    sn = &s->snapshots[snapshot_index];
+    }
+    sn = s->snapshots[snapshot_index];
 
-    ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
-    if (ret < 0)
-        return ret;
-    /* must update the copied flag on the current cluster offsets */
-    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
-    if (ret < 0)
-        return ret;
-    qcow2_free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));
-
-    g_free(sn->id_str);
-    g_free(sn->name);
-    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
+    /* Remove it from the snapshot list */
+    memmove(s->snapshots + snapshot_index,
+            s->snapshots + snapshot_index + 1,
+            (s->nb_snapshots - snapshot_index - 1) * sizeof(sn));
     s->nb_snapshots--;
+
     ret = qcow2_write_snapshots(bs);
     if (ret < 0) {
-        /* XXX: restore snapshot if error ? */
         return ret;
     }
+
+    /*
+     * The snapshot is now unused, clean up. If we fail after this point, we
+     * won't recover but just leak clusters.
+     */
+    g_free(sn.id_str);
+    g_free(sn.name);
+
+    /*
+     * Now decrease the refcounts of clusters referenced by the snapshot and
+     * free the L1 table.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset,
+                                         sn.l1_size, -1);
+    if (ret < 0) {
+        return ret;
+    }
+    qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t));
+
+    /* must update the copied flag on the current cluster offsets */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
 #ifdef DEBUG_ALLOC
     {
         BdrvCheckResult result = {0};
@@ -435,32 +573,42 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
 {
-    int i, snapshot_index, l1_size2;
+    int i, snapshot_index;
     BDRVQcowState *s = bs->opaque;
     QCowSnapshot *sn;
+    uint64_t *new_l1_table;
+    int new_l1_bytes;
+    int ret;
+
+    assert(bs->read_only);
 
+    /* Search the snapshot */
     snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name);
     if (snapshot_index < 0) {
         return -ENOENT;
     }
-
     sn = &s->snapshots[snapshot_index];
-    s->l1_size = sn->l1_size;
-    l1_size2 = s->l1_size * sizeof(uint64_t);
-    if (s->l1_table != NULL) {
-        g_free(s->l1_table);
-    }
 
-    s->l1_table_offset = sn->l1_table_offset;
-    s->l1_table = g_malloc0(align_offset(l1_size2, 512));
+    /* Allocate and read in the snapshot's L1 table */
+    new_l1_bytes = s->l1_size * sizeof(uint64_t);
+    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
 
-    if (bdrv_pread(bs->file, sn->l1_table_offset,
-                   s->l1_table, l1_size2) != l1_size2) {
-        return -1;
+    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
+    if (ret < 0) {
+        g_free(new_l1_table);
+        return ret;
     }
 
+    /* Switch the L1 table */
+    g_free(s->l1_table);
+
+    s->l1_size = sn->l1_size;
+    s->l1_table_offset = sn->l1_table_offset;
+    s->l1_table = new_l1_table;
+
     for(i = 0;i < s->l1_size; i++) {
         be64_to_cpus(&s->l1_table[i]);
     }
     return 0;
 }

View File

@@ -273,8 +273,9 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         }
         bs->backing_file[len] = '\0';
     }
-    if (qcow2_read_snapshots(bs) < 0) {
-        ret = -EINVAL;
+
+    ret = qcow2_read_snapshots(bs);
+    if (ret < 0) {
         goto fail;
     }
@@ -343,16 +344,19 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
     return 0;
 }
 
-static int qcow2_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                              int nb_sectors, int *pnum)
+static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
+    BDRVQcowState *s = bs->opaque;
     uint64_t cluster_offset;
     int ret;
 
     *pnum = nb_sectors;
-    /* FIXME We can get errors here, but the bdrv_is_allocated interface can't
-     * pass them on today */
+    /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
+     * can't pass them on today */
+    qemu_co_mutex_lock(&s->lock);
     ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+    qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
         *pnum = 0;
     }
@@ -377,7 +381,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
     return n1;
 }
 
-static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                           int remaining_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
@@ -512,12 +516,12 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
     /* Restart all dependent requests */
     if (!qemu_co_queue_empty(&m->dependent_requests)) {
         qemu_co_mutex_unlock(&s->lock);
-        while(qemu_co_queue_next(&m->dependent_requests));
+        qemu_co_queue_restart_all(&m->dependent_requests);
         qemu_co_mutex_lock(&s->lock);
     }
 }
 
-static int qcow2_co_writev(BlockDriverState *bs,
+static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                            int64_t sector_num,
                            int remaining_sectors,
                            QEMUIOVector *qiov)
@@ -1137,7 +1141,7 @@ fail:
     return ret;
 }
 
-static int qcow2_co_flush_to_os(BlockDriverState *bs)
+static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
@@ -1159,7 +1163,7 @@ static int qcow2_co_flush_to_os(BlockDriverState *bs)
     return 0;
 }
 
-static int qcow2_co_flush_to_disk(BlockDriverState *bs)
+static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs)
 {
     return bdrv_co_flush(bs->file);
 }
@@ -1276,7 +1280,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_open          = qcow2_open,
     .bdrv_close         = qcow2_close,
     .bdrv_create        = qcow2_create,
-    .bdrv_is_allocated  = qcow2_is_allocated,
+    .bdrv_co_is_allocated = qcow2_co_is_allocated,
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,

View File

@@ -29,7 +29,7 @@ static void qed_read_table_cb(void *opaque, int ret)
 {
     QEDReadTableCB *read_table_cb = opaque;
     QEDTable *table = read_table_cb->table;
-    int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t);
+    int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
     int i;
 
     /* Handle I/O error */
@@ -65,7 +65,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
 
     qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
     aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
-                           read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
+                           qiov->size / BDRV_SECTOR_SIZE,
                            qed_read_table_cb, read_table_cb);
     if (!aiocb) {
         qed_read_table_cb(read_table_cb, -EIO);
@@ -160,7 +160,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
     aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
                             &write_table_cb->qiov,
-                            write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
+                            write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
                             qed_write_table_cb, write_table_cb);
     if (!aiocb) {
         qed_write_table_cb(write_table_cb, -EIO);

View File

@@ -661,6 +661,7 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
 }
 
 typedef struct {
+    Coroutine *co;
     int is_allocated;
     int *pnum;
 } QEDIsAllocatedCB;
@@ -670,10 +671,14 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
     QEDIsAllocatedCB *cb = opaque;
     *cb->pnum = len / BDRV_SECTOR_SIZE;
     cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+    if (cb->co) {
+        qemu_coroutine_enter(cb->co, NULL);
+    }
 }
 
-static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                                 int nb_sectors, int *pnum)
+static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+                                                 int64_t sector_num,
+                                                 int nb_sectors, int *pnum)
 {
     BDRVQEDState *s = bs->opaque;
     uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
@@ -686,8 +691,10 @@ static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
 
     qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
 
+    /* Now sleep if the callback wasn't invoked immediately */
     while (cb.is_allocated == -1) {
-        qemu_aio_wait();
+        cb.co = qemu_coroutine_self();
+        qemu_coroutine_yield();
     }
 
     qed_unref_l2_cache_entry(request.l2_table);
@@ -1485,7 +1492,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_open                = bdrv_qed_open,
     .bdrv_close               = bdrv_qed_close,
     .bdrv_create              = bdrv_qed_create,
-    .bdrv_is_allocated        = bdrv_qed_is_allocated,
+    .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
     .bdrv_make_empty          = bdrv_qed_make_empty,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
     .bdrv_aio_writev          = bdrv_qed_aio_writev,
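The QED hunks above replace a blocking qemu_aio_wait() loop with the coroutine yield/re-enter idiom: the coroutine publishes itself in the callback state and yields, and the completion callback re-enters it (the NULL check covers a callback that fires before the coroutine ever yields). A minimal self-contained sketch of the same pattern follows; WaitState, start_async_op and completion_cb are illustrative names, not from the patch, and only qemu's coroutine API (qemu-coroutine.h) is assumed:

typedef struct {
    Coroutine *co;   /* waiter to re-enter; NULL until we actually yield */
    int done;
} WaitState;

/* Hypothetical asynchronous operation that invokes cb later, or possibly
 * immediately from within start_async_op() itself. */
void start_async_op(void (*cb)(void *opaque, int ret), void *opaque);

static void completion_cb(void *opaque, int ret)
{
    WaitState *ws = opaque;

    ws->done = 1;
    if (ws->co) {
        /* The waiter already yielded; resume it now. */
        qemu_coroutine_enter(ws->co, NULL);
    }
}

static void coroutine_fn wait_for_completion(WaitState *ws)
{
    ws->co = NULL;
    ws->done = 0;
    start_async_op(completion_cb, ws);

    /* If the callback ran synchronously, done is already set and we never
     * yield; otherwise publish ourselves and sleep until re-entered. */
    while (!ws->done) {
        ws->co = qemu_coroutine_self();
        qemu_coroutine_yield();
    }
}

Checking the completion flag in a loop rather than yielding unconditionally is what makes a synchronous callback safe, exactly as the `while (cb.is_allocated == -1)` loop above does.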

View File

@@ -1715,7 +1715,7 @@ out:
     return 1;
 }
 
-static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
 {
     SheepdogAIOCB *acb;
@@ -1744,7 +1744,7 @@ static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     return acb->ret;
 }
 
-static int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
 {
     SheepdogAIOCB *acb;

View File

@@ -472,8 +472,8 @@ static int vdi_open(BlockDriverState *bs, int flags)
     return -1;
 }
 
-static int vdi_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                            int nb_sectors, int *pnum)
+static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
     /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
     BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
@@ -996,7 +996,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_close = vdi_close,
     .bdrv_create = vdi_create,
     .bdrv_co_flush_to_disk = vdi_co_flush,
-    .bdrv_is_allocated = vdi_is_allocated,
+    .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
     .bdrv_aio_readv = vdi_aio_readv,

View File

@@ -861,8 +861,8 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
     return NULL;
 }
 
-static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum)
+static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVVmdkState *s = bs->opaque;
     int64_t index_in_cluster, n, ret;
@@ -873,8 +873,10 @@ static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
     if (!extent) {
         return 0;
     }
+    qemu_co_mutex_lock(&s->lock);
     ret = get_cluster_offset(bs, extent, NULL,
                              sector_num * 512, 0, &offset);
+    qemu_co_mutex_unlock(&s->lock);
     /* get_cluster_offset returning 0 means success */
     ret = !ret;
 
@@ -1596,7 +1598,7 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_close = vmdk_close,
     .bdrv_create = vmdk_create,
     .bdrv_co_flush_to_disk = vmdk_co_flush,
-    .bdrv_is_allocated = vmdk_is_allocated,
+    .bdrv_co_is_allocated = vmdk_co_is_allocated,
     .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
 
     .create_options = vmdk_create_options,

View File

@@ -2758,7 +2758,7 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-static int vvfat_is_allocated(BlockDriverState *bs,
+static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int* n)
 {
     BDRVVVFATState* s = bs->opaque;
@@ -2855,7 +2855,7 @@ static BlockDriver bdrv_vvfat = {
     .bdrv_read = vvfat_co_read,
     .bdrv_write = vvfat_co_write,
     .bdrv_close = vvfat_close,
-    .bdrv_is_allocated = vvfat_is_allocated,
+    .bdrv_co_is_allocated = vvfat_co_is_allocated,
     .protocol_name = "fat",
 };

View File

@@ -34,6 +34,13 @@
 #define BLOCK_FLAG_ENCRYPT 1
 #define BLOCK_FLAG_COMPAT6 4
 
+#define BLOCK_IO_LIMIT_READ     0
+#define BLOCK_IO_LIMIT_WRITE    1
+#define BLOCK_IO_LIMIT_TOTAL    2
+
+#define BLOCK_IO_SLICE_TIME     100000000
+#define NANOSECONDS_PER_SECOND  1000000000.0
+
 #define BLOCK_OPT_SIZE      "size"
 #define BLOCK_OPT_ENCRYPT   "encryption"
 #define BLOCK_OPT_COMPAT6   "compat6"
@@ -44,12 +51,24 @@
 #define BLOCK_OPT_PREALLOC  "preallocation"
 #define BLOCK_OPT_SUBFMT    "subformat"
 
+typedef struct BdrvTrackedRequest BdrvTrackedRequest;
+
 typedef struct AIOPool {
     void (*cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
     BlockDriverAIOCB *free_aiocb;
 } AIOPool;
 
+typedef struct BlockIOLimit {
+    int64_t bps[3];
+    int64_t iops[3];
+} BlockIOLimit;
+
+typedef struct BlockIOBaseValue {
+    uint64_t bytes[2];
+    uint64_t ios[2];
+} BlockIOBaseValue;
+
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -63,8 +82,6 @@ struct BlockDriver {
                       const uint8_t *buf, int nb_sectors);
     void (*bdrv_close)(BlockDriverState *bs);
     int (*bdrv_create)(const char *filename, QEMUOptionParameter *options);
-    int (*bdrv_is_allocated)(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum);
     int (*bdrv_set_key)(BlockDriverState *bs, const char *key);
     int (*bdrv_make_empty)(BlockDriverState *bs);
     /* aio */
@@ -86,6 +103,8 @@ struct BlockDriver {
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
+    int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum);
 
     /*
      * Invalidate any cached meta-data.
@@ -179,6 +198,8 @@ struct BlockDriverState {
     int encrypted; /* if true, the media is encrypted */
     int valid_key; /* if true, a valid encryption key has been set */
     int sg;        /* if true, the device is a /dev/sg* */
+    int copy_on_read; /* if true, copy read backing sectors into image
+                         note this is a reference count */
 
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
@@ -201,6 +222,16 @@ struct BlockDriverState {
 
     void *sync_aiocb;
 
+    /* the time for latest disk I/O */
+    int64_t slice_time;
+    int64_t slice_start;
+    int64_t slice_end;
+    BlockIOLimit io_limits;
+    BlockIOBaseValue io_base;
+    CoQueue throttled_reqs;
+    QEMUTimer *block_timer;
+    bool io_limits_enabled;
+
     /* I/O stats (display with "info blockstats"). */
     uint64_t nr_bytes[BDRV_MAX_IOTYPE];
     uint64_t nr_ops[BDRV_MAX_IOTYPE];
@@ -228,6 +259,8 @@ struct BlockDriverState {
     int in_use; /* users other than guest access, eg. block migration */
     QTAILQ_ENTRY(BlockDriverState) list;
     void *private;
+
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 };
 
 struct BlockDriverAIOCB {
@@ -244,6 +277,9 @@ void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                    BlockDriverCompletionFunc *cb, void *opaque);
 void qemu_aio_release(void *p);
 
+void bdrv_set_io_limits(BlockDriverState *bs,
+                        BlockIOLimit *io_limits);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif

View File

@@ -216,6 +216,26 @@ static int parse_block_error_action(const char *buf, int is_read)
     }
 }
 
+static bool do_check_io_limits(BlockIOLimit *io_limits)
+{
+    bool bps_flag;
+    bool iops_flag;
+
+    assert(io_limits);
+
+    bps_flag  = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0)
+                 && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0)
+                 || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0));
+    iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0)
+                 && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0)
+                 || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0));
+    if (bps_flag || iops_flag) {
+        return false;
+    }
+
+    return true;
+}
+
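The rule do_check_io_limits() encodes is that, within each category, the total limit and the split read/write limits are mutually exclusive: a drive may specify bps or bps_rd/bps_wr, and iops or iops_rd/iops_wr, but not both at once. As an illustration (hypothetical option strings, not from the patch), -drive file=disk.img,bps=1000000 and -drive file=disk.img,bps_rd=500000,iops=200 pass the check, while -drive file=disk.img,bps=1000000,bps_wr=500000 fails it and is rejected by drive_init() below with the "cannot be used at the same time" error.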
 DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 {
     const char *buf;
@@ -235,7 +255,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     int on_read_error, on_write_error;
     const char *devaddr;
     DriveInfo *dinfo;
+    BlockIOLimit io_limits;
     int snapshot = 0;
+    bool copy_on_read;
     int ret;
 
     translation = BIOS_ATA_TRANSLATION_AUTO;
@@ -252,6 +274,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 
     snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
     ro = qemu_opt_get_bool(opts, "readonly", 0);
+    copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false);
 
     file = qemu_opt_get(opts, "file");
     serial = qemu_opt_get(opts, "serial");
@@ -353,6 +376,26 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         }
     }
 
+    /* disk I/O throttling */
+    io_limits.bps[BLOCK_IO_LIMIT_TOTAL] =
+        qemu_opt_get_number(opts, "bps", 0);
+    io_limits.bps[BLOCK_IO_LIMIT_READ] =
+        qemu_opt_get_number(opts, "bps_rd", 0);
+    io_limits.bps[BLOCK_IO_LIMIT_WRITE] =
+        qemu_opt_get_number(opts, "bps_wr", 0);
+    io_limits.iops[BLOCK_IO_LIMIT_TOTAL] =
+        qemu_opt_get_number(opts, "iops", 0);
+    io_limits.iops[BLOCK_IO_LIMIT_READ] =
+        qemu_opt_get_number(opts, "iops_rd", 0);
+    io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+        qemu_opt_get_number(opts, "iops_wr", 0);
+
+    if (!do_check_io_limits(&io_limits)) {
+        error_report("bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) "
+                     "cannot be used at the same time");
+        return NULL;
+    }
+
     on_write_error = BLOCK_ERR_STOP_ENOSPC;
     if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
         if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
@@ -460,6 +503,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
 
+    /* disk I/O throttling */
+    bdrv_set_io_limits(dinfo->bdrv, &io_limits);
+
     switch(type) {
     case IF_IDE:
     case IF_SCSI:
@@ -502,6 +548,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         bdrv_flags |= (BDRV_O_SNAPSHOT|BDRV_O_CACHE_WB|BDRV_O_NO_FLUSH);
     }
 
+    if (copy_on_read) {
+        bdrv_flags |= BDRV_O_COPY_ON_READ;
+    }
+
     if (media == MEDIA_CDROM) {
         /* CDROM is fine for any interface, don't check. */
         ro = 1;
@@ -603,7 +653,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
         goto out;
     }
 
-    qemu_aio_flush();
+    bdrv_drain_all();
     bdrv_flush(bs);
 
     bdrv_close(bs);
@@ -715,6 +765,65 @@ int do_change_block(Monitor *mon, const char *device,
     return monitor_read_bdrv_key_start(mon, bs, NULL, NULL);
 }
 
+/* throttling disk I/O limits */
+int do_block_set_io_throttle(Monitor *mon,
+                             const QDict *qdict, QObject **ret_data)
+{
+    BlockIOLimit io_limits;
+    const char *devname = qdict_get_str(qdict, "device");
+    BlockDriverState *bs;
+
+    io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+        = qdict_get_try_int(qdict, "bps", -1);
+    io_limits.bps[BLOCK_IO_LIMIT_READ]
+        = qdict_get_try_int(qdict, "bps_rd", -1);
+    io_limits.bps[BLOCK_IO_LIMIT_WRITE]
+        = qdict_get_try_int(qdict, "bps_wr", -1);
+    io_limits.iops[BLOCK_IO_LIMIT_TOTAL]
+        = qdict_get_try_int(qdict, "iops", -1);
+    io_limits.iops[BLOCK_IO_LIMIT_READ]
+        = qdict_get_try_int(qdict, "iops_rd", -1);
+    io_limits.iops[BLOCK_IO_LIMIT_WRITE]
+        = qdict_get_try_int(qdict, "iops_wr", -1);
+
+    bs = bdrv_find(devname);
+    if (!bs) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, devname);
+        return -1;
+    }
+
+    if ((io_limits.bps[BLOCK_IO_LIMIT_TOTAL] == -1)
+        || (io_limits.bps[BLOCK_IO_LIMIT_READ] == -1)
+        || (io_limits.bps[BLOCK_IO_LIMIT_WRITE] == -1)
+        || (io_limits.iops[BLOCK_IO_LIMIT_TOTAL] == -1)
+        || (io_limits.iops[BLOCK_IO_LIMIT_READ] == -1)
+        || (io_limits.iops[BLOCK_IO_LIMIT_WRITE] == -1)) {
+        qerror_report(QERR_MISSING_PARAMETER,
+                      "bps/bps_rd/bps_wr/iops/iops_rd/iops_wr");
+        return -1;
+    }
+
+    if (!do_check_io_limits(&io_limits)) {
+        qerror_report(QERR_INVALID_PARAMETER_COMBINATION);
+        return -1;
+    }
+
+    bs->io_limits = io_limits;
+    bs->slice_time = BLOCK_IO_SLICE_TIME;
+
+    if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_enable(bs);
+    } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_disable(bs);
+    } else {
+        if (bs->block_timer) {
+            qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock));
+        }
+    }
+
+    return 0;
+}
+
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     const char *id = qdict_get_str(qdict, "id");
@@ -731,7 +840,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
     }
 
     /* quiesce block driver; prevent further io */
-    qemu_aio_flush();
+    bdrv_drain_all();
     bdrv_flush(bs);
     bdrv_close(bs);
View File

@@ -63,6 +63,8 @@ int do_block_set_passwd(Monitor *mon, const QDict *qdict, QObject **ret_data);
 int do_change_block(Monitor *mon, const char *device,
                     const char *filename, const char *fmt);
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_block_set_io_throttle(Monitor *mon,
+                             const QDict *qdict, QObject **ret_data);
 int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data);
 int do_block_resize(Monitor *mon, const QDict *qdict, QObject **ret_data);

cpus.c
View File

@@ -396,7 +396,7 @@ static void do_vm_stop(RunState state)
         pause_all_vcpus();
         runstate_set(state);
         vm_state_notify(0, state);
-        qemu_aio_flush();
+        bdrv_drain_all();
         bdrv_flush_all();
         monitor_protocol_event(QEVENT_STOP, NULL);
     }

View File

@@ -9,6 +9,7 @@
 
 #include "dma.h"
 #include "block_int.h"
+#include "trace.h"
 
 void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint)
 {
@@ -83,6 +84,8 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
 
 static void dma_complete(DMAAIOCB *dbs, int ret)
 {
+    trace_dma_complete(dbs, ret, dbs->common.cb);
+
     dma_bdrv_unmap(dbs);
     if (dbs->common.cb) {
         dbs->common.cb(dbs->common.opaque, ret);
@@ -106,6 +109,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
     target_phys_addr_t cur_addr, cur_len;
     void *mem;
 
+    trace_dma_bdrv_cb(dbs, ret);
+
     dbs->acb = NULL;
     dbs->sector_num += dbs->iov.size / 512;
     dma_bdrv_unmap(dbs);
@@ -130,6 +135,7 @@ static void dma_bdrv_cb(void *opaque, int ret)
     }
 
     if (dbs->iov.size == 0) {
+        trace_dma_map_wait(dbs);
         cpu_register_map_client(dbs, continue_after_map_failure);
         return;
     }
@@ -145,6 +151,8 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb)
 {
     DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
 
+    trace_dma_aio_cancel(dbs);
+
     if (dbs->acb) {
         BlockDriverAIOCB *acb = dbs->acb;
         dbs->acb = NULL;
@@ -168,6 +176,8 @@ BlockDriverAIOCB *dma_bdrv_io(
 {
     DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
+    trace_dma_bdrv_io(dbs, bs, sector_num, to_dev);
+
     dbs->acb = NULL;
     dbs->bs = bs;
     dbs->sg = sg;

View File

@@ -860,9 +860,10 @@ ETEXI
         .args_type  = "pci_addr:s,opts:s",
         .params     = "[[<domain>:]<bus>:]<slot>\n"
                       "[file=file][,if=type][,bus=n]\n"
-                      "[,unit=m][,media=d][index=i]\n"
+                      "[,unit=m][,media=d][,index=i]\n"
                       "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
-                      "[snapshot=on|off][,cache=on|off]",
+                      "[,snapshot=on|off][,cache=on|off]\n"
+                      "[,readonly=on|off][,copy-on-read=on|off]",
         .help       = "add drive to PCI storage controller",
         .mhandler.cmd = drive_hot_add,
     },
@@ -1206,6 +1207,21 @@ ETEXI
         .mhandler.cmd_new = do_block_set_passwd,
     },
 
+STEXI
+@item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
+@findex block_set_io_throttle
+Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
+ETEXI
+
+    {
+        .name       = "block_set_io_throttle",
+        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l",
+        .params     = "device bps bps_rd bps_wr iops iops_rd iops_wr",
+        .help       = "change I/O throttle limits for a block drive",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_block_set_io_throttle,
+    },
+
 STEXI
 @item block_passwd @var{device} @var{password}
 @findex block_passwd

hmp.c
View File

@@ -216,6 +216,16 @@ void hmp_info_block(Monitor *mon)
                            info->value->inserted->ro,
                            info->value->inserted->drv,
                            info->value->inserted->encrypted);
+
+        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
+                        " bps_wr=%" PRId64 " iops=%" PRId64
+                        " iops_rd=%" PRId64 " iops_wr=%" PRId64,
+                        info->value->inserted->bps,
+                        info->value->inserted->bps_rd,
+                        info->value->inserted->bps_wr,
+                        info->value->inserted->iops,
+                        info->value->inserted->iops_rd,
+                        info->value->inserted->iops_wr);
     } else {
         monitor_printf(mon, " [not inserted]");
     }

View File

@@ -200,8 +200,9 @@ static void pmac_ide_flush(DBDMA_io *io)
 {
     MACIOIDEState *m = io->opaque;
 
-    if (m->aiocb)
-        qemu_aio_flush();
+    if (m->aiocb) {
+        bdrv_drain_all();
+    }
 }
 
 /* PowerMac IDE memory IO */

View File

@@ -309,7 +309,7 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
      * aio operation with preadv/pwritev.
      */
     if (bm->bus->dma->aiocb) {
-        qemu_aio_flush();
+        bdrv_drain_all();
         assert(bm->bus->dma->aiocb == NULL);
         assert((bm->status & BM_STATUS_DMAING) == 0);
     }

View File

@@ -474,7 +474,7 @@ static void virtio_blk_reset(VirtIODevice *vdev)
      * This should cancel pending requests, but can't do nicely until there
      * are per-device request lists.
      */
-    qemu_aio_flush();
+    bdrv_drain_all();
 }
 
 /* coalesce internal state, copy to pci i/o region 0

View File

@@ -49,7 +49,6 @@ static int syncwrite    = 0;
 static int batch_maps   = 0;
 
 static int max_requests = 32;
-static int use_aio      = 1;
 
 /* ------------------------------------------------------------- */
 
@@ -314,76 +313,6 @@ static int ioreq_map(struct ioreq *ioreq)
     return 0;
 }
 
-static int ioreq_runio_qemu_sync(struct ioreq *ioreq)
-{
-    struct XenBlkDev *blkdev = ioreq->blkdev;
-    int i, rc;
-    off_t pos;
-
-    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
-        goto err_no_map;
-    }
-    if (ioreq->presync) {
-        bdrv_flush(blkdev->bs);
-    }
-
-    switch (ioreq->req.operation) {
-    case BLKIF_OP_READ:
-        pos = ioreq->start;
-        for (i = 0; i < ioreq->v.niov; i++) {
-            rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE,
-                           ioreq->v.iov[i].iov_base,
-                           ioreq->v.iov[i].iov_len / BLOCK_SIZE);
-            if (rc != 0) {
-                xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n",
-                              ioreq->v.iov[i].iov_base,
-                              ioreq->v.iov[i].iov_len);
-                goto err;
-            }
-            pos += ioreq->v.iov[i].iov_len;
-        }
-        break;
-    case BLKIF_OP_WRITE:
-    case BLKIF_OP_WRITE_BARRIER:
-        if (!ioreq->req.nr_segments) {
-            break;
-        }
-        pos = ioreq->start;
-        for (i = 0; i < ioreq->v.niov; i++) {
-            rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE,
-                            ioreq->v.iov[i].iov_base,
-                            ioreq->v.iov[i].iov_len / BLOCK_SIZE);
-            if (rc != 0) {
-                xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n",
-                              ioreq->v.iov[i].iov_base,
-                              ioreq->v.iov[i].iov_len);
-                goto err;
-            }
-            pos += ioreq->v.iov[i].iov_len;
-        }
-        break;
-    default:
-        /* unknown operation (shouldn't happen -- parse catches this) */
-        goto err;
-    }
-
-    if (ioreq->postsync) {
-        bdrv_flush(blkdev->bs);
-    }
-    ioreq->status = BLKIF_RSP_OKAY;
-
-    ioreq_unmap(ioreq);
-    ioreq_finish(ioreq);
-    return 0;
-
-err:
-    ioreq_unmap(ioreq);
-err_no_map:
-    ioreq_finish(ioreq);
-    ioreq->status = BLKIF_RSP_ERROR;
-    return -1;
-}
-
 static void qemu_aio_complete(void *opaque, int ret)
 {
     struct ioreq *ioreq = opaque;
@@ -554,9 +483,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
     rp = blkdev->rings.common.sring->req_prod;
     xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
 
-    if (use_aio) {
-        blk_send_response_all(blkdev);
-    }
+    blk_send_response_all(blkdev);
 
     while (rc != rp) {
         /* pull request from ring */
         if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
@@ -579,16 +506,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
             continue;
         }
 
-        if (use_aio) {
-            /* run i/o in aio mode */
-            ioreq_runio_qemu_aio(ioreq);
-        } else {
-            /* run i/o in sync mode */
-            ioreq_runio_qemu_sync(ioreq);
-        }
-    }
-
-    if (!use_aio) {
-        blk_send_response_all(blkdev);
+        ioreq_runio_qemu_aio(ioreq);
     }
 
     if (blkdev->more_work && blkdev->requests_inflight < max_requests) {
View File

@@ -120,7 +120,7 @@ static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t v
        devices, and bit 2 the non-primary-master IDE devices. */
     if (val & UNPLUG_ALL_IDE_DISKS) {
         DPRINTF("unplug disks\n");
-        qemu_aio_flush();
+        bdrv_drain_all();
         bdrv_flush_all();
         pci_unplug_disks(s->pci_dev.bus);
     }

View File

@@ -370,13 +370,27 @@
 #
 # @encrypted: true if the backing device is encrypted
 #
+# @bps: total throughput limit in bytes per second is specified
+#
+# @bps_rd: read throughput limit in bytes per second is specified
+#
+# @bps_wr: write throughput limit in bytes per second is specified
+#
+# @iops: total I/O operations per second is specified
+#
+# @iops_rd: read I/O operations per second is specified
+#
+# @iops_wr: write I/O operations per second is specified
+#
 # Since: 0.14.0
 #
 # Notes: This interface is only found in @BlockInfo.
 ##
 { 'type': 'BlockDeviceInfo',
   'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str',
-            '*backing_file': 'str', 'encrypted': 'bool' } }
+            '*backing_file': 'str', 'encrypted': 'bool',
+            'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
+            'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int'} }
 
 ##
 # @BlockDeviceIoStatus:

View File

@@ -341,6 +341,12 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return res.ll;
 }
 
+/* Round number down to multiple */
+#define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m))
+
 #include "module.h"
 
 #endif
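A quick standalone illustration of the two new macros (not part of the patch; the 512-byte sector size and the values are arbitrary). Integer division truncates toward zero, which is why QEMU_ALIGN_DOWN rounds down, and adding m - 1 before rounding down is what turns it into a round-up:

#include <assert.h>

#define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define QEMU_ALIGN_UP(n, m)   QEMU_ALIGN_DOWN((n) + (m) - 1, (m))

int main(void)
{
    assert(QEMU_ALIGN_DOWN(1000, 512) == 512);  /* round down to a sector */
    assert(QEMU_ALIGN_UP(1000, 512) == 1024);   /* round up to a sector */
    assert(QEMU_ALIGN_UP(1024, 512) == 1024);   /* already aligned */
    return 0;
}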

View File

@@ -85,6 +85,34 @@ static QemuOptsList qemu_drive_opts = {
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
             .help = "open drive file as read-only",
+        },{
+            .name = "iops",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit total I/O operations per second",
+        },{
+            .name = "iops_rd",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit read operations per second",
+        },{
+            .name = "iops_wr",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit write operations per second",
+        },{
+            .name = "bps",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit total bytes per second",
+        },{
+            .name = "bps_rd",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit read bytes per second",
+        },{
+            .name = "bps_wr",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit write bytes per second",
+        },{
+            .name = "copy-on-read",
+            .type = QEMU_OPT_BOOL,
+            .help = "copy read data from backing file into image file",
         },
         { /* end of list */ }
     },

View File

@@ -61,6 +61,14 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
     assert(qemu_in_coroutine());
 }
 
+void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue)
+{
+    Coroutine *self = qemu_coroutine_self();
+    QTAILQ_INSERT_HEAD(&queue->entries, self, co_queue_next);
+    qemu_coroutine_yield();
+    assert(qemu_in_coroutine());
+}
+
 bool qemu_co_queue_next(CoQueue *queue)
 {
     Coroutine *next;
@@ -76,6 +84,13 @@ bool qemu_co_queue_next(CoQueue *queue)
     return (next != NULL);
 }
 
+void qemu_co_queue_restart_all(CoQueue *queue)
+{
+    while (qemu_co_queue_next(queue)) {
+        /* Do nothing */
+    }
+}
+
 bool qemu_co_queue_empty(CoQueue *queue)
 {
     return (QTAILQ_FIRST(&queue->entries) == NULL);
@@ -136,13 +151,7 @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
     assert(qemu_in_coroutine());
     if (lock->writer) {
         lock->writer = false;
-        while (!qemu_co_queue_empty(&lock->queue)) {
-            /*
-             * Wakeup every body. This will include some
-             * writers too.
-             */
-            qemu_co_queue_next(&lock->queue);
-        }
+        qemu_co_queue_restart_all(&lock->queue);
     } else {
         lock->reader--;
         assert(lock->reader >= 0);

View File

@@ -117,6 +117,12 @@ void qemu_co_queue_init(CoQueue *queue);
  */
 void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 
+/**
+ * Adds the current coroutine to the head of the CoQueue and transfers control
+ * to the caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue);
+
 /**
  * Restarts the next coroutine in the CoQueue and removes it from the queue.
  *
@@ -124,6 +130,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
  */
 bool qemu_co_queue_next(CoQueue *queue);
 
+/**
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
+ */
+void qemu_co_queue_restart_all(CoQueue *queue);
+
 /**
  * Checks if the CoQueue is empty.
  */
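A minimal sketch of how these queue primitives are typically combined (queue, request_co and dependency_complete are illustrative names, not from the patch): coroutines park themselves on a CoQueue with qemu_co_queue_wait(), and whatever clears the condition they were waiting on wakes them, one at a time with qemu_co_queue_next() or all at once with the new qemu_co_queue_restart_all(), exactly as the run_dependent_requests() change in block/qcow2.c above does:

static CoQueue queue;   /* set up once with qemu_co_queue_init(&queue) */

static void coroutine_fn request_co(void *opaque)
{
    /* Park this request until its dependency completes. */
    qemu_co_queue_wait(&queue);
    /* ... resumes here after qemu_co_queue_restart_all() ... */
}

static void dependency_complete(void)
{
    /* Wake every parked coroutine and leave the queue empty; shorthand
     * for looping over qemu_co_queue_next() until it returns false. */
    qemu_co_queue_restart_all(&queue);
}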

View File

@@ -1853,9 +1853,9 @@ int main(int argc, char **argv)
     command_loop();
 
     /*
-     * Make sure all outstanding requests get flushed the program exits.
+     * Make sure all outstanding requests complete before the program exits.
      */
-    qemu_aio_flush();
+    bdrv_drain_all();
 
     if (bs) {
         bdrv_delete(bs);
View File

@@ -135,7 +135,8 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
-    "       [,readonly=on|off]\n"
+    "       [,readonly=on|off][,copy-on-read=on|off]\n"
+    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
     "       use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
@@ -186,6 +187,9 @@ host disk is full; report the error to the guest otherwise).
 The default setting is @option{werror=enospc} and @option{rerror=report}.
 @item readonly
 Open drive @option{file} as read-only. Guest write attempts will fail.
+@item copy-on-read=@var{copy-on-read}
+@var{copy-on-read} is "on" or "off" and enables whether to copy read backing
+file sectors into the image file.
 @end table
 
 By default, writethrough caching is used for all block device. This means that
@@ -217,6 +221,10 @@ like your host losing power, the disk storage getting disconnected accidently,
 etc. you're image will most probably be rendered unusable.  When using
 the @option{-snapshot} option, unsafe caching is always used.
 
+Copy-on-read avoids accessing the same backing file sectors repeatedly and is
+useful when the backing file is over a slow network.  By default copy-on-read
+is off.
+
 Instead of @option{-cdrom} you can use:
 @example
 qemu -drive file=file,index=2,media=cdrom
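As an illustration of the new copy-on-read option documented above (a hypothetical invocation, not part of the patch), a guest booted from a local qcow2 overlay whose backing file sits on slow or remote storage would gradually populate the overlay as it reads:

@example
qemu -drive file=local-overlay.qcow2,copy-on-read=on
@end example

After the first access, reads of the same sectors are served from the local image instead of going back to the backing file.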

View File

@@ -251,6 +251,10 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_QGA_COMMAND_FAILED,
         .desc      = "Guest agent command failed, error was '%(message)'",
     },
+    {
+        .error_fmt = QERR_INVALID_PARAMETER_COMBINATION,
+        .desc      = "Invalid parameter combination",
+    },
     {}
 };

View File

@@ -207,4 +207,7 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_QGA_COMMAND_FAILED \
     "{ 'class': 'QgaCommandFailed', 'data': { 'message': %s } }"
 
+#define QERR_INVALID_PARAMETER_COMBINATION \
+    "{ 'class': 'InvalidParameterCombination', 'data': {} }"
+
 #endif /* QERROR_H */

View File

@@ -848,6 +848,44 @@ Example:
                                                "password": "12345" } }
 <- { "return": {} }
 
+EQMP
+
+    {
+        .name       = "block_set_io_throttle",
+        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l",
+        .params     = "device bps bps_rd bps_wr iops iops_rd iops_wr",
+        .help       = "change I/O throttle limits for a block drive",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_block_set_io_throttle,
+    },
+
+SQMP
+block_set_io_throttle
+------------
+
+Change I/O throttle limits for a block drive.
+
+Arguments:
+
+- "device": device name (json-string)
+- "bps": total throughput limit in bytes per second (json-int)
+- "bps_rd": read throughput limit in bytes per second (json-int)
+- "bps_wr": write throughput limit in bytes per second (json-int)
+- "iops": total I/O operations per second (json-int)
+- "iops_rd": read I/O operations per second (json-int)
+- "iops_wr": write I/O operations per second (json-int)
+
+Example:
+
+-> { "execute": "block_set_io_throttle", "arguments": { "device": "virtio0",
+                                                        "bps": "1000000",
+                                                        "bps_rd": "0",
+                                                        "bps_wr": "0",
+                                                        "iops": "0",
+                                                        "iops_rd": "0",
+                                                        "iops_wr": "0" } }
+<- { "return": {} }
+
 EQMP
 
     {
@@ -1152,6 +1190,13 @@ Each json-object contain the following:
                                 "tftp", "vdi", "vmdk", "vpc", "vvfat"
 - "backing_file": backing file name (json-string, optional)
 - "encrypted": true if encrypted, false otherwise (json-bool)
+- "bps": limit total bytes per second (json-int)
+- "bps_rd": limit read bytes per second (json-int)
+- "bps_wr": limit write bytes per second (json-int)
+- "iops": limit total I/O operations per second (json-int)
+- "iops_rd": limit read operations per second (json-int)
+- "iops_wr": limit write operations per second (json-int)
 - "io-status": I/O operation status, only present if the device supports it
                and the VM is configured to stop on errors. It's always reset
                to "ok" when the "cont" command is issued (json_string, optional)
@@ -1171,7 +1216,13 @@ Example:
          "ro":false,
          "drv":"qcow2",
          "encrypted":false,
-         "file":"disks/test.img"
+         "file":"disks/test.img",
+         "bps":1000000,
+         "bps_rd":0,
+         "bps_wr":0,
+         "iops":1000000,
+         "iops_rd":0,
+         "iops_wr":0,
       },
       "type":"unknown"
    },

View File

@@ -2104,7 +2104,7 @@ int load_vmstate(const char *name)
     }
 
     /* Flush all IO requests so they don't interfere with the new state. */
-    qemu_aio_flush();
+    bdrv_drain_all();
 
     bs = NULL;
     while ((bs = bdrv_next(bs))) {
View File

@@ -69,6 +69,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
+bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
 
 # hw/virtio-blk.c
 virtio_blk_req_complete(void *req, int status) "req %p status %d"
@@ -631,3 +632,10 @@ win_helper_no_switch_pstate(uint32_t new_pstate_regs) "change_pstate: regs new=%
 win_helper_wrpil(uint32_t psrpil, uint32_t new_pil) "old=%x new=%x"
 win_helper_done(uint32_t tl) "tl=%d"
 win_helper_retry(uint32_t tl) "tl=%d"
+
+# dma-helpers.c
+dma_bdrv_io(void *dbs, void *bs, int64_t sector_num, bool to_dev) "dbs=%p bs=%p sector_num=%" PRId64 " to_dev=%d"
+dma_aio_cancel(void *dbs) "dbs=%p"
+dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p"
+dma_bdrv_cb(void *dbs, int ret) "dbs=%p ret=%d"
+dma_map_wait(void *dbs) "dbs=%p"

View File

@@ -351,7 +351,7 @@ void xen_invalidate_map_cache(void)
     MapCacheRev *reventry;
 
     /* Flush pending AIO before destroying the mapcache */
-    qemu_aio_flush();
+    bdrv_drain_all();
 
     QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
         DPRINTF("There should be no locked mappings at this time, "