Block layer patches:

- Active mirror (blockdev-mirror copy-mode=write-blocking)
 - bdrv_drain_*() fixes and test cases
 - Fix crash with scsi-hd and drive_del
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABAgAGBQJbJ+FaAAoJEH8JsnLIjy/WIRoP/29yh6sRfk+bgxxKczctZLSr
 hJYkf62oZG0NjwAqjA+P9TJjUFLuRS62cVE5IljvcKjcAaXIavJy5ZIqHx/T2BS2
 faaUMOjlhbMtH8Emeun8BoTzPXKME3m7IifxTvc1g40UkeTbeGFF8P3wZ32QtduO
 OwxkT4qM0tiV4rin2Gds4IGokigx9qJKAN3i2PJNlUYg9kXs7tjdVuJp73OK3ZVb
 w0uYvPsiY65GJ3VrLxyPrH3wCWMnI7Ep4ekbjYTusoV57iZdQ6e3kDwqAgCGNPaE
 BaCQx7Aza7D6EDE+fOKIegyh7AyUy+oA1kcA5Z4u4qsasvYIYm7bQIFh+ohRHHpf
 +SbvAY1dockOYuN6V6K2EkIU6jNFmgUFHAU9jHb7QBlgYDfsTBNOPgK6po3NtjJM
 Scv9aB7aHGmTvRFYb1LHHJWsiMOaWgXgCFelAzM6PfhSvPPE8110doGS7dqbDAI+
 PThdtvXoNACuvwNUKGc5I16iO9j3p1aL4RYz2e/9xVlsATwUYiU3D1ICpw/ejb4z
 gfwseeFxdLfWV/bNylYd9FZJSs2NoZhNrw+qghIkJ9hTO2YJguCLqQTtJzT2gTXT
 YVjEhTFGcOtTgG3Z+s5PhHwkWBCVmJWUFxh0eDa4ejfHqc1e+A2Tqxh4LgnsloaK
 WgHFympxV6Sb1s7V8i7V
 =6Krl
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging

Block layer patches:

- Active mirror (blockdev-mirror copy-mode=write-blocking)
- bdrv_drain_*() fixes and test cases
- Fix crash with scsi-hd and drive_del

# gpg: Signature made Mon 18 Jun 2018 17:44:10 BST
# gpg:                using RSA key 7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* remotes/kevin/tags/for-upstream: (35 commits)
  iotests: Add test for active mirroring
  block/mirror: Add copy mode QAPI interface
  block/mirror: Add active mirroring
  job: Add job_progress_increase_remaining()
  block/mirror: Add MirrorBDSOpaque
  block/dirty-bitmap: Add bdrv_dirty_iter_next_area
  test-hbitmap: Add non-advancing iter_next tests
  hbitmap: Add @advance param to hbitmap_iter_next()
  block: Generalize should_update_child() rule
  block/mirror: Use source as a BdrvChild
  block/mirror: Wait for in-flight op conflicts
  block/mirror: Use CoQueue to wait on in-flight ops
  block/mirror: Convert to coroutines
  block/mirror: Pull out mirror_perform()
  block: fix QEMU crash with scsi-hd and drive_del
  test-bdrv-drain: Test graph changes in drain_all section
  block: Allow graph changes in bdrv_drain_all_begin/end sections
  block: ignore_bds_parents parameter for drain functions
  block: Move bdrv_drain_all_begin() out of coroutine context
  block: Allow AIO_WAIT_WHILE with NULL ctx
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Peter Maydell 2018-06-19 16:04:43 +01:00
commit 0f01b9fdd4
24 changed files with 1836 additions and 319 deletions
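
The bdrv_drain_*() work in this series splits a drained section into three separate steps: a begin step that only raises a quiesce counter (so no new requests can be started), a poll step that reports whether anything is still in flight, and an end step that lowers the counter again. The following is a minimal, self-contained sketch of that counter-plus-poll pattern in plain C; it is an illustration only, not QEMU code, and all names in it are made up for the example.

    /* Simplified model of a drained section: a quiesce counter stops new
     * requests, and a separate poll step waits for requests that were
     * already in flight. Not QEMU code. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct Node {
        int quiesce_counter;   /* > 0 means: no new requests may be started */
        int in_flight;         /* requests already submitted */
    } Node;

    static void drained_begin(Node *n)
    {
        n->quiesce_counter++;          /* quiesce only; no polling here */
    }

    static bool drained_poll(Node *n)
    {
        return n->in_flight > 0;       /* true: caller must keep waiting */
    }

    static void drained_end(Node *n)
    {
        assert(n->quiesce_counter > 0);
        n->quiesce_counter--;
    }

    int main(void)
    {
        Node n = { .in_flight = 2 };

        drained_begin(&n);
        while (drained_poll(&n)) {
            n.in_flight--;             /* stands in for completing a request */
        }
        assert(n.in_flight == 0);      /* section is now quiescent */
        drained_end(&n);
        printf("drained section entered and left cleanly\n");
        return 0;
    }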

block.c

@@ -333,6 +333,10 @@ BlockDriverState *bdrv_new(void)
     qemu_co_queue_init(&bs->flush_queue);
 
+    for (i = 0; i < bdrv_drain_all_count; i++) {
+        bdrv_drained_begin(bs);
+    }
+
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 
     return bs;
@@ -818,7 +822,13 @@ static char *bdrv_child_get_parent_desc(BdrvChild *c)
 static void bdrv_child_cb_drained_begin(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    bdrv_drained_begin(bs);
+    bdrv_do_drained_begin_quiesce(bs, NULL, false);
+}
+
+static bool bdrv_child_cb_drained_poll(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    return bdrv_drain_poll(bs, false, NULL, false);
 }
 
 static void bdrv_child_cb_drained_end(BdrvChild *child)
@@ -902,9 +912,11 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,
 }
 
 const BdrvChildRole child_file = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .inherit_options = bdrv_inherited_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_poll    = bdrv_child_cb_drained_poll,
     .drained_end     = bdrv_child_cb_drained_end,
     .attach          = bdrv_child_cb_attach,
     .detach          = bdrv_child_cb_detach,
@@ -926,9 +938,11 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,
 }
 
 const BdrvChildRole child_format = {
+    .parent_is_bds   = true,
    .get_parent_desc  = bdrv_child_get_parent_desc,
     .inherit_options = bdrv_inherited_fmt_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_poll    = bdrv_child_cb_drained_poll,
     .drained_end     = bdrv_child_cb_drained_end,
     .attach          = bdrv_child_cb_attach,
     .detach          = bdrv_child_cb_detach,
@@ -1043,11 +1057,13 @@ static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
 }
 
 const BdrvChildRole child_backing = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .attach          = bdrv_backing_attach,
     .detach          = bdrv_backing_detach,
     .inherit_options = bdrv_backing_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_poll    = bdrv_child_cb_drained_poll,
     .drained_end     = bdrv_child_cb_drained_end,
     .inactivate      = bdrv_child_cb_inactivate,
     .update_filename = bdrv_backing_update_filename,
@@ -1152,7 +1168,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
                             int open_flags, Error **errp)
 {
     Error *local_err = NULL;
-    int ret;
+    int i, ret;
 
     bdrv_assign_node_name(bs, node_name, &local_err);
     if (local_err) {
@@ -1200,6 +1216,12 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
     assert(bdrv_min_mem_align(bs) != 0);
     assert(is_power_of_2(bs->bl.request_alignment));
 
+    for (i = 0; i < bs->quiesce_counter; i++) {
+        if (drv->bdrv_co_drain_begin) {
+            drv->bdrv_co_drain_begin(bs);
+        }
+    }
+
     return 0;
 open_failed:
     bs->drv = NULL;
@@ -2021,7 +2043,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
             child->role->detach(child);
         }
         if (old_bs->quiesce_counter && child->role->drained_end) {
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
+            int num = old_bs->quiesce_counter;
+            if (child->role->parent_is_bds) {
+                num -= bdrv_drain_all_count;
+            }
+            assert(num >= 0);
+            for (i = 0; i < num; i++) {
                 child->role->drained_end(child);
             }
         }
@@ -2033,7 +2060,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
     if (new_bs) {
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
         if (new_bs->quiesce_counter && child->role->drained_begin) {
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
+            int num = new_bs->quiesce_counter;
+            if (child->role->parent_is_bds) {
+                num -= bdrv_drain_all_count;
+            }
+            assert(num >= 0);
+            for (i = 0; i < num; i++) {
                 child->role->drained_begin(child);
             }
         }
@@ -3395,16 +3427,39 @@ static bool should_update_child(BdrvChild *c, BlockDriverState *to)
         return false;
     }
 
-    if (c->role == &child_backing) {
-        /* If @from is a backing file of @to, ignore the child to avoid
-         * creating a loop. We only want to change the pointer of other
-         * parents. */
-        QLIST_FOREACH(to_c, &to->children, next) {
-            if (to_c == c) {
-                break;
-            }
-        }
-        if (to_c) {
+    /* If the child @c belongs to the BDS @to, replacing the current
+     * c->bs by @to would mean to create a loop.
+     *
+     * Such a case occurs when appending a BDS to a backing chain.
+     * For instance, imagine the following chain:
+     *
+     *   guest device -> node A -> further backing chain...
+     *
+     * Now we create a new BDS B which we want to put on top of this
+     * chain, so we first attach A as its backing node:
+     *
+     *                   node B
+     *                     |
+     *                     v
+     *   guest device -> node A -> further backing chain...
+     *
+     * Finally we want to replace A by B.  When doing that, we want to
+     * replace all pointers to A by pointers to B -- except for the
+     * pointer from B because (1) that would create a loop, and (2)
+     * that pointer should simply stay intact:
+     *
+     *   guest device -> node B
+     *                     |
+     *                     v
+     *                   node A -> further backing chain...
+     *
+     * In general, when replacing a node A (c->bs) by a node B (@to),
+     * if A is a child of B, that means we cannot replace A by B there
+     * because that would create a loop.  Silently detaching A from B
+     * is also not really an option.  So overall just leaving A in
+     * place there is the most sensible choice. */
+    QLIST_FOREACH(to_c, &to->children, next) {
+        if (to_c == c) {
             return false;
         }
     }
@@ -3430,6 +3485,7 @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
 
     /* Put all parents into @list and calculate their cumulative permissions */
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
+        assert(c->bs == from);
         if (!should_update_child(c, to)) {
             continue;
         }
@@ -4037,6 +4093,14 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
     return QTAILQ_NEXT(bs, node_list);
 }
 
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
+{
+    if (!bs) {
+        return QTAILQ_FIRST(&all_bdrv_states);
+    }
+
+    return QTAILQ_NEXT(bs, bs_list);
+}
+
 const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
     return bs->node_name;
@@ -4948,7 +5012,7 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     AioContext *ctx = bdrv_get_aio_context(bs);
 
     aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs, NULL);
+    bdrv_parent_drained_begin(bs, NULL, false);
     bdrv_drain(bs); /* ensure there are no in-flight requests */
 
     while (aio_poll(ctx, false)) {
@@ -4962,7 +5026,7 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      */
    aio_context_acquire(new_context);
     bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs, NULL);
+    bdrv_parent_drained_end(bs, NULL, false);
     aio_enable_external(ctx);
     aio_context_release(new_context);
 }
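
The long comment added to should_update_child() above describes the rule in terms of nodes A and B; the same rule can be shown on a toy graph: when A is replaced by B, every parent pointer to A is redirected to B, except the pointer that originates from B itself, which would otherwise close a loop. A small self-contained illustration in plain C (toy types, not QEMU's BdrvChild graph):

    /* Toy model of the should_update_child() rule: redirect every parent
     * pointer from A to B, except the one that comes out of B itself. */
    #include <stdio.h>

    typedef struct ToyNode ToyNode;
    struct ToyNode {
        const char *name;
        ToyNode *child;      /* at most one child, for simplicity */
    };

    static void replace_node(ToyNode **parents[], int n_parents,
                             ToyNode *from, ToyNode *to)
    {
        for (int i = 0; i < n_parents; i++) {
            if (*parents[i] != from) {
                continue;
            }
            /* Skip the edge that originates from the replacement node. */
            if (parents[i] == &to->child) {
                continue;
            }
            *parents[i] = to;
        }
    }

    int main(void)
    {
        ToyNode a = { "A", NULL };
        ToyNode b = { "B", &a };          /* B already has A as its child */
        ToyNode *guest_device = &a;       /* guest device -> node A */

        ToyNode **parents[] = { &guest_device, &b.child };
        replace_node(parents, 2, &a, &b);

        printf("guest device -> %s, B -> %s\n",
               guest_device->name, b.child->name);   /* B, A: no loop */
        return 0;
    }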

block/backup.c

@@ -354,7 +354,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
     HBitmapIter hbi;
 
     hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
-    while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
+    while ((cluster = hbitmap_iter_next(&hbi, true)) != -1) {
         do {
             if (yield_and_check(job)) {
                 return 0;
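
The extra boolean passed to hbitmap_iter_next() here selects whether the iterator is advanced past the bit it returns; backup keeps the old advancing behaviour by passing true, while the new dirty-area iteration in block/dirty-bitmap.c passes false to peek ahead. A self-contained sketch of an iterator with such an advance flag (plain C, not the QEMU HBitmap code):

    /* Iterator with an "advance" flag: advance=0 peeks at the next set
     * bit without consuming it, advance=1 consumes it. Not QEMU code. */
    #include <stdio.h>

    typedef struct {
        const unsigned char *bits;
        long size;
        long pos;
    } ToyIter;

    static long toy_iter_next(ToyIter *it, int advance)
    {
        for (long i = it->pos; i < it->size; i++) {
            if (it->bits[i]) {
                if (advance) {
                    it->pos = i + 1;    /* consume the bit */
                }
                return i;
            }
        }
        return -1;
    }

    int main(void)
    {
        const unsigned char bits[] = { 0, 1, 1, 0, 1 };
        ToyIter it = { bits, 5, 0 };

        printf("peek: %ld\n", toy_iter_next(&it, 0));   /* 1, not consumed */
        printf("next: %ld\n", toy_iter_next(&it, 1));   /* 1 */
        printf("next: %ld\n", toy_iter_next(&it, 1));   /* 2 */
        printf("peek: %ld\n", toy_iter_next(&it, 0));   /* 4 */
        return 0;
    }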

block/block-backend.c

@@ -767,6 +767,11 @@ void blk_remove_bs(BlockBackend *blk)
 
     blk_update_root_state(blk);
 
+    /* bdrv_root_unref_child() will cause blk->root to become stale and may
+     * switch to a completion coroutine later on. Let's drain all I/O here
+     * to avoid that and a potential QEMU crash.
+     */
+    blk_drain(blk);
     bdrv_root_unref_child(blk->root);
     blk->root = NULL;
 }
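
The crash this hunk fixes happens when blk->root is dropped while requests are still in flight, so a completion can later run against a stale root; draining first guarantees that nothing is pending when the child is unreferenced. A minimal model of the hazard and the fix (plain C, not QEMU code):

    /* Model of the scsi-hd/drive_del crash pattern: completions still need
     * backend->root, so drain before dropping the root. Not QEMU code. */
    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        int *root;          /* stands in for blk->root */
        int pending;        /* completions not yet delivered */
    } Backend;

    static void drain(Backend *blk)
    {
        while (blk->pending > 0) {
            assert(blk->root != NULL);   /* completion still uses the root */
            blk->pending--;
        }
    }

    static void remove_root(Backend *blk, int do_drain_first)
    {
        if (do_drain_first) {
            drain(blk);                  /* mirrors blk_drain() in the fix */
        }
        assert(blk->pending == 0);       /* would fire without the drain */
        free(blk->root);
        blk->root = NULL;
    }

    int main(void)
    {
        Backend blk = { malloc(sizeof(int)), 3 };
        remove_root(&blk, 1);
        printf("root removed with no pending completions\n");
        return 0;
    }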

block/dirty-bitmap.c

@@ -519,7 +519,62 @@ void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
 int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
 {
-    return hbitmap_iter_next(&iter->hbi);
+    return hbitmap_iter_next(&iter->hbi, true);
+}
+
+/**
+ * Return the next consecutively dirty area in the dirty bitmap
+ * belonging to the given iterator @iter.
+ *
+ * @max_offset: Maximum value that may be returned for
+ *              *offset + *bytes
+ * @offset:     Will contain the start offset of the next dirty area
+ * @bytes:      Will contain the length of the next dirty area
+ *
+ * Returns: True if a dirty area could be found before max_offset
+ *          (which means that *offset and *bytes then contain valid
+ *          values), false otherwise.
+ *
+ * Note that @iter is never advanced if false is returned.  If an area
+ * is found (which means that true is returned), it will be advanced
+ * past that area.
+ */
+bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset,
+                               uint64_t *offset, int *bytes)
+{
+    uint32_t granularity = bdrv_dirty_bitmap_granularity(iter->bitmap);
+    uint64_t gran_max_offset;
+    int64_t ret;
+    int size;
+
+    if (max_offset == iter->bitmap->size) {
+        /* If max_offset points to the image end, round it up by the
+         * bitmap granularity */
+        gran_max_offset = ROUND_UP(max_offset, granularity);
+    } else {
+        gran_max_offset = max_offset;
+    }
+
+    ret = hbitmap_iter_next(&iter->hbi, false);
+    if (ret < 0 || ret + granularity > gran_max_offset) {
+        return false;
+    }
+
+    *offset = ret;
+    size = 0;
+
+    assert(granularity <= INT_MAX);
+
+    do {
+        /* Advance iterator */
+        ret = hbitmap_iter_next(&iter->hbi, true);
+        size += granularity;
+    } while (ret + granularity <= gran_max_offset &&
+             hbitmap_iter_next(&iter->hbi, false) == ret + granularity &&
+             size <= INT_MAX - granularity);
+
+    *bytes = MIN(size, max_offset - *offset);
+
+    return true;
 }
 
 /* Called within bdrv_dirty_bitmap_lock..unlock */
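
bdrv_dirty_iter_next_area() merges consecutive dirty chunks into one area, clips it to max_offset, and only advances the iterator when an area is actually returned. A compact standalone model of the same merge-and-clip logic, using a fixed granularity and a plain bool array instead of an HBitmap (illustration only, not QEMU code):

    /* "Next dirty area" model: dirty state per GRAN-byte chunk,
     * consecutive dirty chunks merged and clipped to max_offset. */
    #include <stdbool.h>
    #include <stdio.h>

    #define GRAN      4096
    #define N_CHUNKS  8

    static bool next_dirty_area(const bool *dirty, long *pos,
                                long max_offset, long *offset, long *bytes)
    {
        long chunk = *pos;

        while (chunk < N_CHUNKS && !dirty[chunk]) {
            chunk++;
        }
        if (chunk >= N_CHUNKS || chunk * GRAN >= max_offset) {
            return false;                     /* iterator not advanced */
        }

        *offset = chunk * GRAN;
        while (chunk < N_CHUNKS && dirty[chunk] && chunk * GRAN < max_offset) {
            chunk++;
        }
        *bytes = chunk * GRAN - *offset;
        if (*offset + *bytes > max_offset) {
            *bytes = max_offset - *offset;    /* clip, like MIN(size, ...) */
        }
        *pos = chunk;                         /* advance past the area */
        return true;
    }

    int main(void)
    {
        bool dirty[N_CHUNKS] = { false, true, true, false, true, true, true, false };
        long pos = 0, offset, bytes;

        while (next_dirty_area(dirty, &pos, 6 * GRAN, &offset, &bytes)) {
            printf("dirty area at %ld, %ld bytes\n", offset, bytes);
        }
        return 0;
    }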

block/io.c

@ -38,15 +38,18 @@
/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
static AioWait drain_all_aio_wait;
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes, BdrvRequestFlags flags); int64_t offset, int bytes, BdrvRequestFlags flags);
void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore) void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
bool ignore_bds_parents)
{ {
BdrvChild *c, *next; BdrvChild *c, *next;
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c == ignore) { if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
continue; continue;
} }
if (c->role->drained_begin) { if (c->role->drained_begin) {
@ -55,12 +58,13 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
} }
} }
void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore) void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
bool ignore_bds_parents)
{ {
BdrvChild *c, *next; BdrvChild *c, *next;
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c == ignore) { if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
continue; continue;
} }
if (c->role->drained_end) { if (c->role->drained_end) {
@ -69,6 +73,24 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
} }
} }
static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
bool ignore_bds_parents)
{
BdrvChild *c, *next;
bool busy = false;
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
continue;
}
if (c->role->drained_poll) {
busy |= c->role->drained_poll(c);
}
}
return busy;
}
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{ {
dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
@ -148,7 +170,9 @@ typedef struct {
bool done; bool done;
bool begin; bool begin;
bool recursive; bool recursive;
bool poll;
BdrvChild *parent; BdrvChild *parent;
bool ignore_bds_parents;
} BdrvCoDrainData; } BdrvCoDrainData;
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@ -164,67 +188,83 @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
/* Set data->done before reading bs->wakeup. */ /* Set data->done before reading bs->wakeup. */
atomic_mb_set(&data->done, true); atomic_mb_set(&data->done, true);
bdrv_wakeup(bs); bdrv_dec_in_flight(bs);
if (data->begin) {
g_free(data);
}
} }
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive) static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{ {
BdrvChild *child, *tmp; BdrvCoDrainData *data;
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) || if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
(!begin && !bs->drv->bdrv_co_drain_end)) { (!begin && !bs->drv->bdrv_co_drain_end)) {
return; return;
} }
data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data); data = g_new(BdrvCoDrainData, 1);
bdrv_coroutine_enter(bs, data.co); *data = (BdrvCoDrainData) {
BDRV_POLL_WHILE(bs, !data.done); .bs = bs,
.done = false,
.begin = begin
};
if (recursive) { /* Make sure the driver callback completes during the polling phase for
QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { * drain_begin. */
bdrv_drain_invoke(child->bs, begin, true); bdrv_inc_in_flight(bs);
} data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
aio_co_schedule(bdrv_get_aio_context(bs), data->co);
if (!begin) {
BDRV_POLL_WHILE(bs, !data->done);
g_free(data);
} }
} }
static bool bdrv_drain_recurse(BlockDriverState *bs) /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
BdrvChild *ignore_parent, bool ignore_bds_parents)
{ {
BdrvChild *child, *tmp; BdrvChild *child, *next;
bool waited;
/* Wait for drained requests to finish */ if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); return true;
}
QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { if (atomic_read(&bs->in_flight)) {
BlockDriverState *bs = child->bs; return true;
bool in_main_loop = }
qemu_get_current_aio_context() == qemu_get_aio_context();
assert(bs->refcnt > 0); if (recursive) {
if (in_main_loop) { assert(!ignore_bds_parents);
/* In case the recursive bdrv_drain_recurse processes a QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
* block_job_defer_to_main_loop BH and modifies the graph, if (bdrv_drain_poll(child->bs, recursive, child, false)) {
* let's hold a reference to bs until we are done. return true;
* }
* IOThread doesn't have such a BH, and it is not safe to call
* bdrv_unref without BQL, so skip doing it there.
*/
bdrv_ref(bs);
}
waited |= bdrv_drain_recurse(bs);
if (in_main_loop) {
bdrv_unref(bs);
} }
} }
return waited; return false;
}
static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
BdrvChild *ignore_parent)
{
/* Execute pending BHs first and check everything else only after the BHs
* have executed. */
while (aio_poll(bs->aio_context, false));
return bdrv_drain_poll(bs, recursive, ignore_parent, false);
} }
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
BdrvChild *parent); BdrvChild *parent, bool ignore_bds_parents,
bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
BdrvChild *parent); BdrvChild *parent, bool ignore_bds_parents);
static void bdrv_co_drain_bh_cb(void *opaque) static void bdrv_co_drain_bh_cb(void *opaque)
{ {
@ -232,11 +272,18 @@ static void bdrv_co_drain_bh_cb(void *opaque)
Coroutine *co = data->co; Coroutine *co = data->co;
BlockDriverState *bs = data->bs; BlockDriverState *bs = data->bs;
bdrv_dec_in_flight(bs); if (bs) {
if (data->begin) { bdrv_dec_in_flight(bs);
bdrv_do_drained_begin(bs, data->recursive, data->parent); if (data->begin) {
bdrv_do_drained_begin(bs, data->recursive, data->parent,
data->ignore_bds_parents, data->poll);
} else {
bdrv_do_drained_end(bs, data->recursive, data->parent,
data->ignore_bds_parents);
}
} else { } else {
bdrv_do_drained_end(bs, data->recursive, data->parent); assert(data->begin);
bdrv_drain_all_begin();
} }
data->done = true; data->done = true;
@ -245,7 +292,9 @@ static void bdrv_co_drain_bh_cb(void *opaque)
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
bool begin, bool recursive, bool begin, bool recursive,
BdrvChild *parent) BdrvChild *parent,
bool ignore_bds_parents,
bool poll)
{ {
BdrvCoDrainData data; BdrvCoDrainData data;
@ -260,8 +309,12 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
.begin = begin, .begin = begin,
.recursive = recursive, .recursive = recursive,
.parent = parent, .parent = parent,
.ignore_bds_parents = ignore_bds_parents,
.poll = poll,
}; };
bdrv_inc_in_flight(bs); if (bs) {
bdrv_inc_in_flight(bs);
}
aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
bdrv_co_drain_bh_cb, &data); bdrv_co_drain_bh_cb, &data);
@ -271,79 +324,106 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
assert(data.done); assert(data.done);
} }
void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
BdrvChild *parent) BdrvChild *parent, bool ignore_bds_parents)
{ {
BdrvChild *child, *next; assert(!qemu_in_coroutine());
if (qemu_in_coroutine()) {
bdrv_co_yield_to_drain(bs, true, recursive, parent);
return;
}
/* Stop things in parent-to-child order */ /* Stop things in parent-to-child order */
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
aio_disable_external(bdrv_get_aio_context(bs)); aio_disable_external(bdrv_get_aio_context(bs));
} }
bdrv_parent_drained_begin(bs, parent); bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
bdrv_drain_invoke(bs, true, false); bdrv_drain_invoke(bs, true);
bdrv_drain_recurse(bs); }
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
BdrvChild *parent, bool ignore_bds_parents,
bool poll)
{
BdrvChild *child, *next;
if (qemu_in_coroutine()) {
bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
poll);
return;
}
bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
if (recursive) { if (recursive) {
assert(!ignore_bds_parents);
bs->recursive_quiesce_counter++; bs->recursive_quiesce_counter++;
QLIST_FOREACH_SAFE(child, &bs->children, next, next) { QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
bdrv_do_drained_begin(child->bs, true, child); bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
false);
} }
} }
/*
* Wait for drained requests to finish.
*
* Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
* call is needed so things in this AioContext can make progress even
* though we don't return to the main AioContext loop - this automatically
* includes other nodes in the same AioContext and therefore all child
* nodes.
*/
if (poll) {
assert(!ignore_bds_parents);
BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
}
} }
void bdrv_drained_begin(BlockDriverState *bs) void bdrv_drained_begin(BlockDriverState *bs)
{ {
bdrv_do_drained_begin(bs, false, NULL); bdrv_do_drained_begin(bs, false, NULL, false, true);
} }
void bdrv_subtree_drained_begin(BlockDriverState *bs) void bdrv_subtree_drained_begin(BlockDriverState *bs)
{ {
bdrv_do_drained_begin(bs, true, NULL); bdrv_do_drained_begin(bs, true, NULL, false, true);
} }
void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
BdrvChild *parent) BdrvChild *parent, bool ignore_bds_parents)
{ {
BdrvChild *child, *next; BdrvChild *child, *next;
int old_quiesce_counter; int old_quiesce_counter;
if (qemu_in_coroutine()) { if (qemu_in_coroutine()) {
bdrv_co_yield_to_drain(bs, false, recursive, parent); bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
false);
return; return;
} }
assert(bs->quiesce_counter > 0); assert(bs->quiesce_counter > 0);
old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter); old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
/* Re-enable things in child-to-parent order */ /* Re-enable things in child-to-parent order */
bdrv_drain_invoke(bs, false, false); bdrv_drain_invoke(bs, false);
bdrv_parent_drained_end(bs, parent); bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
if (old_quiesce_counter == 1) { if (old_quiesce_counter == 1) {
aio_enable_external(bdrv_get_aio_context(bs)); aio_enable_external(bdrv_get_aio_context(bs));
} }
if (recursive) { if (recursive) {
assert(!ignore_bds_parents);
bs->recursive_quiesce_counter--; bs->recursive_quiesce_counter--;
QLIST_FOREACH_SAFE(child, &bs->children, next, next) { QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
bdrv_do_drained_end(child->bs, true, child); bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
} }
} }
} }
void bdrv_drained_end(BlockDriverState *bs) void bdrv_drained_end(BlockDriverState *bs)
{ {
bdrv_do_drained_end(bs, false, NULL); bdrv_do_drained_end(bs, false, NULL, false);
} }
void bdrv_subtree_drained_end(BlockDriverState *bs) void bdrv_subtree_drained_end(BlockDriverState *bs)
{ {
bdrv_do_drained_end(bs, true, NULL); bdrv_do_drained_end(bs, true, NULL, false);
} }
void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
@ -351,7 +431,7 @@ void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
int i; int i;
for (i = 0; i < new_parent->recursive_quiesce_counter; i++) { for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
bdrv_do_drained_begin(child->bs, true, child); bdrv_do_drained_begin(child->bs, true, child, false, true);
} }
} }
@ -360,7 +440,7 @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
int i; int i;
for (i = 0; i < old_parent->recursive_quiesce_counter; i++) { for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
bdrv_do_drained_end(child->bs, true, child); bdrv_do_drained_end(child->bs, true, child, false);
} }
} }
@ -370,10 +450,6 @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
* *
* Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
* AioContext. * AioContext.
*
* Only this BlockDriverState's AioContext is run, so in-flight requests must
* not depend on events in other AioContexts. In that case, use
* bdrv_drain_all() instead.
*/ */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs) void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{ {
@ -388,6 +464,39 @@ void bdrv_drain(BlockDriverState *bs)
bdrv_drained_end(bs); bdrv_drained_end(bs);
} }
static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
BdrvChild *child, *next;
assert(atomic_read(&bs->in_flight) == 0);
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
bdrv_drain_assert_idle(child->bs);
}
}
unsigned int bdrv_drain_all_count = 0;
static bool bdrv_drain_all_poll(void)
{
BlockDriverState *bs = NULL;
bool result = false;
/* Execute pending BHs first (may modify the graph) and check everything
* else only after the BHs have executed. */
while (aio_poll(qemu_get_aio_context(), false));
/* bdrv_drain_poll() can't make changes to the graph and we are holding the
* main AioContext lock, so iterating bdrv_next_all_states() is safe. */
while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
aio_context_acquire(aio_context);
result |= bdrv_drain_poll(bs, false, NULL, true);
aio_context_release(aio_context);
}
return result;
}
/* /*
* Wait for pending requests to complete across all BlockDriverStates * Wait for pending requests to complete across all BlockDriverStates
* *
@ -402,73 +511,51 @@ void bdrv_drain(BlockDriverState *bs)
*/ */
void bdrv_drain_all_begin(void) void bdrv_drain_all_begin(void)
{ {
/* Always run first iteration so any pending completion BHs run */ BlockDriverState *bs = NULL;
bool waited = true;
BlockDriverState *bs;
BdrvNextIterator it;
GSList *aio_ctxs = NULL, *ctx;
/* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread if (qemu_in_coroutine()) {
* or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
* nodes in several different AioContexts, so make sure we're in the main return;
* context. */ }
/* AIO_WAIT_WHILE() with a NULL context can only be called from the main
* loop AioContext, so make sure we're in the main context. */
assert(qemu_get_current_aio_context() == qemu_get_aio_context()); assert(qemu_get_current_aio_context() == qemu_get_aio_context());
assert(bdrv_drain_all_count < INT_MAX);
bdrv_drain_all_count++;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { /* Quiesce all nodes, without polling in-flight requests yet. The graph
* cannot change during this loop. */
while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs); AioContext *aio_context = bdrv_get_aio_context(bs);
/* Stop things in parent-to-child order */
aio_context_acquire(aio_context); aio_context_acquire(aio_context);
aio_disable_external(aio_context); bdrv_do_drained_begin(bs, false, NULL, true, false);
bdrv_parent_drained_begin(bs, NULL);
bdrv_drain_invoke(bs, true, true);
aio_context_release(aio_context); aio_context_release(aio_context);
if (!g_slist_find(aio_ctxs, aio_context)) {
aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
}
} }
/* Note that completion of an asynchronous I/O operation can trigger any /* Now poll the in-flight requests */
* number of other I/O operations on other devices---for example a AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
* coroutine can submit an I/O request to another device in response to
* request completion. Therefore we must keep looping until there was no
* more activity rather than simply draining each device independently.
*/
while (waited) {
waited = false;
for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = ctx->data; bdrv_drain_assert_idle(bs);
aio_context_acquire(aio_context);
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
if (aio_context == bdrv_get_aio_context(bs)) {
waited |= bdrv_drain_recurse(bs);
}
}
aio_context_release(aio_context);
}
} }
g_slist_free(aio_ctxs);
} }
void bdrv_drain_all_end(void) void bdrv_drain_all_end(void)
{ {
BlockDriverState *bs; BlockDriverState *bs = NULL;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { while ((bs = bdrv_next_all_states(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs); AioContext *aio_context = bdrv_get_aio_context(bs);
/* Re-enable things in child-to-parent order */
aio_context_acquire(aio_context); aio_context_acquire(aio_context);
bdrv_drain_invoke(bs, false, true); bdrv_do_drained_end(bs, false, NULL, true);
bdrv_parent_drained_end(bs, NULL);
aio_enable_external(aio_context);
aio_context_release(aio_context); aio_context_release(aio_context);
} }
assert(bdrv_drain_all_count > 0);
bdrv_drain_all_count--;
} }
void bdrv_drain_all(void) void bdrv_drain_all(void)
@ -591,6 +678,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
void bdrv_wakeup(BlockDriverState *bs) void bdrv_wakeup(BlockDriverState *bs)
{ {
aio_wait_kick(bdrv_get_aio_wait(bs)); aio_wait_kick(bdrv_get_aio_wait(bs));
aio_wait_kick(&drain_all_aio_wait);
} }
void bdrv_dec_in_flight(BlockDriverState *bs) void bdrv_dec_in_flight(BlockDriverState *bs)
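
The reworked bdrv_drain_all_begin() above quiesces every node first and only afterwards enters a single AIO_WAIT_WHILE() loop that polls until nothing is in flight anywhere, instead of polling each node while others are still unquiesced. A standalone sketch of that two-phase structure (plain C, not QEMU code):

    /* Two-phase drain-all model: phase 1 quiesces every node (no new
     * requests), phase 2 polls once until nothing is in flight anywhere. */
    #include <stdbool.h>
    #include <stdio.h>

    typedef struct {
        int quiesce_counter;
        int in_flight;
    } Node;

    static bool any_in_flight(Node *nodes, int n)
    {
        for (int i = 0; i < n; i++) {
            if (nodes[i].in_flight > 0) {
                return true;
            }
        }
        return false;
    }

    static void drain_all_begin(Node *nodes, int n)
    {
        for (int i = 0; i < n; i++) {
            nodes[i].quiesce_counter++;          /* phase 1: quiesce only */
        }
        while (any_in_flight(nodes, n)) {        /* phase 2: single poll loop */
            for (int i = 0; i < n; i++) {
                if (nodes[i].in_flight > 0) {
                    nodes[i].in_flight--;        /* stands in for completions */
                }
            }
        }
    }

    int main(void)
    {
        Node nodes[3] = { { 0, 1 }, { 0, 0 }, { 0, 2 } };
        drain_all_begin(nodes, 3);
        printf("all nodes quiesced and idle\n");
        return 0;
    }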

block/mirror.c

@ -13,6 +13,8 @@
#include "qemu/osdep.h" #include "qemu/osdep.h"
#include "qemu/cutils.h" #include "qemu/cutils.h"
#include "qemu/coroutine.h"
#include "qemu/range.h"
#include "trace.h" #include "trace.h"
#include "block/blockjob_int.h" #include "block/blockjob_int.h"
#include "block/block_int.h" #include "block/block_int.h"
@ -33,11 +35,12 @@ typedef struct MirrorBuffer {
QSIMPLEQ_ENTRY(MirrorBuffer) next; QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer; } MirrorBuffer;
typedef struct MirrorOp MirrorOp;
typedef struct MirrorBlockJob { typedef struct MirrorBlockJob {
BlockJob common; BlockJob common;
BlockBackend *target; BlockBackend *target;
BlockDriverState *mirror_top_bs; BlockDriverState *mirror_top_bs;
BlockDriverState *source;
BlockDriverState *base; BlockDriverState *base;
/* The name of the graph node to replace */ /* The name of the graph node to replace */
@ -48,8 +51,12 @@ typedef struct MirrorBlockJob {
Error *replace_blocker; Error *replace_blocker;
bool is_none_mode; bool is_none_mode;
BlockMirrorBackingMode backing_mode; BlockMirrorBackingMode backing_mode;
MirrorCopyMode copy_mode;
BlockdevOnError on_source_error, on_target_error; BlockdevOnError on_source_error, on_target_error;
bool synced; bool synced;
/* Set when the target is synced (dirty bitmap is clean, nothing
* in flight) and the job is running in active mode */
bool actively_synced;
bool should_complete; bool should_complete;
int64_t granularity; int64_t granularity;
size_t buf_size; size_t buf_size;
@ -65,25 +72,47 @@ typedef struct MirrorBlockJob {
unsigned long *in_flight_bitmap; unsigned long *in_flight_bitmap;
int in_flight; int in_flight;
int64_t bytes_in_flight; int64_t bytes_in_flight;
QTAILQ_HEAD(MirrorOpList, MirrorOp) ops_in_flight;
int ret; int ret;
bool unmap; bool unmap;
bool waiting_for_io;
int target_cluster_size; int target_cluster_size;
int max_iov; int max_iov;
bool initial_zeroing_ongoing; bool initial_zeroing_ongoing;
int in_active_write_counter;
} MirrorBlockJob; } MirrorBlockJob;
typedef struct MirrorOp { typedef struct MirrorBDSOpaque {
MirrorBlockJob *job;
} MirrorBDSOpaque;
struct MirrorOp {
MirrorBlockJob *s; MirrorBlockJob *s;
QEMUIOVector qiov; QEMUIOVector qiov;
int64_t offset; int64_t offset;
uint64_t bytes; uint64_t bytes;
} MirrorOp;
/* The pointee is set by mirror_co_read(), mirror_co_zero(), and
* mirror_co_discard() before yielding for the first time */
int64_t *bytes_handled;
bool is_pseudo_op;
bool is_active_write;
CoQueue waiting_requests;
QTAILQ_ENTRY(MirrorOp) next;
};
typedef enum MirrorMethod {
MIRROR_METHOD_COPY,
MIRROR_METHOD_ZERO,
MIRROR_METHOD_DISCARD,
} MirrorMethod;
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
int error) int error)
{ {
s->synced = false; s->synced = false;
s->actively_synced = false;
if (read) { if (read) {
return block_job_error_action(&s->common, s->on_source_error, return block_job_error_action(&s->common, s->on_source_error,
true, error); true, error);
@ -93,7 +122,42 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
} }
} }
static void mirror_iteration_done(MirrorOp *op, int ret) static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self,
MirrorBlockJob *s,
uint64_t offset,
uint64_t bytes)
{
uint64_t self_start_chunk = offset / s->granularity;
uint64_t self_end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity);
uint64_t self_nb_chunks = self_end_chunk - self_start_chunk;
while (find_next_bit(s->in_flight_bitmap, self_end_chunk,
self_start_chunk) < self_end_chunk &&
s->ret >= 0)
{
MirrorOp *op;
QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
uint64_t op_start_chunk = op->offset / s->granularity;
uint64_t op_nb_chunks = DIV_ROUND_UP(op->offset + op->bytes,
s->granularity) -
op_start_chunk;
if (op == self) {
continue;
}
if (ranges_overlap(self_start_chunk, self_nb_chunks,
op_start_chunk, op_nb_chunks))
{
qemu_co_queue_wait(&op->waiting_requests, NULL);
break;
}
}
}
}
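
mirror_wait_on_conflicts() maps byte ranges onto granularity-sized chunks and makes an operation wait whenever its chunk range overlaps one that is already in flight. The overlap test itself is plain interval arithmetic; a self-contained sketch with an assumed 64 KiB granularity (plain C, not QEMU's ranges_overlap()):

    /* Chunk-overlap check for mirror operation conflicts: byte ranges are
     * rounded to granularity chunks and tested for interval overlap. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define GRANULARITY 65536ULL

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    static bool chunks_overlap(uint64_t off1, uint64_t bytes1,
                               uint64_t off2, uint64_t bytes2)
    {
        uint64_t start1 = off1 / GRANULARITY;
        uint64_t end1   = DIV_ROUND_UP(off1 + bytes1, GRANULARITY);
        uint64_t start2 = off2 / GRANULARITY;
        uint64_t end2   = DIV_ROUND_UP(off2 + bytes2, GRANULARITY);

        return start1 < end2 && start2 < end1;
    }

    int main(void)
    {
        /* A guest write and a background copy touching the same chunk must
         * be serialized; disjoint chunks may run in parallel. */
        printf("%d\n", chunks_overlap(0, 4096, 32768, 4096));     /* 1: same chunk */
        printf("%d\n", chunks_overlap(0, 65536, 65536, 65536));   /* 0: adjacent */
        return 0;
    }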
static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
{ {
MirrorBlockJob *s = op->s; MirrorBlockJob *s = op->s;
struct iovec *iov; struct iovec *iov;
@ -113,7 +177,9 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
chunk_num = op->offset / s->granularity; chunk_num = op->offset / s->granularity;
nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity); nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
QTAILQ_REMOVE(&s->ops_in_flight, op, next);
if (ret >= 0) { if (ret >= 0) {
if (s->cow_bitmap) { if (s->cow_bitmap) {
bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
@ -123,16 +189,13 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
} }
} }
qemu_iovec_destroy(&op->qiov); qemu_iovec_destroy(&op->qiov);
g_free(op);
if (s->waiting_for_io) { qemu_co_queue_restart_all(&op->waiting_requests);
qemu_coroutine_enter(s->common.job.co); g_free(op);
}
} }
static void mirror_write_complete(void *opaque, int ret) static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret)
{ {
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s; MirrorBlockJob *s = op->s;
aio_context_acquire(blk_get_aio_context(s->common.blk)); aio_context_acquire(blk_get_aio_context(s->common.blk));
@ -149,9 +212,8 @@ static void mirror_write_complete(void *opaque, int ret)
aio_context_release(blk_get_aio_context(s->common.blk)); aio_context_release(blk_get_aio_context(s->common.blk));
} }
static void mirror_read_complete(void *opaque, int ret) static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret)
{ {
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s; MirrorBlockJob *s = op->s;
aio_context_acquire(blk_get_aio_context(s->common.blk)); aio_context_acquire(blk_get_aio_context(s->common.blk));
@ -166,8 +228,9 @@ static void mirror_read_complete(void *opaque, int ret)
mirror_iteration_done(op, ret); mirror_iteration_done(op, ret);
} else { } else {
blk_aio_pwritev(s->target, op->offset, &op->qiov, ret = blk_co_pwritev(s->target, op->offset,
0, mirror_write_complete, op); op->qiov.size, &op->qiov, 0);
mirror_write_complete(op, ret);
} }
aio_context_release(blk_get_aio_context(s->common.blk)); aio_context_release(blk_get_aio_context(s->common.blk));
} }
@ -216,68 +279,80 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
return ret; return ret;
} }
static inline void mirror_wait_for_io(MirrorBlockJob *s) static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
{ {
assert(!s->waiting_for_io); MirrorOp *op;
s->waiting_for_io = true;
qemu_coroutine_yield(); QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
s->waiting_for_io = false; /* Do not wait on pseudo ops, because it may in turn wait on
* some other operation to start, which may in fact be the
* caller of this function. Since there is only one pseudo op
* at any given time, we will always find some real operation
* to wait on. */
if (!op->is_pseudo_op && op->is_active_write == active) {
qemu_co_queue_wait(&op->waiting_requests, NULL);
return;
}
}
abort();
} }
/* Submit async read while handling COW. static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
* Returns: The number of bytes copied after and including offset,
* excluding any bytes copied prior to offset due to alignment.
* This will be @bytes if no alignment is necessary, or
* (new_end - offset) if tail is rounded up or down due to
* alignment or buffer limit.
*/
static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
uint64_t bytes)
{ {
BlockBackend *source = s->common.blk; /* Only non-active operations use up in-flight slots */
mirror_wait_for_any_operation(s, false);
}
/* Perform a mirror copy operation.
*
* *op->bytes_handled is set to the number of bytes copied after and
* including offset, excluding any bytes copied prior to offset due
* to alignment. This will be op->bytes if no alignment is necessary,
* or (new_end - op->offset) if the tail is rounded up or down due to
* alignment or buffer limit.
*/
static void coroutine_fn mirror_co_read(void *opaque)
{
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
int nb_chunks; int nb_chunks;
uint64_t ret; uint64_t ret;
MirrorOp *op;
uint64_t max_bytes; uint64_t max_bytes;
max_bytes = s->granularity * s->max_iov; max_bytes = s->granularity * s->max_iov;
/* We can only handle as much as buf_size at a time. */ /* We can only handle as much as buf_size at a time. */
bytes = MIN(s->buf_size, MIN(max_bytes, bytes)); op->bytes = MIN(s->buf_size, MIN(max_bytes, op->bytes));
assert(bytes); assert(op->bytes);
assert(bytes < BDRV_REQUEST_MAX_BYTES); assert(op->bytes < BDRV_REQUEST_MAX_BYTES);
ret = bytes; *op->bytes_handled = op->bytes;
if (s->cow_bitmap) { if (s->cow_bitmap) {
ret += mirror_cow_align(s, &offset, &bytes); *op->bytes_handled += mirror_cow_align(s, &op->offset, &op->bytes);
} }
assert(bytes <= s->buf_size); /* Cannot exceed BDRV_REQUEST_MAX_BYTES + INT_MAX */
assert(*op->bytes_handled <= UINT_MAX);
assert(op->bytes <= s->buf_size);
/* The offset is granularity-aligned because: /* The offset is granularity-aligned because:
* 1) Caller passes in aligned values; * 1) Caller passes in aligned values;
* 2) mirror_cow_align is used only when target cluster is larger. */ * 2) mirror_cow_align is used only when target cluster is larger. */
assert(QEMU_IS_ALIGNED(offset, s->granularity)); assert(QEMU_IS_ALIGNED(op->offset, s->granularity));
/* The range is sector-aligned, since bdrv_getlength() rounds up. */ /* The range is sector-aligned, since bdrv_getlength() rounds up. */
assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); assert(QEMU_IS_ALIGNED(op->bytes, BDRV_SECTOR_SIZE));
nb_chunks = DIV_ROUND_UP(bytes, s->granularity); nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
while (s->buf_free_count < nb_chunks) { while (s->buf_free_count < nb_chunks) {
trace_mirror_yield_in_flight(s, offset, s->in_flight); trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
mirror_wait_for_io(s); mirror_wait_for_free_in_flight_slot(s);
} }
/* Allocate a MirrorOp that is used as an AIO callback. */
op = g_new(MirrorOp, 1);
op->s = s;
op->offset = offset;
op->bytes = bytes;
/* Now make a QEMUIOVector taking enough granularity-sized chunks /* Now make a QEMUIOVector taking enough granularity-sized chunks
* from s->buf_free. * from s->buf_free.
*/ */
qemu_iovec_init(&op->qiov, nb_chunks); qemu_iovec_init(&op->qiov, nb_chunks);
while (nb_chunks-- > 0) { while (nb_chunks-- > 0) {
MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
size_t remaining = bytes - op->qiov.size; size_t remaining = op->bytes - op->qiov.size;
QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
s->buf_free_count--; s->buf_free_count--;
@ -286,44 +361,92 @@ static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
/* Copy the dirty cluster. */ /* Copy the dirty cluster. */
s->in_flight++; s->in_flight++;
s->bytes_in_flight += bytes; s->bytes_in_flight += op->bytes;
trace_mirror_one_iteration(s, offset, bytes); trace_mirror_one_iteration(s, op->offset, op->bytes);
blk_aio_preadv(source, offset, &op->qiov, 0, mirror_read_complete, op); ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes,
return ret; &op->qiov, 0);
mirror_read_complete(op, ret);
} }
static void mirror_do_zero_or_discard(MirrorBlockJob *s, static void coroutine_fn mirror_co_zero(void *opaque)
int64_t offset, {
uint64_t bytes, MirrorOp *op = opaque;
bool is_discard) int ret;
op->s->in_flight++;
op->s->bytes_in_flight += op->bytes;
*op->bytes_handled = op->bytes;
ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
mirror_write_complete(op, ret);
}
static void coroutine_fn mirror_co_discard(void *opaque)
{
MirrorOp *op = opaque;
int ret;
op->s->in_flight++;
op->s->bytes_in_flight += op->bytes;
*op->bytes_handled = op->bytes;
ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes);
mirror_write_complete(op, ret);
}
static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
unsigned bytes, MirrorMethod mirror_method)
{ {
MirrorOp *op; MirrorOp *op;
Coroutine *co;
int64_t bytes_handled = -1;
/* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed op = g_new(MirrorOp, 1);
* so the freeing in mirror_iteration_done is nop. */ *op = (MirrorOp){
op = g_new0(MirrorOp, 1); .s = s,
op->s = s; .offset = offset,
op->offset = offset; .bytes = bytes,
op->bytes = bytes; .bytes_handled = &bytes_handled,
};
qemu_co_queue_init(&op->waiting_requests);
s->in_flight++; switch (mirror_method) {
s->bytes_in_flight += bytes; case MIRROR_METHOD_COPY:
if (is_discard) { co = qemu_coroutine_create(mirror_co_read, op);
blk_aio_pdiscard(s->target, offset, break;
op->bytes, mirror_write_complete, op); case MIRROR_METHOD_ZERO:
} else { co = qemu_coroutine_create(mirror_co_zero, op);
blk_aio_pwrite_zeroes(s->target, offset, break;
op->bytes, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, case MIRROR_METHOD_DISCARD:
mirror_write_complete, op); co = qemu_coroutine_create(mirror_co_discard, op);
break;
default:
abort();
} }
QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
qemu_coroutine_enter(co);
/* At this point, ownership of op has been moved to the coroutine
* and the object may already be freed */
/* Assert that this value has been set */
assert(bytes_handled >= 0);
/* Same assertion as in mirror_co_read() (and for mirror_co_read()
* and mirror_co_discard(), bytes_handled == op->bytes, which
* is the @bytes parameter given to this function) */
assert(bytes_handled <= UINT_MAX);
return bytes_handled;
} }
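
mirror_perform() hands the operation to a per-method coroutine and afterwards relies only on the bytes_handled out-parameter, since the MirrorOp itself may already be gone by the time the call returns. A synchronous stand-in for that dispatch-and-report shape (plain C, no coroutines, not QEMU code):

    /* Dispatch-and-report model: one entry point selects the method and the
     * worker reports the bytes covered through an out-parameter, like
     * *op->bytes_handled above. Not QEMU code. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef enum { METHOD_COPY, METHOD_ZERO, METHOD_DISCARD } Method;

    static void run_op(Method m, uint64_t offset, uint64_t bytes,
                       int64_t *bytes_handled)
    {
        /* The worker reports its progress before doing the actual I/O,
         * which is what lets the caller forget about the op afterwards. */
        *bytes_handled = (int64_t)bytes;

        switch (m) {
        case METHOD_COPY:
            printf("copy    %llu+%llu\n", (unsigned long long)offset,
                   (unsigned long long)bytes);
            break;
        case METHOD_ZERO:
            printf("zero    %llu+%llu\n", (unsigned long long)offset,
                   (unsigned long long)bytes);
            break;
        case METHOD_DISCARD:
            printf("discard %llu+%llu\n", (unsigned long long)offset,
                   (unsigned long long)bytes);
            break;
        }
    }

    static uint64_t perform(Method m, uint64_t offset, uint64_t bytes)
    {
        int64_t bytes_handled = -1;

        run_op(m, offset, bytes, &bytes_handled);
        assert(bytes_handled >= 0);       /* the worker must have reported */
        return (uint64_t)bytes_handled;
    }

    int main(void)
    {
        perform(METHOD_ZERO, 0, 1 << 20);
        perform(METHOD_COPY, 1 << 20, 65536);
        return 0;
    }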
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{ {
BlockDriverState *source = s->source; BlockDriverState *source = s->mirror_top_bs->backing->bs;
int64_t offset, first_chunk; MirrorOp *pseudo_op;
uint64_t delay_ns = 0; int64_t offset;
uint64_t delay_ns = 0, ret = 0;
/* At least the first dirty chunk is mirrored in one iteration. */ /* At least the first dirty chunk is mirrored in one iteration. */
int nb_chunks = 1; int nb_chunks = 1;
bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target)); bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
@ -339,11 +462,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
} }
bdrv_dirty_bitmap_unlock(s->dirty_bitmap); bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
first_chunk = offset / s->granularity; mirror_wait_on_conflicts(NULL, s, offset, 1);
while (test_bit(first_chunk, s->in_flight_bitmap)) {
trace_mirror_yield_in_flight(s, offset, s->in_flight);
mirror_wait_for_io(s);
}
job_pause_point(&s->common.job); job_pause_point(&s->common.job);
@ -380,16 +499,27 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
nb_chunks * s->granularity); nb_chunks * s->granularity);
bdrv_dirty_bitmap_unlock(s->dirty_bitmap); bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
/* Before claiming an area in the in-flight bitmap, we have to
* create a MirrorOp for it so that conflicting requests can wait
* for it. mirror_perform() will create the real MirrorOps later,
* for now we just create a pseudo operation that will wake up all
* conflicting requests once all real operations have been
* launched. */
pseudo_op = g_new(MirrorOp, 1);
*pseudo_op = (MirrorOp){
.offset = offset,
.bytes = nb_chunks * s->granularity,
.is_pseudo_op = true,
};
qemu_co_queue_init(&pseudo_op->waiting_requests);
QTAILQ_INSERT_TAIL(&s->ops_in_flight, pseudo_op, next);
bitmap_set(s->in_flight_bitmap, offset / s->granularity, nb_chunks); bitmap_set(s->in_flight_bitmap, offset / s->granularity, nb_chunks);
while (nb_chunks > 0 && offset < s->bdev_length) { while (nb_chunks > 0 && offset < s->bdev_length) {
int ret; int ret;
int64_t io_bytes; int64_t io_bytes;
int64_t io_bytes_acct; int64_t io_bytes_acct;
enum MirrorMethod { MirrorMethod mirror_method = MIRROR_METHOD_COPY;
MIRROR_METHOD_COPY,
MIRROR_METHOD_ZERO,
MIRROR_METHOD_DISCARD
} mirror_method = MIRROR_METHOD_COPY;
assert(!(offset % s->granularity)); assert(!(offset % s->granularity));
ret = bdrv_block_status_above(source, NULL, offset, ret = bdrv_block_status_above(source, NULL, offset,
@ -419,37 +549,34 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
while (s->in_flight >= MAX_IN_FLIGHT) { while (s->in_flight >= MAX_IN_FLIGHT) {
trace_mirror_yield_in_flight(s, offset, s->in_flight); trace_mirror_yield_in_flight(s, offset, s->in_flight);
mirror_wait_for_io(s); mirror_wait_for_free_in_flight_slot(s);
} }
if (s->ret < 0) { if (s->ret < 0) {
return 0; ret = 0;
goto fail;
} }
io_bytes = mirror_clip_bytes(s, offset, io_bytes); io_bytes = mirror_clip_bytes(s, offset, io_bytes);
switch (mirror_method) { io_bytes = mirror_perform(s, offset, io_bytes, mirror_method);
case MIRROR_METHOD_COPY: if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) {
io_bytes = io_bytes_acct = mirror_do_read(s, offset, io_bytes); io_bytes_acct = 0;
break; } else {
case MIRROR_METHOD_ZERO: io_bytes_acct = io_bytes;
case MIRROR_METHOD_DISCARD:
mirror_do_zero_or_discard(s, offset, io_bytes,
mirror_method == MIRROR_METHOD_DISCARD);
if (write_zeroes_ok) {
io_bytes_acct = 0;
} else {
io_bytes_acct = io_bytes;
}
break;
default:
abort();
} }
assert(io_bytes); assert(io_bytes);
offset += io_bytes; offset += io_bytes;
nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity); nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct); delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
} }
return delay_ns;
ret = delay_ns;
fail:
QTAILQ_REMOVE(&s->ops_in_flight, pseudo_op, next);
qemu_co_queue_restart_all(&pseudo_op->waiting_requests);
g_free(pseudo_op);
return ret;
} }
static void mirror_free_init(MirrorBlockJob *s) static void mirror_free_init(MirrorBlockJob *s)
@ -476,7 +603,7 @@ static void mirror_free_init(MirrorBlockJob *s)
static void mirror_wait_for_all_io(MirrorBlockJob *s) static void mirror_wait_for_all_io(MirrorBlockJob *s)
{ {
while (s->in_flight > 0) { while (s->in_flight > 0) {
mirror_wait_for_io(s); mirror_wait_for_free_in_flight_slot(s);
} }
} }
@ -489,8 +616,9 @@ static void mirror_exit(Job *job, void *opaque)
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
BlockJob *bjob = &s->common; BlockJob *bjob = &s->common;
MirrorExitData *data = opaque; MirrorExitData *data = opaque;
MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque;
AioContext *replace_aio_context = NULL; AioContext *replace_aio_context = NULL;
BlockDriverState *src = s->source; BlockDriverState *src = s->mirror_top_bs->backing->bs;
BlockDriverState *target_bs = blk_bs(s->target); BlockDriverState *target_bs = blk_bs(s->target);
BlockDriverState *mirror_top_bs = s->mirror_top_bs; BlockDriverState *mirror_top_bs = s->mirror_top_bs;
Error *local_err = NULL; Error *local_err = NULL;
@ -581,6 +709,7 @@ static void mirror_exit(Job *job, void *opaque)
blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort);
bs_opaque->job = NULL;
job_completed(job, data->ret, NULL); job_completed(job, data->ret, NULL);
g_free(data); g_free(data);
@ -605,7 +734,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{ {
int64_t offset; int64_t offset;
BlockDriverState *base = s->base; BlockDriverState *base = s->base;
BlockDriverState *bs = s->source; BlockDriverState *bs = s->mirror_top_bs->backing->bs;
BlockDriverState *target_bs = blk_bs(s->target); BlockDriverState *target_bs = blk_bs(s->target);
int ret; int ret;
int64_t count; int64_t count;
@ -631,11 +760,11 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
if (s->in_flight >= MAX_IN_FLIGHT) { if (s->in_flight >= MAX_IN_FLIGHT) {
trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
s->in_flight); s->in_flight);
mirror_wait_for_io(s); mirror_wait_for_free_in_flight_slot(s);
continue; continue;
} }
mirror_do_zero_or_discard(s, offset, bytes, false); mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO);
offset += bytes; offset += bytes;
} }
@ -687,7 +816,7 @@ static void coroutine_fn mirror_run(void *opaque)
{ {
MirrorBlockJob *s = opaque; MirrorBlockJob *s = opaque;
MirrorExitData *data; MirrorExitData *data;
BlockDriverState *bs = s->source; BlockDriverState *bs = s->mirror_top_bs->backing->bs;
BlockDriverState *target_bs = blk_bs(s->target); BlockDriverState *target_bs = blk_bs(s->target);
bool need_drain = true; bool need_drain = true;
int64_t length; int64_t length;
@ -730,6 +859,7 @@ static void coroutine_fn mirror_run(void *opaque)
/* Transition to the READY state and wait for complete. */ /* Transition to the READY state and wait for complete. */
job_transition_to_ready(&s->common.job); job_transition_to_ready(&s->common.job);
s->synced = true; s->synced = true;
s->actively_synced = true;
while (!job_is_cancelled(&s->common.job) && !s->should_complete) { while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
job_yield(&s->common.job); job_yield(&s->common.job);
} }
@ -781,6 +911,12 @@ static void coroutine_fn mirror_run(void *opaque)
int64_t cnt, delta; int64_t cnt, delta;
bool should_complete; bool should_complete;
/* Do not start passive operations while there are active
* writes in progress */
while (s->in_active_write_counter) {
mirror_wait_for_any_operation(s, true);
}
if (s->ret < 0) { if (s->ret < 0) {
ret = s->ret; ret = s->ret;
goto immediate_exit; goto immediate_exit;
@ -804,7 +940,7 @@ static void coroutine_fn mirror_run(void *opaque)
if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
(cnt == 0 && s->in_flight > 0)) { (cnt == 0 && s->in_flight > 0)) {
trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
mirror_wait_for_io(s); mirror_wait_for_free_in_flight_slot(s);
continue; continue;
} else if (cnt != 0) { } else if (cnt != 0) {
delay_ns = mirror_iteration(s); delay_ns = mirror_iteration(s);
@ -826,6 +962,9 @@ static void coroutine_fn mirror_run(void *opaque)
*/ */
job_transition_to_ready(&s->common.job); job_transition_to_ready(&s->common.job);
s->synced = true; s->synced = true;
if (s->copy_mode != MIRROR_COPY_MODE_BACKGROUND) {
s->actively_synced = true;
}
} }
should_complete = s->should_complete || should_complete = s->should_complete ||
@ -964,6 +1103,12 @@ static void mirror_pause(Job *job)
mirror_wait_for_all_io(s); mirror_wait_for_all_io(s);
} }
static bool mirror_drained_poll(BlockJob *job)
{
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
return !!s->in_flight;
}
static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context) static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{ {
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
@ -997,6 +1142,7 @@ static const BlockJobDriver mirror_job_driver = {
.pause = mirror_pause, .pause = mirror_pause,
.complete = mirror_complete, .complete = mirror_complete,
}, },
.drained_poll = mirror_drained_poll,
.attached_aio_context = mirror_attached_aio_context, .attached_aio_context = mirror_attached_aio_context,
.drain = mirror_drain, .drain = mirror_drain,
}; };
@ -1012,20 +1158,237 @@ static const BlockJobDriver commit_active_job_driver = {
.pause = mirror_pause, .pause = mirror_pause,
.complete = mirror_complete, .complete = mirror_complete,
}, },
.drained_poll = mirror_drained_poll,
.attached_aio_context = mirror_attached_aio_context, .attached_aio_context = mirror_attached_aio_context,
.drain = mirror_drain, .drain = mirror_drain,
}; };
static void do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BdrvDirtyBitmapIter *iter;
QEMUIOVector target_qiov;
uint64_t dirty_offset;
int dirty_bytes;
if (qiov) {
qemu_iovec_init(&target_qiov, qiov->niov);
}
iter = bdrv_dirty_iter_new(job->dirty_bitmap);
bdrv_set_dirty_iter(iter, offset);
while (true) {
bool valid_area;
int ret;
bdrv_dirty_bitmap_lock(job->dirty_bitmap);
valid_area = bdrv_dirty_iter_next_area(iter, offset + bytes,
&dirty_offset, &dirty_bytes);
if (!valid_area) {
bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
break;
}
bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap,
dirty_offset, dirty_bytes);
bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
job_progress_increase_remaining(&job->common.job, dirty_bytes);
assert(dirty_offset - offset <= SIZE_MAX);
if (qiov) {
qemu_iovec_reset(&target_qiov);
qemu_iovec_concat(&target_qiov, qiov,
dirty_offset - offset, dirty_bytes);
}
switch (method) {
case MIRROR_METHOD_COPY:
ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
qiov ? &target_qiov : NULL, flags);
break;
case MIRROR_METHOD_ZERO:
assert(!qiov);
ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes,
flags);
break;
case MIRROR_METHOD_DISCARD:
assert(!qiov);
ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes);
break;
default:
abort();
}
if (ret >= 0) {
job_progress_update(&job->common.job, dirty_bytes);
} else {
BlockErrorAction action;
bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes);
job->actively_synced = false;
action = mirror_error_action(job, false, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT) {
if (!job->ret) {
job->ret = ret;
}
break;
}
}
}
bdrv_dirty_iter_free(iter);
if (qiov) {
qemu_iovec_destroy(&target_qiov);
}
}
static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s,
uint64_t offset,
uint64_t bytes)
{
MirrorOp *op;
uint64_t start_chunk = offset / s->granularity;
uint64_t end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity);
op = g_new(MirrorOp, 1);
*op = (MirrorOp){
.s = s,
.offset = offset,
.bytes = bytes,
.is_active_write = true,
};
qemu_co_queue_init(&op->waiting_requests);
QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
s->in_active_write_counter++;
mirror_wait_on_conflicts(op, s, offset, bytes);
bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
return op;
}
static void coroutine_fn active_write_settle(MirrorOp *op)
{
uint64_t start_chunk = op->offset / op->s->granularity;
uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
op->s->granularity);
if (!--op->s->in_active_write_counter && op->s->actively_synced) {
BdrvChild *source = op->s->mirror_top_bs->backing;
if (QLIST_FIRST(&source->bs->parents) == source &&
QLIST_NEXT(source, next_parent) == NULL)
{
/* Assert that we are back in sync once all active write
* operations are settled.
* Note that we can only assert this if the mirror node
* is the source node's only parent. */
assert(!bdrv_get_dirty_count(op->s->dirty_bitmap));
}
}
bitmap_clear(op->s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
QTAILQ_REMOVE(&op->s->ops_in_flight, op, next);
qemu_co_queue_restart_all(&op->waiting_requests);
g_free(op);
}
static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs, static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{ {
return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags); return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
} }
static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs,
MirrorMethod method, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
int flags)
{
MirrorOp *op = NULL;
MirrorBDSOpaque *s = bs->opaque;
int ret = 0;
bool copy_to_target;
copy_to_target = s->job->ret >= 0 &&
s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
if (copy_to_target) {
op = active_write_prepare(s->job, offset, bytes);
}
switch (method) {
case MIRROR_METHOD_COPY:
ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
break;
case MIRROR_METHOD_ZERO:
ret = bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
break;
case MIRROR_METHOD_DISCARD:
ret = bdrv_co_pdiscard(bs->backing->bs, offset, bytes);
break;
default:
abort();
}
if (ret < 0) {
goto out;
}
if (copy_to_target) {
do_sync_target_write(s->job, method, offset, bytes, qiov, flags);
}
out:
if (copy_to_target) {
active_write_settle(op);
}
return ret;
}
static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{ {
return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags); MirrorBDSOpaque *s = bs->opaque;
QEMUIOVector bounce_qiov;
void *bounce_buf;
int ret = 0;
bool copy_to_target;
copy_to_target = s->job->ret >= 0 &&
s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
if (copy_to_target) {
/* The guest might concurrently modify the data to write; but
* the data on source and destination must match, so we have
* to use a bounce buffer if we are going to write to the
* target now. */
bounce_buf = qemu_blockalign(bs, bytes);
iov_to_buf_full(qiov->iov, qiov->niov, 0, bounce_buf, bytes);
qemu_iovec_init(&bounce_qiov, 1);
qemu_iovec_add(&bounce_qiov, bounce_buf, bytes);
qiov = &bounce_qiov;
}
ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov,
flags);
if (copy_to_target) {
qemu_iovec_destroy(&bounce_qiov);
qemu_vfree(bounce_buf);
}
return ret;
} }
static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs) static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
@ -1040,13 +1403,15 @@ static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes, BdrvRequestFlags flags) int64_t offset, int bytes, BdrvRequestFlags flags)
{ {
return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags); return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL,
flags);
} }
static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs, static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
int64_t offset, int bytes) int64_t offset, int bytes)
{ {
return bdrv_co_pdiscard(bs->backing->bs, offset, bytes); return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes,
NULL, 0);
} }
static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts) static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts)
@ -1108,10 +1473,11 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
const BlockJobDriver *driver, const BlockJobDriver *driver,
bool is_none_mode, BlockDriverState *base, bool is_none_mode, BlockDriverState *base,
bool auto_complete, const char *filter_node_name, bool auto_complete, const char *filter_node_name,
bool is_mirror, bool is_mirror, MirrorCopyMode copy_mode,
Error **errp) Error **errp)
{ {
MirrorBlockJob *s; MirrorBlockJob *s;
MirrorBDSOpaque *bs_opaque;
BlockDriverState *mirror_top_bs; BlockDriverState *mirror_top_bs;
bool target_graph_mod; bool target_graph_mod;
bool target_is_backing; bool target_is_backing;
@ -1147,6 +1513,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
mirror_top_bs->total_sectors = bs->total_sectors; mirror_top_bs->total_sectors = bs->total_sectors;
mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
bs_opaque = g_new0(MirrorBDSOpaque, 1);
mirror_top_bs->opaque = bs_opaque;
bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs)); bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
/* bdrv_append takes ownership of the mirror_top_bs reference, need to keep /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
@ -1171,10 +1539,11 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
if (!s) { if (!s) {
goto fail; goto fail;
} }
bs_opaque->job = s;
/* The block job now has a reference to this node */ /* The block job now has a reference to this node */
bdrv_unref(mirror_top_bs); bdrv_unref(mirror_top_bs);
s->source = bs;
s->mirror_top_bs = mirror_top_bs; s->mirror_top_bs = mirror_top_bs;
/* No resize for the target either; while the mirror is still running, a /* No resize for the target either; while the mirror is still running, a
@ -1212,6 +1581,7 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
s->on_target_error = on_target_error; s->on_target_error = on_target_error;
s->is_none_mode = is_none_mode; s->is_none_mode = is_none_mode;
s->backing_mode = backing_mode; s->backing_mode = backing_mode;
s->copy_mode = copy_mode;
s->base = base; s->base = base;
s->granularity = granularity; s->granularity = granularity;
s->buf_size = ROUND_UP(buf_size, granularity); s->buf_size = ROUND_UP(buf_size, granularity);
@ -1247,6 +1617,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
} }
} }
QTAILQ_INIT(&s->ops_in_flight);
trace_mirror_start(bs, s, opaque); trace_mirror_start(bs, s, opaque);
job_start(&s->common.job); job_start(&s->common.job);
return; return;
@ -1259,6 +1631,7 @@ fail:
g_free(s->replaces); g_free(s->replaces);
blk_unref(s->target); blk_unref(s->target);
bs_opaque->job = NULL;
job_early_fail(&s->common.job); job_early_fail(&s->common.job);
} }
@ -1275,7 +1648,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
BlockdevOnError on_source_error, BlockdevOnError on_source_error,
BlockdevOnError on_target_error, BlockdevOnError on_target_error,
bool unmap, const char *filter_node_name, Error **errp) bool unmap, const char *filter_node_name,
MirrorCopyMode copy_mode, Error **errp)
{ {
bool is_none_mode; bool is_none_mode;
BlockDriverState *base; BlockDriverState *base;
@ -1290,7 +1664,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
speed, granularity, buf_size, backing_mode, speed, granularity, buf_size, backing_mode,
on_source_error, on_target_error, unmap, NULL, NULL, on_source_error, on_target_error, unmap, NULL, NULL,
&mirror_job_driver, is_none_mode, base, false, &mirror_job_driver, is_none_mode, base, false,
filter_node_name, true, errp); filter_node_name, true, copy_mode, errp);
} }
void commit_active_start(const char *job_id, BlockDriverState *bs, void commit_active_start(const char *job_id, BlockDriverState *bs,
@ -1313,7 +1687,8 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
MIRROR_LEAVE_BACKING_CHAIN, MIRROR_LEAVE_BACKING_CHAIN,
on_error, on_error, true, cb, opaque, on_error, on_error, true, cb, opaque,
&commit_active_job_driver, false, base, auto_complete, &commit_active_job_driver, false, base, auto_complete,
filter_node_name, false, &local_err); filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
&local_err);
if (local_err) { if (local_err) {
error_propagate(errp, local_err); error_propagate(errp, local_err);
goto error_restore_flags; goto error_restore_flags;


@ -3134,6 +3134,7 @@ static void vvfat_qcow_options(int *child_flags, QDict *child_options,
} }
static const BdrvChildRole child_vvfat_qcow = { static const BdrvChildRole child_vvfat_qcow = {
.parent_is_bds = true,
.inherit_options = vvfat_qcow_options, .inherit_options = vvfat_qcow_options,
}; };


@ -3586,6 +3586,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
bool has_unmap, bool unmap, bool has_unmap, bool unmap,
bool has_filter_node_name, bool has_filter_node_name,
const char *filter_node_name, const char *filter_node_name,
bool has_copy_mode, MirrorCopyMode copy_mode,
Error **errp) Error **errp)
{ {
@ -3610,6 +3611,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
if (!has_filter_node_name) { if (!has_filter_node_name) {
filter_node_name = NULL; filter_node_name = NULL;
} }
if (!has_copy_mode) {
copy_mode = MIRROR_COPY_MODE_BACKGROUND;
}
if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) { if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "granularity", error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "granularity",
@ -3640,7 +3644,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
has_replaces ? replaces : NULL, has_replaces ? replaces : NULL,
speed, granularity, buf_size, sync, backing_mode, speed, granularity, buf_size, sync, backing_mode,
on_source_error, on_target_error, unmap, filter_node_name, on_source_error, on_target_error, unmap, filter_node_name,
errp); copy_mode, errp);
} }
void qmp_drive_mirror(DriveMirror *arg, Error **errp) void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@ -3786,6 +3790,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
arg->has_on_target_error, arg->on_target_error, arg->has_on_target_error, arg->on_target_error,
arg->has_unmap, arg->unmap, arg->has_unmap, arg->unmap,
false, NULL, false, NULL,
arg->has_copy_mode, arg->copy_mode,
&local_err); &local_err);
bdrv_unref(target_bs); bdrv_unref(target_bs);
error_propagate(errp, local_err); error_propagate(errp, local_err);
@ -3806,6 +3811,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
BlockdevOnError on_target_error, BlockdevOnError on_target_error,
bool has_filter_node_name, bool has_filter_node_name,
const char *filter_node_name, const char *filter_node_name,
bool has_copy_mode, MirrorCopyMode copy_mode,
Error **errp) Error **errp)
{ {
BlockDriverState *bs; BlockDriverState *bs;
@ -3838,6 +3844,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
has_on_target_error, on_target_error, has_on_target_error, on_target_error,
true, true, true, true,
has_filter_node_name, filter_node_name, has_filter_node_name, filter_node_name,
has_copy_mode, copy_mode,
&local_err); &local_err);
error_propagate(errp, local_err); error_propagate(errp, local_err);


@ -155,6 +155,28 @@ static void child_job_drained_begin(BdrvChild *c)
job_pause(&job->job); job_pause(&job->job);
} }
static bool child_job_drained_poll(BdrvChild *c)
{
BlockJob *bjob = c->opaque;
Job *job = &bjob->job;
const BlockJobDriver *drv = block_job_driver(bjob);
/* An inactive or completed job doesn't have any pending requests. Jobs
* with !job->busy are either already paused or have a pause point after
* being reentered, so no job driver code will run before they pause. */
if (!job->busy || job_is_completed(job) || job->deferred_to_main_loop) {
return false;
}
/* Otherwise, assume that it isn't fully stopped yet, but allow the job to
* override this assumption. */
if (drv->drained_poll) {
return drv->drained_poll(bjob);
} else {
return true;
}
}
static void child_job_drained_end(BdrvChild *c) static void child_job_drained_end(BdrvChild *c)
{ {
BlockJob *job = c->opaque; BlockJob *job = c->opaque;
@ -164,6 +186,7 @@ static void child_job_drained_end(BdrvChild *c)
static const BdrvChildRole child_job = { static const BdrvChildRole child_job = {
.get_parent_desc = child_job_get_parent_desc, .get_parent_desc = child_job_get_parent_desc,
.drained_begin = child_job_drained_begin, .drained_begin = child_job_drained_begin,
.drained_poll = child_job_drained_poll,
.drained_end = child_job_drained_end, .drained_end = child_job_drained_end,
.stay_at_node = true, .stay_at_node = true,
}; };


@ -57,7 +57,8 @@ typedef struct {
/** /**
* AIO_WAIT_WHILE: * AIO_WAIT_WHILE:
* @wait: the aio wait object * @wait: the aio wait object
* @ctx: the aio context * @ctx: the aio context, or NULL if multiple aio contexts (for which the
* caller does not hold a lock) are involved in the polling condition.
* @cond: wait while this conditional expression is true * @cond: wait while this conditional expression is true
* *
* Wait while a condition is true. Use this to implement synchronous * Wait while a condition is true. Use this to implement synchronous
@ -73,29 +74,27 @@ typedef struct {
*/ */
#define AIO_WAIT_WHILE(wait, ctx, cond) ({ \ #define AIO_WAIT_WHILE(wait, ctx, cond) ({ \
bool waited_ = false; \ bool waited_ = false; \
bool busy_ = true; \
AioWait *wait_ = (wait); \ AioWait *wait_ = (wait); \
AioContext *ctx_ = (ctx); \ AioContext *ctx_ = (ctx); \
if (in_aio_context_home_thread(ctx_)) { \ if (ctx_ && in_aio_context_home_thread(ctx_)) { \
while ((cond) || busy_) { \ while ((cond)) { \
busy_ = aio_poll(ctx_, (cond)); \ aio_poll(ctx_, true); \
waited_ |= !!(cond) | busy_; \ waited_ = true; \
} \ } \
} else { \ } else { \
assert(qemu_get_current_aio_context() == \ assert(qemu_get_current_aio_context() == \
qemu_get_aio_context()); \ qemu_get_aio_context()); \
/* Increment wait_->num_waiters before evaluating cond. */ \ /* Increment wait_->num_waiters before evaluating cond. */ \
atomic_inc(&wait_->num_waiters); \ atomic_inc(&wait_->num_waiters); \
while (busy_) { \ while ((cond)) { \
if ((cond)) { \ if (ctx_) { \
waited_ = busy_ = true; \
aio_context_release(ctx_); \ aio_context_release(ctx_); \
aio_poll(qemu_get_aio_context(), true); \
aio_context_acquire(ctx_); \
} else { \
busy_ = aio_poll(ctx_, false); \
waited_ |= busy_; \
} \ } \
aio_poll(qemu_get_aio_context(), true); \
if (ctx_) { \
aio_context_acquire(ctx_); \
} \
waited_ = true; \
} \ } \
atomic_dec(&wait_->num_waiters); \ atomic_dec(&wait_->num_waiters); \
} \ } \
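The NULL-@ctx case is what lets a caller in the main loop wait for activity that is spread across several AioContexts. A minimal usage sketch, not taken from this series; my_wait and pending_requests are made-up names, and the counter is assumed to be decremented by request completion callbacks:

/* hypothetical helper, not part of this series */
static AioWait my_wait;
static int pending_requests;

static void wait_for_my_requests(void)
{
    /* ctx is NULL: the requests may live in several AioContexts, so only
     * the main AioContext is polled here. */
    AIO_WAIT_WHILE(&my_wait, NULL, atomic_read(&pending_requests) > 0);
}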


@ -421,6 +421,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
Error **errp); Error **errp);
bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
BlockDriverState *bdrv_next_node(BlockDriverState *bs); BlockDriverState *bdrv_next_node(BlockDriverState *bs);
BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
typedef struct BdrvNextIterator { typedef struct BdrvNextIterator {
enum { enum {
@ -557,7 +558,8 @@ void bdrv_io_unplug(BlockDriverState *bs);
* Begin a quiesced section of all users of @bs. This is part of * Begin a quiesced section of all users of @bs. This is part of
* bdrv_drained_begin. * bdrv_drained_begin.
*/ */
void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore); void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
bool ignore_bds_parents);
/** /**
* bdrv_parent_drained_end: * bdrv_parent_drained_end:
@ -565,7 +567,23 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
* End a quiesced section of all users of @bs. This is part of * End a quiesced section of all users of @bs. This is part of
* bdrv_drained_end. * bdrv_drained_end.
*/ */
void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore); void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
bool ignore_bds_parents);
/**
* bdrv_drain_poll:
*
* Poll for pending requests in @bs, its parents (except for @ignore_parent),
* and if @recursive is true its children as well (used for subtree drain).
*
* If @ignore_bds_parents is true, parents that are BlockDriverStates must
* ignore the drain request because they will be drained separately (used for
* drain_all).
*
* This is part of bdrv_drained_begin.
*/
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
BdrvChild *ignore_parent, bool ignore_bds_parents);
/** /**
* bdrv_drained_begin: * bdrv_drained_begin:
@ -579,6 +597,15 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
*/ */
void bdrv_drained_begin(BlockDriverState *bs); void bdrv_drained_begin(BlockDriverState *bs);
/**
* bdrv_do_drained_begin_quiesce:
*
* Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
* running requests to complete.
*/
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
BdrvChild *parent, bool ignore_bds_parents);
/** /**
* Like bdrv_drained_begin, but recursively begins a quiesced section for * Like bdrv_drained_begin, but recursively begins a quiesced section for
* exclusive access to all child nodes as well. * exclusive access to all child nodes as well.


@ -577,6 +577,12 @@ struct BdrvChildRole {
* points to. */ * points to. */
bool stay_at_node; bool stay_at_node;
/* If true, the parent is a BlockDriverState and bdrv_next_all_states()
* will return it. This information is used for drain_all, where every node
* will be drained separately, so the drain only needs to be propagated to
* non-BDS parents. */
bool parent_is_bds;
void (*inherit_options)(int *child_flags, QDict *child_options, void (*inherit_options)(int *child_flags, QDict *child_options,
int parent_flags, QDict *parent_options); int parent_flags, QDict *parent_options);
@ -605,6 +611,13 @@ struct BdrvChildRole {
void (*drained_begin)(BdrvChild *child); void (*drained_begin)(BdrvChild *child);
void (*drained_end)(BdrvChild *child); void (*drained_end)(BdrvChild *child);
/*
* Returns whether the parent has pending requests for the child. This
* callback is polled after .drained_begin() has been called until all
* activity on the child has stopped.
*/
bool (*drained_poll)(BdrvChild *child);
/* Notifies the parent that the child has been activated/inactivated (e.g. /* Notifies the parent that the child has been activated/inactivated (e.g.
* when migration is completing) and it can start/stop requesting * when migration is completing) and it can start/stop requesting
* permissions and doing I/O on it. */ * permissions and doing I/O on it. */
@ -841,6 +854,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags); BdrvRequestFlags flags);
extern unsigned int bdrv_drain_all_count;
void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent); void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent); void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
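Together with bdrv_drain_all_count, bdrv_next_all_states() lets drain_all-style code visit every node directly instead of iterating over BlockBackends. A hedged sketch of the iteration pattern follows; the loop body is a placeholder, and the start/end semantics are an assumption rather than something stated in the declaration above:

static void visit_all_nodes(void)
{
    BlockDriverState *bs = NULL;

    /* Assumed semantics: bdrv_next_all_states(NULL) returns the first node,
     * each further call returns the next one, and NULL ends the iteration. */
    while ((bs = bdrv_next_all_states(bs))) {
        /* quiesce or poll this node individually */
    }
}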
@ -1017,6 +1031,7 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
* @filter_node_name: The node name that should be assigned to the filter * @filter_node_name: The node name that should be assigned to the filter
* driver that the mirror job inserts into the graph above @bs. NULL means that * driver that the mirror job inserts into the graph above @bs. NULL means that
* a node name should be autogenerated. * a node name should be autogenerated.
* @copy_mode: When to trigger writes to the target.
* @errp: Error object. * @errp: Error object.
* *
* Start a mirroring operation on @bs. Clusters that are allocated * Start a mirroring operation on @bs. Clusters that are allocated
@ -1030,7 +1045,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
BlockdevOnError on_source_error, BlockdevOnError on_source_error,
BlockdevOnError on_target_error, BlockdevOnError on_target_error,
bool unmap, const char *filter_node_name, Error **errp); bool unmap, const char *filter_node_name,
MirrorCopyMode copy_mode, Error **errp);
/* /*
* backup_job_create: * backup_job_create:


@ -38,6 +38,14 @@ struct BlockJobDriver {
/** Generic JobDriver callbacks and settings */ /** Generic JobDriver callbacks and settings */
JobDriver job_driver; JobDriver job_driver;
/*
* Returns whether the job has pending requests for the child or will
* submit new requests before the next pause point. This callback is polled
* in the context of draining a job node after requesting that the job be
* paused, until all activity on the child has stopped.
*/
bool (*drained_poll)(BlockJob *job);
/* /*
* If the callback is not NULL, it will be invoked before the job is * If the callback is not NULL, it will be invoked before the job is
* resumed in a new AioContext. This is the place to move any resources * resumed in a new AioContext. This is the place to move any resources


@ -82,6 +82,8 @@ void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
int64_t offset, int64_t bytes); int64_t offset, int64_t bytes);
int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset,
uint64_t *offset, int *bytes);
void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset); void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset);
int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap); int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);


@ -324,11 +324,14 @@ void hbitmap_free_meta(HBitmap *hb);
/** /**
* hbitmap_iter_next: * hbitmap_iter_next:
* @hbi: HBitmapIter to operate on. * @hbi: HBitmapIter to operate on.
* @advance: If true, advance the iterator. Otherwise, the next call
* of this function will return the same result (if that
* position is still dirty).
* *
* Return the next bit that is set in @hbi's associated HBitmap, * Return the next bit that is set in @hbi's associated HBitmap,
* or -1 if all remaining bits are zero. * or -1 if all remaining bits are zero.
*/ */
int64_t hbitmap_iter_next(HBitmapIter *hbi); int64_t hbitmap_iter_next(HBitmapIter *hbi, bool advance);
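With the new @advance parameter a caller can look at the next set bit without consuming it, which the new bdrv_dirty_iter_next_area() can build on. A short hedged sketch; hb is assumed to be an existing, populated HBitmap, and hbitmap_iter_init() is the existing iterator initializer:

static void peek_then_consume(HBitmap *hb)
{
    HBitmapIter hbi;
    int64_t next;

    hbitmap_iter_init(&hbi, hb, 0);
    next = hbitmap_iter_next(&hbi, false);    /* peek, position unchanged */
    if (next >= 0) {
        hbitmap_iter_next(&hbi, true);        /* same bit again, now advancing */
    }
}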
/** /**
* hbitmap_iter_next_word: * hbitmap_iter_next_word:


@ -335,6 +335,21 @@ void job_progress_update(Job *job, uint64_t done);
*/ */
void job_progress_set_remaining(Job *job, uint64_t remaining); void job_progress_set_remaining(Job *job, uint64_t remaining);
/**
* @job: The job whose expected progress end value is updated
* @delta: Value which is to be added to the current expected end
* value
*
* Increases the expected end value of the progress counter of a job.
* This is useful for parenthesis operations: If a job has to
* conditionally perform a high-priority operation as part of its
* progress, it calls this function with the expected operation's
* length before, and job_progress_update() afterwards.
* (So the operation acts as a parenthesis with regard to the main job
* operation running in the background.)
*/
void job_progress_increase_remaining(Job *job, uint64_t delta);
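A hedged sketch of the parenthesis pattern described above; job and nbytes are assumed to exist in the calling context, and the operation itself is elided:

/* Announce the extra work before starting it ... */
job_progress_increase_remaining(job, nbytes);

/* ... perform the conditional high-priority operation ... */

/* ... and account for it once it has completed. */
job_progress_update(job, nbytes);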
/** To be called when a cancelled job is finalised. */ /** To be called when a cancelled job is finalised. */
void job_event_cancelled(Job *job); void job_event_cancelled(Job *job);

job.c

@ -385,6 +385,11 @@ void job_progress_set_remaining(Job *job, uint64_t remaining)
job->progress_total = job->progress_current + remaining; job->progress_total = job->progress_current + remaining;
} }
void job_progress_increase_remaining(Job *job, uint64_t delta)
{
job->progress_total += delta;
}
void job_event_cancelled(Job *job) void job_event_cancelled(Job *job)
{ {
notifier_list_notify(&job->on_finalize_cancelled, job); notifier_list_notify(&job->on_finalize_cancelled, job);


@ -1050,6 +1050,24 @@
{ 'enum': 'MirrorSyncMode', { 'enum': 'MirrorSyncMode',
'data': ['top', 'full', 'none', 'incremental'] } 'data': ['top', 'full', 'none', 'incremental'] }
##
# @MirrorCopyMode:
#
# An enumeration whose values tell the mirror block job when to
# trigger writes to the target.
#
# @background: copy data in background only.
#
# @write-blocking: when data is written to the source, write it
# (synchronously) to the target as well. In
# addition, data is copied in background just like in
# @background mode.
#
# Since: 3.0
##
{ 'enum': 'MirrorCopyMode',
'data': ['background', 'write-blocking'] }
## ##
# @BlockJobInfo: # @BlockJobInfo:
# #
@ -1692,6 +1710,9 @@
# written. Both will result in identical contents. # written. Both will result in identical contents.
# Default is true. (Since 2.4) # Default is true. (Since 2.4)
# #
# @copy-mode: when to copy data to the destination; defaults to 'background'
# (Since: 3.0)
#
# Since: 1.3 # Since: 1.3
## ##
{ 'struct': 'DriveMirror', { 'struct': 'DriveMirror',
@ -1701,7 +1722,7 @@
'*speed': 'int', '*granularity': 'uint32', '*speed': 'int', '*granularity': 'uint32',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError', '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError',
'*unmap': 'bool' } } '*unmap': 'bool', '*copy-mode': 'MirrorCopyMode' } }
## ##
# @BlockDirtyBitmap: # @BlockDirtyBitmap:
@ -1964,6 +1985,9 @@
# above @device. If this option is not given, a node name is # above @device. If this option is not given, a node name is
# autogenerated. (Since: 2.9) # autogenerated. (Since: 2.9)
# #
# @copy-mode: when to copy data to the destination; defaults to 'background'
# (Since: 3.0)
#
# Returns: nothing on success. # Returns: nothing on success.
# #
# Since: 2.6 # Since: 2.6
@ -1984,7 +2008,8 @@
'*speed': 'int', '*granularity': 'uint32', '*speed': 'int', '*granularity': 'uint32',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError', '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError',
'*filter-node-name': 'str' } } '*filter-node-name': 'str',
'*copy-mode': 'MirrorCopyMode' } }
## ##
# @block_set_io_throttle: # @block_set_io_throttle:

tests/qemu-iotests/151 Executable file

@ -0,0 +1,120 @@
#!/usr/bin/env python
#
# Tests for active mirroring
#
# Copyright (C) 2018 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import iotests
from iotests import qemu_img
source_img = os.path.join(iotests.test_dir, 'source.' + iotests.imgfmt)
target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt)
class TestActiveMirror(iotests.QMPTestCase):
image_len = 128 * 1024 * 1024 # 128 MB
potential_writes_in_flight = True
def setUp(self):
qemu_img('create', '-f', iotests.imgfmt, source_img, '128M')
qemu_img('create', '-f', iotests.imgfmt, target_img, '128M')
blk_source = {'id': 'source',
'if': 'none',
'node-name': 'source-node',
'driver': iotests.imgfmt,
'file': {'driver': 'file',
'filename': source_img}}
blk_target = {'node-name': 'target-node',
'driver': iotests.imgfmt,
'file': {'driver': 'file',
'filename': target_img}}
self.vm = iotests.VM()
self.vm.add_drive_raw(self.vm.qmp_to_opts(blk_source))
self.vm.add_blockdev(self.vm.qmp_to_opts(blk_target))
self.vm.add_device('virtio-blk,drive=source')
self.vm.launch()
def tearDown(self):
self.vm.shutdown()
if not self.potential_writes_in_flight:
self.assertTrue(iotests.compare_images(source_img, target_img),
'mirror target does not match source')
os.remove(source_img)
os.remove(target_img)
def doActiveIO(self, sync_source_and_target):
# Fill the source image
self.vm.hmp_qemu_io('source',
'write -P 1 0 %i' % self.image_len)
# Start some background requests
for offset in range(1 * self.image_len / 8, 3 * self.image_len / 8, 1024 * 1024):
self.vm.hmp_qemu_io('source', 'aio_write -P 2 %i 1M' % offset)
for offset in range(2 * self.image_len / 8, 3 * self.image_len / 8, 1024 * 1024):
self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
# Start the block job
result = self.vm.qmp('blockdev-mirror',
job_id='mirror',
filter_node_name='mirror-node',
device='source-node',
target='target-node',
sync='full',
copy_mode='write-blocking')
self.assert_qmp(result, 'return', {})
# Start some more requests
for offset in range(3 * self.image_len / 8, 5 * self.image_len / 8, 1024 * 1024):
self.vm.hmp_qemu_io('source', 'aio_write -P 3 %i 1M' % offset)
for offset in range(4 * self.image_len / 8, 5 * self.image_len / 8, 1024 * 1024):
self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
# Wait for the READY event
self.wait_ready(drive='mirror')
# Now start some final requests; all of these (which land on
# the source) should be settled using the active mechanism.
# The mirror code itself asserts that the source BDS's dirty
# bitmap will stay clean between READY and COMPLETED.
for offset in range(5 * self.image_len / 8, 7 * self.image_len / 8, 1024 * 1024):
self.vm.hmp_qemu_io('source', 'aio_write -P 3 %i 1M' % offset)
for offset in range(6 * self.image_len / 8, 7 * self.image_len / 8, 1024 * 1024):
self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
if sync_source_and_target:
# If source and target should be in sync after the mirror,
# we have to flush before completion
self.vm.hmp_qemu_io('source', 'aio_flush')
self.potential_writes_in_flight = False
self.complete_and_wait(drive='mirror', wait_ready=False)
def testActiveIO(self):
self.doActiveIO(False)
def testActiveIOFlushed(self):
self.doActiveIO(True)
if __name__ == '__main__':
iotests.main(supported_fmts=['qcow2', 'raw'])


@ -0,0 +1,5 @@
..
----------------------------------------------------------------------
Ran 2 tests
OK


@ -157,6 +157,7 @@
148 rw auto quick 148 rw auto quick
149 rw auto sudo 149 rw auto sudo
150 rw auto quick 150 rw auto quick
151 rw auto
152 rw auto quick 152 rw auto quick
153 rw auto quick 153 rw auto quick
154 rw auto backing quick 154 rw auto backing quick


@ -27,15 +27,23 @@
#include "block/blockjob_int.h" #include "block/blockjob_int.h"
#include "sysemu/block-backend.h" #include "sysemu/block-backend.h"
#include "qapi/error.h" #include "qapi/error.h"
#include "iothread.h"
static QemuEvent done_event;
typedef struct BDRVTestState { typedef struct BDRVTestState {
int drain_count; int drain_count;
AioContext *bh_indirection_ctx;
bool sleep_in_drain_begin;
} BDRVTestState; } BDRVTestState;
static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs) static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
{ {
BDRVTestState *s = bs->opaque; BDRVTestState *s = bs->opaque;
s->drain_count++; s->drain_count++;
if (s->sleep_in_drain_begin) {
qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
}
} }
static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs) static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
@ -50,19 +58,48 @@ static void bdrv_test_close(BlockDriverState *bs)
g_assert_cmpint(s->drain_count, >, 0); g_assert_cmpint(s->drain_count, >, 0);
} }
static void co_reenter_bh(void *opaque)
{
aio_co_wake(opaque);
}
static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs, static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
uint64_t offset, uint64_t bytes, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags) QEMUIOVector *qiov, int flags)
{ {
BDRVTestState *s = bs->opaque;
/* We want this request to stay until the polling loop in drain waits for /* We want this request to stay until the polling loop in drain waits for
* it to complete. We need to sleep a while as bdrv_drain_invoke() comes * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
* first and polls its result, too, but it shouldn't accidentally complete * first and polls its result, too, but it shouldn't accidentally complete
* this request yet. */ * this request yet. */
qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
if (s->bh_indirection_ctx) {
aio_bh_schedule_oneshot(s->bh_indirection_ctx, co_reenter_bh,
qemu_coroutine_self());
qemu_coroutine_yield();
}
return 0; return 0;
} }
static void bdrv_test_child_perm(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
BlockReopenQueue *reopen_queue,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
{
/* bdrv_format_default_perms() accepts only these two, so disguise
* detach_by_driver_cb_role as one of them. */
if (role != &child_file && role != &child_backing) {
role = &child_file;
}
bdrv_format_default_perms(bs, c, role, reopen_queue, perm, shared,
nperm, nshared);
}
static BlockDriver bdrv_test = { static BlockDriver bdrv_test = {
.format_name = "test", .format_name = "test",
.instance_size = sizeof(BDRVTestState), .instance_size = sizeof(BDRVTestState),
@ -73,7 +110,7 @@ static BlockDriver bdrv_test = {
.bdrv_co_drain_begin = bdrv_test_co_drain_begin, .bdrv_co_drain_begin = bdrv_test_co_drain_begin,
.bdrv_co_drain_end = bdrv_test_co_drain_end, .bdrv_co_drain_end = bdrv_test_co_drain_end,
.bdrv_child_perm = bdrv_format_default_perms, .bdrv_child_perm = bdrv_test_child_perm,
}; };
static void aio_ret_cb(void *opaque, int ret) static void aio_ret_cb(void *opaque, int ret)
@ -216,6 +253,11 @@ static void test_drv_cb_drain_subtree(void)
test_drv_cb_common(BDRV_SUBTREE_DRAIN, true); test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
} }
static void test_drv_cb_co_drain_all(void)
{
call_in_coroutine(test_drv_cb_drain_all);
}
static void test_drv_cb_co_drain(void) static void test_drv_cb_co_drain(void)
{ {
call_in_coroutine(test_drv_cb_drain); call_in_coroutine(test_drv_cb_drain);
@ -259,8 +301,7 @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive)
static void test_quiesce_drain_all(void) static void test_quiesce_drain_all(void)
{ {
// XXX drain_all doesn't quiesce test_quiesce_common(BDRV_DRAIN_ALL, true);
//test_quiesce_common(BDRV_DRAIN_ALL, true);
} }
static void test_quiesce_drain(void) static void test_quiesce_drain(void)
@ -273,6 +314,11 @@ static void test_quiesce_drain_subtree(void)
test_quiesce_common(BDRV_SUBTREE_DRAIN, true); test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
} }
static void test_quiesce_co_drain_all(void)
{
call_in_coroutine(test_quiesce_drain_all);
}
static void test_quiesce_co_drain(void) static void test_quiesce_co_drain(void)
{ {
call_in_coroutine(test_quiesce_drain); call_in_coroutine(test_quiesce_drain);
@ -302,12 +348,7 @@ static void test_nested(void)
for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) { for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) { for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
/* XXX bdrv_drain_all() doesn't increase the quiesce_counter */ int backing_quiesce = (outer != BDRV_DRAIN) +
int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
(inner != BDRV_DRAIN_ALL);
int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
(inner == BDRV_SUBTREE_DRAIN);
int backing_cb_cnt = (outer != BDRV_DRAIN) +
(inner != BDRV_DRAIN); (inner != BDRV_DRAIN);
g_assert_cmpint(bs->quiesce_counter, ==, 0); g_assert_cmpint(bs->quiesce_counter, ==, 0);
@ -318,10 +359,10 @@ static void test_nested(void)
do_drain_begin(outer, bs); do_drain_begin(outer, bs);
do_drain_begin(inner, bs); do_drain_begin(inner, bs);
g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce); g_assert_cmpint(bs->quiesce_counter, ==, 2);
g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce); g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
g_assert_cmpint(s->drain_count, ==, 2); g_assert_cmpint(s->drain_count, ==, 2);
g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt); g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
do_drain_end(inner, bs); do_drain_end(inner, bs);
do_drain_end(outer, bs); do_drain_end(outer, bs);
@ -411,7 +452,7 @@ static void test_multiparent(void)
blk_unref(blk_b); blk_unref(blk_b);
} }
static void test_graph_change(void) static void test_graph_change_drain_subtree(void)
{ {
BlockBackend *blk_a, *blk_b; BlockBackend *blk_a, *blk_b;
BlockDriverState *bs_a, *bs_b, *backing; BlockDriverState *bs_a, *bs_b, *backing;
@ -490,6 +531,221 @@ static void test_graph_change(void)
blk_unref(blk_b); blk_unref(blk_b);
} }
static void test_graph_change_drain_all(void)
{
BlockBackend *blk_a, *blk_b;
BlockDriverState *bs_a, *bs_b;
BDRVTestState *a_s, *b_s;
/* Create node A with a BlockBackend */
blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
&error_abort);
a_s = bs_a->opaque;
blk_insert_bs(blk_a, bs_a, &error_abort);
g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
g_assert_cmpint(a_s->drain_count, ==, 0);
/* Call bdrv_drain_all_begin() */
bdrv_drain_all_begin();
g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
g_assert_cmpint(a_s->drain_count, ==, 1);
/* Create node B with a BlockBackend */
blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
&error_abort);
b_s = bs_b->opaque;
blk_insert_bs(blk_b, bs_b, &error_abort);
g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
g_assert_cmpint(a_s->drain_count, ==, 1);
g_assert_cmpint(b_s->drain_count, ==, 1);
/* Unref and finally delete node A */
blk_unref(blk_a);
g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
g_assert_cmpint(a_s->drain_count, ==, 1);
g_assert_cmpint(b_s->drain_count, ==, 1);
bdrv_unref(bs_a);
g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
g_assert_cmpint(b_s->drain_count, ==, 1);
/* End the drained section */
bdrv_drain_all_end();
g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
g_assert_cmpint(b_s->drain_count, ==, 0);
bdrv_unref(bs_b);
blk_unref(blk_b);
}
struct test_iothread_data {
BlockDriverState *bs;
enum drain_type drain_type;
int *aio_ret;
};
static void test_iothread_drain_entry(void *opaque)
{
struct test_iothread_data *data = opaque;
aio_context_acquire(bdrv_get_aio_context(data->bs));
do_drain_begin(data->drain_type, data->bs);
g_assert_cmpint(*data->aio_ret, ==, 0);
do_drain_end(data->drain_type, data->bs);
aio_context_release(bdrv_get_aio_context(data->bs));
qemu_event_set(&done_event);
}
static void test_iothread_aio_cb(void *opaque, int ret)
{
int *aio_ret = opaque;
*aio_ret = ret;
qemu_event_set(&done_event);
}
/*
* Starts an AIO request on a BDS that runs in the AioContext of iothread 1.
* The request involves a BH on iothread 2 before it can complete.
*
* @drain_thread = 0 means that do_drain_begin/end are called from the main
* thread, @drain_thread = 1 means that they are called from iothread 1. Drain
* for this BDS cannot be called from iothread 2 because only the main thread
* may do cross-AioContext polling.
*/
static void test_iothread_common(enum drain_type drain_type, int drain_thread)
{
BlockBackend *blk;
BlockDriverState *bs;
BDRVTestState *s;
BlockAIOCB *acb;
int aio_ret;
struct test_iothread_data data;
IOThread *a = iothread_new();
IOThread *b = iothread_new();
AioContext *ctx_a = iothread_get_aio_context(a);
AioContext *ctx_b = iothread_get_aio_context(b);
QEMUIOVector qiov;
struct iovec iov = {
.iov_base = NULL,
.iov_len = 0,
};
qemu_iovec_init_external(&qiov, &iov, 1);
/* bdrv_drain_all() may only be called from the main loop thread */
if (drain_type == BDRV_DRAIN_ALL && drain_thread != 0) {
goto out;
}
blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
&error_abort);
s = bs->opaque;
blk_insert_bs(blk, bs, &error_abort);
blk_set_aio_context(blk, ctx_a);
aio_context_acquire(ctx_a);
s->bh_indirection_ctx = ctx_b;
aio_ret = -EINPROGRESS;
if (drain_thread == 0) {
acb = blk_aio_preadv(blk, 0, &qiov, 0, test_iothread_aio_cb, &aio_ret);
} else {
acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
}
g_assert(acb != NULL);
g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
aio_context_release(ctx_a);
data = (struct test_iothread_data) {
.bs = bs,
.drain_type = drain_type,
.aio_ret = &aio_ret,
};
switch (drain_thread) {
case 0:
if (drain_type != BDRV_DRAIN_ALL) {
aio_context_acquire(ctx_a);
}
/* The request is running on the IOThread a. Draining its block device
* will make sure that it has completed as far as the BDS is concerned,
* but the drain in this thread can continue immediately after
* bdrv_dec_in_flight() and aio_ret might be assigned only slightly
* later. */
qemu_event_reset(&done_event);
do_drain_begin(drain_type, bs);
g_assert_cmpint(bs->in_flight, ==, 0);
if (drain_type != BDRV_DRAIN_ALL) {
aio_context_release(ctx_a);
}
qemu_event_wait(&done_event);
if (drain_type != BDRV_DRAIN_ALL) {
aio_context_acquire(ctx_a);
}
g_assert_cmpint(aio_ret, ==, 0);
do_drain_end(drain_type, bs);
if (drain_type != BDRV_DRAIN_ALL) {
aio_context_release(ctx_a);
}
break;
case 1:
qemu_event_reset(&done_event);
aio_bh_schedule_oneshot(ctx_a, test_iothread_drain_entry, &data);
qemu_event_wait(&done_event);
break;
default:
g_assert_not_reached();
}
aio_context_acquire(ctx_a);
blk_set_aio_context(blk, qemu_get_aio_context());
aio_context_release(ctx_a);
bdrv_unref(bs);
blk_unref(blk);
out:
iothread_join(a);
iothread_join(b);
}
static void test_iothread_drain_all(void)
{
test_iothread_common(BDRV_DRAIN_ALL, 0);
test_iothread_common(BDRV_DRAIN_ALL, 1);
}
static void test_iothread_drain(void)
{
test_iothread_common(BDRV_DRAIN, 0);
test_iothread_common(BDRV_DRAIN, 1);
}
static void test_iothread_drain_subtree(void)
{
test_iothread_common(BDRV_SUBTREE_DRAIN, 0);
test_iothread_common(BDRV_SUBTREE_DRAIN, 1);
}
typedef struct TestBlockJob { typedef struct TestBlockJob {
BlockJob common; BlockJob common;
@ -507,7 +763,11 @@ static void coroutine_fn test_job_start(void *opaque)
job_transition_to_ready(&s->common.job); job_transition_to_ready(&s->common.job);
while (!s->should_complete) { while (!s->should_complete) {
job_sleep_ns(&s->common.job, 100000); /* Avoid block_job_sleep_ns() because it marks the job as !busy. We
* want to emulate some actual activity (probably some I/O) here so
* that drain has to wait for this activity to stop. */
qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
job_pause_point(&s->common.job);
} }
job_defer_to_main_loop(&s->common.job, test_job_completed, NULL); job_defer_to_main_loop(&s->common.job, test_job_completed, NULL);
@ -554,7 +814,7 @@ static void test_blockjob_common(enum drain_type drain_type)
g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_cmpint(job->job.pause_count, ==, 0);
g_assert_false(job->job.paused); g_assert_false(job->job.paused);
g_assert_false(job->job.busy); /* We're in job_sleep_ns() */ g_assert_true(job->job.busy); /* We're in job_sleep_ns() */
do_drain_begin(drain_type, src); do_drain_begin(drain_type, src);
@ -564,15 +824,14 @@ static void test_blockjob_common(enum drain_type drain_type)
} else { } else {
g_assert_cmpint(job->job.pause_count, ==, 1); g_assert_cmpint(job->job.pause_count, ==, 1);
} }
/* XXX We don't wait until the job is actually paused. Is this okay? */ g_assert_true(job->job.paused);
/* g_assert_true(job->job.paused); */
g_assert_false(job->job.busy); /* The job is paused */ g_assert_false(job->job.busy); /* The job is paused */
do_drain_end(drain_type, src); do_drain_end(drain_type, src);
g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_cmpint(job->job.pause_count, ==, 0);
g_assert_false(job->job.paused); g_assert_false(job->job.paused);
g_assert_false(job->job.busy); /* We're in job_sleep_ns() */ g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
do_drain_begin(drain_type, target); do_drain_begin(drain_type, target);
@ -582,15 +841,14 @@ static void test_blockjob_common(enum drain_type drain_type)
} else { } else {
g_assert_cmpint(job->job.pause_count, ==, 1); g_assert_cmpint(job->job.pause_count, ==, 1);
} }
/* XXX We don't wait until the job is actually paused. Is this okay? */ g_assert_true(job->job.paused);
/* g_assert_true(job->job.paused); */
g_assert_false(job->job.busy); /* The job is paused */ g_assert_false(job->job.busy); /* The job is paused */
do_drain_end(drain_type, target); do_drain_end(drain_type, target);
g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_cmpint(job->job.pause_count, ==, 0);
g_assert_false(job->job.paused); g_assert_false(job->job.paused);
g_assert_false(job->job.busy); /* We're in job_sleep_ns() */ g_assert_true(job->job.busy); /* We're in job_sleep_ns() */
ret = job_complete_sync(&job->job, &error_abort); ret = job_complete_sync(&job->job, &error_abort);
g_assert_cmpint(ret, ==, 0); g_assert_cmpint(ret, ==, 0);
@ -616,19 +874,399 @@ static void test_blockjob_drain_subtree(void)
test_blockjob_common(BDRV_SUBTREE_DRAIN); test_blockjob_common(BDRV_SUBTREE_DRAIN);
} }
typedef struct BDRVTestTopState {
BdrvChild *wait_child;
} BDRVTestTopState;
static void bdrv_test_top_close(BlockDriverState *bs)
{
BdrvChild *c, *next_c;
QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
bdrv_unref_child(bs, c);
}
}
static int coroutine_fn bdrv_test_top_co_preadv(BlockDriverState *bs,
uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
BDRVTestTopState *tts = bs->opaque;
return bdrv_co_preadv(tts->wait_child, offset, bytes, qiov, flags);
}
static BlockDriver bdrv_test_top_driver = {
.format_name = "test_top_driver",
.instance_size = sizeof(BDRVTestTopState),
.bdrv_close = bdrv_test_top_close,
.bdrv_co_preadv = bdrv_test_top_co_preadv,
.bdrv_child_perm = bdrv_format_default_perms,
};
typedef struct TestCoDeleteByDrainData {
BlockBackend *blk;
bool detach_instead_of_delete;
bool done;
} TestCoDeleteByDrainData;
static void coroutine_fn test_co_delete_by_drain(void *opaque)
{
TestCoDeleteByDrainData *dbdd = opaque;
BlockBackend *blk = dbdd->blk;
BlockDriverState *bs = blk_bs(blk);
BDRVTestTopState *tts = bs->opaque;
void *buffer = g_malloc(65536);
QEMUIOVector qiov;
struct iovec iov = {
.iov_base = buffer,
.iov_len = 65536,
};
qemu_iovec_init_external(&qiov, &iov, 1);
/* Pretend some internal write operation from parent to child.
* Important: We have to read from the child, not from the parent!
* Draining works by first propagating it all up the tree to the
* root and then waiting for drainage from root to the leaves
* (protocol nodes). If we have a request waiting on the root,
* everything will be drained before we go back down the tree, but
* we do not want that. We want to be in the middle of draining
* when the following request returns. */
bdrv_co_preadv(tts->wait_child, 0, 65536, &qiov, 0);
g_assert_cmpint(bs->refcnt, ==, 1);
if (!dbdd->detach_instead_of_delete) {
blk_unref(blk);
} else {
BdrvChild *c, *next_c;
QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
bdrv_unref_child(bs, c);
}
}
dbdd->done = true;
}
/**
* Test what happens when some BDS has some children, you drain one of
* them and this results in the BDS being deleted.
*
* If @detach_instead_of_delete is set, the BDS is not going to be
* deleted but will only detach all of its children.
*/
static void do_test_delete_by_drain(bool detach_instead_of_delete,
enum drain_type drain_type)
{
BlockBackend *blk;
BlockDriverState *bs, *child_bs, *null_bs;
BDRVTestTopState *tts;
TestCoDeleteByDrainData dbdd;
Coroutine *co;
bs = bdrv_new_open_driver(&bdrv_test_top_driver, "top", BDRV_O_RDWR,
&error_abort);
bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
tts = bs->opaque;
null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
&error_abort);
bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
/* This child will be the one to pass requests through to, and
* it will stall until a drain occurs */
child_bs = bdrv_new_open_driver(&bdrv_test, "child", BDRV_O_RDWR,
&error_abort);
child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
/* Takes our reference to child_bs */
tts->wait_child = bdrv_attach_child(bs, child_bs, "wait-child", &child_file,
&error_abort);
/* This child is just there to be deleted
* (for detach_instead_of_delete == true) */
null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
&error_abort);
bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
blk_insert_bs(blk, bs, &error_abort);
/* Referenced by blk now */
bdrv_unref(bs);
g_assert_cmpint(bs->refcnt, ==, 1);
g_assert_cmpint(child_bs->refcnt, ==, 1);
g_assert_cmpint(null_bs->refcnt, ==, 1);
dbdd = (TestCoDeleteByDrainData){
.blk = blk,
.detach_instead_of_delete = detach_instead_of_delete,
.done = false,
};
co = qemu_coroutine_create(test_co_delete_by_drain, &dbdd);
qemu_coroutine_enter(co);
/* Drain the child while the read operation is still pending.
* This should result in the operation finishing and
* test_co_delete_by_drain() resuming. Thus, @bs will be deleted
* and the coroutine will exit while this drain operation is still
* in progress. */
switch (drain_type) {
case BDRV_DRAIN:
bdrv_ref(child_bs);
bdrv_drain(child_bs);
bdrv_unref(child_bs);
break;
case BDRV_SUBTREE_DRAIN:
/* Would have to ref/unref bs here for !detach_instead_of_delete, but
* then the whole test becomes pointless because the graph changes
* don't occur during the drain any more. */
assert(detach_instead_of_delete);
bdrv_subtree_drained_begin(bs);
bdrv_subtree_drained_end(bs);
break;
case BDRV_DRAIN_ALL:
bdrv_drain_all_begin();
bdrv_drain_all_end();
break;
default:
g_assert_not_reached();
}
while (!dbdd.done) {
aio_poll(qemu_get_aio_context(), true);
}
if (detach_instead_of_delete) {
/* Here, the reference has not passed over to the coroutine,
* so we have to delete the BB ourselves */
blk_unref(blk);
}
}
static void test_delete_by_drain(void)
{
do_test_delete_by_drain(false, BDRV_DRAIN);
}
static void test_detach_by_drain_all(void)
{
do_test_delete_by_drain(true, BDRV_DRAIN_ALL);
}
static void test_detach_by_drain(void)
{
do_test_delete_by_drain(true, BDRV_DRAIN);
}
static void test_detach_by_drain_subtree(void)
{
do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
}
struct detach_by_parent_data {
BlockDriverState *parent_b;
BdrvChild *child_b;
BlockDriverState *c;
BdrvChild *child_c;
bool by_parent_cb;
};
static struct detach_by_parent_data detach_by_parent_data;
static void detach_indirect_bh(void *opaque)
{
struct detach_by_parent_data *data = opaque;
bdrv_unref_child(data->parent_b, data->child_b);
bdrv_ref(data->c);
data->child_c = bdrv_attach_child(data->parent_b, data->c, "PB-C",
&child_file, &error_abort);
}
static void detach_by_parent_aio_cb(void *opaque, int ret)
{
struct detach_by_parent_data *data = &detach_by_parent_data;
g_assert_cmpint(ret, ==, 0);
if (data->by_parent_cb) {
detach_indirect_bh(data);
}
}
static void detach_by_driver_cb_drained_begin(BdrvChild *child)
{
aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
detach_indirect_bh, &detach_by_parent_data);
child_file.drained_begin(child);
}
static BdrvChildRole detach_by_driver_cb_role;
/*
* Initial graph:
*
* PA PB
* \ / \
* A B C
*
* by_parent_cb == true: Test that parent callbacks don't poll
*
* PA has a pending write request whose callback changes the child nodes of
* PB: It removes B and adds C instead. The subtree of PB is drained, which
* will indirectly drain the write request, too.
*
* by_parent_cb == false: Test that bdrv_drain_invoke() doesn't poll
*
* PA's BdrvChildRole has a .drained_begin callback that schedules a BH
* that does the same graph change. If bdrv_drain_invoke() calls it, the
* state is messed up, but if it is only polled in the single
* BDRV_POLL_WHILE() at the end of the drain, this should work fine.
*/
static void test_detach_indirect(bool by_parent_cb)
{
BlockBackend *blk;
BlockDriverState *parent_a, *parent_b, *a, *b, *c;
BdrvChild *child_a, *child_b;
BlockAIOCB *acb;
QEMUIOVector qiov;
struct iovec iov = {
.iov_base = NULL,
.iov_len = 0,
};
qemu_iovec_init_external(&qiov, &iov, 1);
if (!by_parent_cb) {
detach_by_driver_cb_role = child_file;
detach_by_driver_cb_role.drained_begin =
detach_by_driver_cb_drained_begin;
}
/* Create all involved nodes */
parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
&error_abort);
parent_b = bdrv_new_open_driver(&bdrv_test, "parent-b", 0,
&error_abort);
a = bdrv_new_open_driver(&bdrv_test, "a", BDRV_O_RDWR, &error_abort);
b = bdrv_new_open_driver(&bdrv_test, "b", BDRV_O_RDWR, &error_abort);
c = bdrv_new_open_driver(&bdrv_test, "c", BDRV_O_RDWR, &error_abort);
/* blk is a BB for parent-a */
blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
blk_insert_bs(blk, parent_a, &error_abort);
bdrv_unref(parent_a);
/* If we want to get bdrv_drain_invoke() to call aio_poll(), the driver
* callback must not return immediately. */
if (!by_parent_cb) {
BDRVTestState *s = parent_a->opaque;
s->sleep_in_drain_begin = true;
}
/* Set child relationships */
bdrv_ref(b);
bdrv_ref(a);
child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_file, &error_abort);
child_a = bdrv_attach_child(parent_b, a, "PB-A", &child_backing, &error_abort);
bdrv_ref(a);
bdrv_attach_child(parent_a, a, "PA-A",
by_parent_cb ? &child_file : &detach_by_driver_cb_role,
&error_abort);
g_assert_cmpint(parent_a->refcnt, ==, 1);
g_assert_cmpint(parent_b->refcnt, ==, 1);
g_assert_cmpint(a->refcnt, ==, 3);
g_assert_cmpint(b->refcnt, ==, 2);
g_assert_cmpint(c->refcnt, ==, 1);
g_assert(QLIST_FIRST(&parent_b->children) == child_a);
g_assert(QLIST_NEXT(child_a, next) == child_b);
g_assert(QLIST_NEXT(child_b, next) == NULL);
/* Start the evil write request */
detach_by_parent_data = (struct detach_by_parent_data) {
.parent_b = parent_b,
.child_b = child_b,
.c = c,
.by_parent_cb = by_parent_cb,
};
acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
g_assert(acb != NULL);
/* Drain and check the expected result */
bdrv_subtree_drained_begin(parent_b);
g_assert(detach_by_parent_data.child_c != NULL);
g_assert_cmpint(parent_a->refcnt, ==, 1);
g_assert_cmpint(parent_b->refcnt, ==, 1);
g_assert_cmpint(a->refcnt, ==, 3);
g_assert_cmpint(b->refcnt, ==, 1);
g_assert_cmpint(c->refcnt, ==, 2);
g_assert(QLIST_FIRST(&parent_b->children) == detach_by_parent_data.child_c);
g_assert(QLIST_NEXT(detach_by_parent_data.child_c, next) == child_a);
g_assert(QLIST_NEXT(child_a, next) == NULL);
g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
g_assert_cmpint(a->quiesce_counter, ==, 1);
g_assert_cmpint(b->quiesce_counter, ==, 0);
g_assert_cmpint(c->quiesce_counter, ==, 1);
bdrv_subtree_drained_end(parent_b);
bdrv_unref(parent_b);
blk_unref(blk);
/* XXX Once bdrv_close() unref's children instead of just detaching them,
* this won't be necessary any more. */
bdrv_unref(a);
bdrv_unref(a);
bdrv_unref(c);
g_assert_cmpint(a->refcnt, ==, 1);
g_assert_cmpint(b->refcnt, ==, 1);
g_assert_cmpint(c->refcnt, ==, 1);
bdrv_unref(a);
bdrv_unref(b);
bdrv_unref(c);
}
static void test_detach_by_parent_cb(void)
{
test_detach_indirect(true);
}
static void test_detach_by_driver_cb(void)
{
test_detach_indirect(false);
}
int main(int argc, char **argv)
{
    int ret;

    bdrv_init();
    qemu_init_main_loop(&error_abort);
    g_test_init(&argc, &argv, NULL);
    qemu_event_init(&done_event, false);

    g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
    g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
    g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                    test_drv_cb_drain_subtree);

    g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
                    test_drv_cb_co_drain_all);
    g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
    g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
                    test_drv_cb_co_drain_subtree);
@ -639,19 +1277,38 @@ int main(int argc, char **argv)
g_test_add_func("/bdrv-drain/quiesce/drain_subtree", g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
test_quiesce_drain_subtree); test_quiesce_drain_subtree);
// XXX bdrv_drain_all() doesn't work in coroutine context g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
test_quiesce_co_drain_all);
g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain); g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree", g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
test_quiesce_co_drain_subtree); test_quiesce_co_drain_subtree);
g_test_add_func("/bdrv-drain/nested", test_nested); g_test_add_func("/bdrv-drain/nested", test_nested);
g_test_add_func("/bdrv-drain/multiparent", test_multiparent); g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
test_graph_change_drain_subtree);
g_test_add_func("/bdrv-drain/graph-change/drain_all",
test_graph_change_drain_all);
g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
g_test_add_func("/bdrv-drain/iothread/drain_subtree",
test_iothread_drain_subtree);
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
g_test_add_func("/bdrv-drain/blockjob/drain_subtree", g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
test_blockjob_drain_subtree); test_blockjob_drain_subtree);
return g_test_run(); g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
ret = g_test_run();
qemu_event_destroy(&done_event);
return ret;
} }
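
The test registrations above rely on a done_event that main() now initializes and destroys; it is most likely used by the IOThread-based tests to block the main thread until work scheduled in another AioContext has finished. A minimal sketch of that hand-off, assuming only the generic QemuEvent and IOThread APIs (the helper names below are hypothetical, not part of the patch):

    /* Hypothetical helper: run a callback in an IOThread's AioContext and
     * wait in the main thread until it has completed. */
    static QemuEvent done_event;

    static void work_done_bh(void *opaque)
    {
        /* ... exercise bdrv_drain*() from the IOThread here ... */
        qemu_event_set(&done_event);            /* wake up the waiting thread */
    }

    static void run_in_iothread_and_wait(IOThread *iothread)
    {
        qemu_event_reset(&done_event);
        aio_bh_schedule_oneshot(iothread_get_aio_context(iothread),
                                work_done_bh, NULL);
        qemu_event_wait(&done_event);           /* returns once the BH has run */
    }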


@ -30,6 +30,18 @@ typedef struct TestHBitmapData {
} TestHBitmapData;

static int64_t check_hbitmap_iter_next(HBitmapIter *hbi)
{
    int next0, next1;

    next0 = hbitmap_iter_next(hbi, false);
    next1 = hbitmap_iter_next(hbi, true);
    g_assert_cmpint(next0, ==, next1);

    return next0;
}
/* Check that the HBitmap and the shadow bitmap contain the same data,
 * ignoring the same "first" bits.
 */
@ -46,7 +58,7 @@ static void hbitmap_test_check(TestHBitmapData *data,
    i = first;
    for (;;) {
        next = check_hbitmap_iter_next(&hbi);
        if (next < 0) {
            next = data->size;
        }
@ -435,25 +447,25 @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data,
    /* Note that hbitmap_test_check has to be invoked manually in this test. */
    hbitmap_test_init(data, 131072 << 7, 7);
    hbitmap_iter_init(&hbi, data->hb, 0);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);

    hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8);
    hbitmap_iter_init(&hbi, data->hb, 0);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);

    hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
    g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);

    hbitmap_test_set(data, (131072 << 7) - 8, 8);
    hbitmap_iter_init(&hbi, data->hb, 0);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, 131071 << 7);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);

    hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, 131071 << 7);
    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
}
static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff)
@ -893,7 +905,7 @@ static void test_hbitmap_serialize_zeroes(TestHBitmapData *data,
    for (i = 0; i < num_positions; i++) {
        hbitmap_deserialize_zeroes(data->hb, positions[i], min_l1, true);
        hbitmap_iter_init(&iter, data->hb, 0);
        next = check_hbitmap_iter_next(&iter);
        if (i == num_positions - 1) {
            g_assert_cmpint(next, ==, -1);
        } else {
@ -919,10 +931,10 @@ static void test_hbitmap_iter_and_reset(TestHBitmapData *data,
    hbitmap_iter_init(&hbi, data->hb, BITS_PER_LONG - 1);
    check_hbitmap_iter_next(&hbi);

    hbitmap_reset_all(data->hb);
    check_hbitmap_iter_next(&hbi);
}

static void test_hbitmap_next_zero_check(TestHBitmapData *data, int64_t start)


@ -141,7 +141,7 @@ unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
    return cur;
}

int64_t hbitmap_iter_next(HBitmapIter *hbi, bool advance)
{
    unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1] &
            hbi->hb->levels[HBITMAP_LEVELS - 1][hbi->pos];
@ -154,8 +154,12 @@ int64_t hbitmap_iter_next(HBitmapIter *hbi)
        }
    }

    if (advance) {
        /* The next call will resume work from the next bit. */
        hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
    } else {
        hbi->cur[HBITMAP_LEVELS - 1] = cur;
    }
    item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ctzl(cur);

    return item << hbi->granularity;
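
The new @advance parameter lets a caller peek at the next set bit without consuming it; passing true keeps the old consuming behaviour. A minimal illustrative sketch (not taken from the patch) of the peek-then-consume pattern this enables:

    /* Look at the next set bit first, and only advance the iterator once we
     * decide to actually process it. */
    static int64_t peek_then_consume(HBitmap *bitmap)
    {
        HBitmapIter hbi;
        int64_t next;

        hbitmap_iter_init(&hbi, bitmap, 0);

        next = hbitmap_iter_next(&hbi, false);   /* peek: iterator stays put */
        if (next < 0) {
            return -1;                           /* no bit is set */
        }

        /* Returns the same offset as the peek, but this time the iterator
         * moves past it, as check_hbitmap_iter_next() above verifies. */
        return hbitmap_iter_next(&hbi, true);
    }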