From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Fiona Ebner <f.ebner@proxmox.com>
Date: Mon, 31 Mar 2025 16:55:06 +0200
Subject: [PATCH] savevm-async: use dedicated iothread for state file

Having the state file be in the iohandler context means that a
blk_drain_all() call in the main thread or vCPU thread that happens
while the snapshot is running will result in a deadlock.

For example, the main thread might be stuck in:

> #0  0x00007300ac9552d6 in __ppoll (fds=0x64bd5a411a50, nfds=2, timeout=<optimized out>, timeout@entry=0x0, sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:42
> #1  0x000064bd51af3cad in ppoll (__ss=0x0, __timeout=0x0, __nfds=<optimized out>, __fds=<optimized out>) at /usr/include/x86_64-linux-gnu/bits/poll2.h:64
> #2  0x000064bd51ad8799 in fdmon_poll_wait (ctx=0x64bd58d968a0, ready_list=0x7ffcfcc15558, timeout=-1) at ../util/fdmon-poll.c:79
> #3  0x000064bd51ad7c3d in aio_poll (ctx=0x64bd58d968a0, blocking=blocking@entry=true) at ../util/aio-posix.c:671
> #4  0x000064bd519a0b5d in bdrv_drain_all_begin () at ../block/io.c:531
> #5  bdrv_drain_all_begin () at ../block/io.c:510
> #6  0x000064bd519943c4 in blk_drain_all () at ../block/block-backend.c:2085
> #7  0x000064bd5160fc5a in virtio_scsi_dataplane_stop (vdev=0x64bd5a215190) at ../hw/scsi/virtio-scsi-dataplane.c:213
> #8  0x000064bd51664e90 in virtio_bus_stop_ioeventfd (bus=0x64bd5a215110) at ../hw/virtio/virtio-bus.c:259
> #9  0x000064bd5166511b in virtio_bus_stop_ioeventfd (bus=<optimized out>) at ../hw/virtio/virtio-bus.c:251
> #10 virtio_bus_reset (bus=<optimized out>) at ../hw/virtio/virtio-bus.c:107
> #11 0x000064bd51667431 in virtio_pci_reset (qdev=<optimized out>) at ../hw/virtio/virtio-pci.c:2296
...
> #34 0x000064bd517aa951 in pc_machine_reset (machine=<optimized out>, type=<optimized out>) at ../hw/i386/pc.c:1722
> #35 0x000064bd516aa4c4 in qemu_system_reset (reason=reason@entry=SHUTDOWN_CAUSE_GUEST_RESET) at ../system/runstate.c:525
> #36 0x000064bd516aaeb9 in main_loop_should_exit (status=<synthetic pointer>) at ../system/runstate.c:801
> #37 qemu_main_loop () at ../system/runstate.c:834

which is in block/io.c:

> /* Now poll the in-flight requests */
> AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll());

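For the main-loop thread, that macro essentially boils down to a blocking
poll loop (a simplified sketch, not the actual macro, which additionally
tracks waiters and has assertions):

> /* simplified sketch of AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll()) */
> while (bdrv_drain_all_poll()) {
>     /* block until an event source in the main AioContext fires */
>     aio_poll(qemu_get_aio_context(), true);
> }

So the main thread can only leave this loop once all in-flight requests
have completed.
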
The working theory is: the deadlock happens because the IO is issued
from the process_savevm_co() coroutine, which doesn't get scheduled
again to complete the in-flight requests while the main thread is stuck
there polling. The main thread itself is the one that would need to
schedule it. In the case of a vCPU triggering the VirtIO SCSI dataplane
stop, which happens during (Linux) boot, the vCPU thread holds the
big QEMU lock (BQL), blocking the main thread from making progress on
scheduling the process_savevm_co() coroutine.

This change should also help in general to reduce load on the main
thread and keep it from getting stuck on IO, i.e. it brings the same
benefits as using a dedicated IO thread for regular drives. This is
particularly interesting when the VM state storage is network storage
like NFS.

With some luck, it could also help with bug #6262 [0]. The failure
there happens while issuing, or right after, the savevm-start QMP
command, so the coroutine involved is most likely the
process_savevm_co() that was previously scheduled to the iohandler
context. Likely, something polls the iohandler context and tries to
enter the already scheduled coroutine, leading to the abort():

> qemu_aio_coroutine_enter: Co-routine was already scheduled in 'aio_co_schedule'

With a dedicated iothread, there hopefully is no such race.

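Condensed, and leaving out the error handling and cleanup that the diff
below actually adds, the new flow in qmp_savevm_start() is: create a
dedicated IOThread, move the state-file BlockBackend into its AioContext
and schedule the snapshot coroutine there instead of in the iohandler
context:

> /* sketch only - see the diff below for the real error handling */
> snap_state.iothread = iothread_create("__proxmox_savevm_async_iothread__", &local_err);
> blk_set_aio_context(snap_state.target, snap_state.iothread->ctx, &local_err);
> snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
> aio_co_schedule(snap_state.iothread->ctx, snap_state.co);

The IOThread's own event loop then drives the coroutine and the state
file IO, so neither depends on the (possibly blocked) main thread.
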
The comment above the code querying the pending bytes wrongly talked
about the "iothread lock" when it should have said "iohandler lock".
The former was even renamed to BQL (big QEMU lock) a few releases ago.
Even if it was not a typo to begin with, there are no AioContext locks
anymore.

[0]: https://bugzilla.proxmox.com/show_bug.cgi?id=6262

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
[WB: update to the changed error handling in the previous commit]
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
---
 migration/savevm-async.c | 42 ++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/migration/savevm-async.c b/migration/savevm-async.c
index d8d2c80475..11ea4c601d 100644
--- a/migration/savevm-async.c
+++ b/migration/savevm-async.c
@@ -25,6 +25,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/rcu.h"
 #include "qemu/yank.h"
+#include "sysemu/iothread.h"
 
 /* #define DEBUG_SAVEVM_STATE */
 
@@ -57,6 +58,7 @@ static struct SnapshotState {
     QEMUBH *finalize_bh;
     Coroutine *co;
     QemuCoSleep target_close_wait;
+    IOThread *iothread;
 } snap_state;
 
 static bool savevm_aborted(void)
@@ -256,16 +258,13 @@ static void coroutine_fn process_savevm_co(void *opaque)
         uint64_t threshold = 400 * 1000;
 
         /*
-         * pending_{estimate,exact} are expected to be called without iothread
-         * lock. Similar to what is done in migration.c, call the exact variant
-         * only once pend_precopy in the estimate is below the threshold.
+         * Similar to what is done in migration.c, call the exact variant only
+         * once pend_precopy in the estimate is below the threshold.
          */
-        bql_unlock();
         qemu_savevm_state_pending_estimate(&pend_precopy, &pend_postcopy);
         if (pend_precopy <= threshold) {
             qemu_savevm_state_pending_exact(&pend_precopy, &pend_postcopy);
         }
-        bql_lock();
         pending_size = pend_precopy + pend_postcopy;
 
         /*
@@ -332,11 +331,17 @@ static void coroutine_fn process_savevm_co(void *opaque)
     qemu_bh_schedule(snap_state.finalize_bh);
 }
 
+static void savevm_cleanup_iothread(void) {
+    if (snap_state.iothread) {
+        iothread_destroy(snap_state.iothread);
+        snap_state.iothread = NULL;
+    }
+}
+
 void qmp_savevm_start(const char *statefile, Error **errp)
 {
     Error *local_err = NULL;
     MigrationState *ms = migrate_get_current();
-    AioContext *iohandler_ctx = iohandler_get_aio_context();
     BlockDriverState *target_bs = NULL;
     int ret = 0;
 
@@ -374,6 +379,19 @@ void qmp_savevm_start(const char *statefile, Error **errp)
         goto fail;
     }
 
+    if (snap_state.iothread) {
+        /* This is not expected, so warn about it, but no point in re-creating a new iothread. */
+        warn_report("iothread for snapshot already exists - re-using");
+    } else {
+        snap_state.iothread =
+            iothread_create("__proxmox_savevm_async_iothread__", &local_err);
+        if (!snap_state.iothread) {
+            error_setg(errp, "creating iothread failed: %s",
+                       local_err ? error_get_pretty(local_err) : "unknown error");
+            goto fail;
+        }
+    }
+
     /* Open the image */
     QDict *options = NULL;
     options = qdict_new();
@@ -422,22 +440,20 @@ void qmp_savevm_start(const char *statefile, Error **errp)
         goto fail;
     }
 
-    /* Async processing from here on out happens in iohandler context, so let
-     * the target bdrv have its home there.
-     */
-    ret = blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err);
+    ret = blk_set_aio_context(snap_state.target, snap_state.iothread->ctx, &local_err);
     if (ret != 0) {
-        warn_report("failed to set iohandler context for VM state target: %s %s",
+        warn_report("failed to set iothread context for VM state target: %s %s",
                     local_err ? error_get_pretty(local_err) : "unknown error",
                     strerror(-ret));
     }
 
     snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
-    aio_co_schedule(iohandler_ctx, snap_state.co);
+    aio_co_schedule(snap_state.iothread->ctx, snap_state.co);
 
     return;
 
 fail:
+    savevm_cleanup_iothread();
     save_snapshot_error("setup failed");
 }
 
@@ -463,6 +479,8 @@ static void coroutine_fn wait_for_close_co(void *opaque)
         DPRINTF("savevm-end: no target file open\n");
     }
 
+    savevm_cleanup_iothread();
+
     // File closed and no other error, so ensure next snapshot can be started.
     if (snap_state.state != SAVE_STATE_ERROR) {
         snap_state.state = SAVE_STATE_DONE;