From 9e8ef1583103310556abb2a7b04b91caba22ed7a Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Tue, 12 Nov 2024 16:47:40 +0100 Subject: [PATCH] PVE backup: improve error handling for fleecing See Fiona's inner commit for details. Signed-off-by: Thomas Lamprecht --- ...up-fixup-error-handling-for-fleecing.patch | 103 ++++++++++++++++++ debian/patches/series | 1 + 2 files changed, 104 insertions(+) create mode 100644 debian/patches/pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch diff --git a/debian/patches/pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch b/debian/patches/pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch new file mode 100644 index 0000000..dc5e3f1 --- /dev/null +++ b/debian/patches/pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Thu, 7 Nov 2024 17:51:14 +0100 +Subject: [PATCH] PVE backup: fixup error handling for fleecing + +The drained section needs to be terminated before breaking out of the +loop in the error scenarios. Otherwise, guest IO on the drive would +become stuck. + +If the job is created successfully, then the job completion callback +will clean up the snapshot access block nodes. In case failure +happened before the job is created, there was no cleanup for the +snapshot access block nodes yet. Add it. + +Signed-off-by: Fiona Ebner +--- + pve-backup.c | 38 +++++++++++++++++++++++++------------- + 1 file changed, 25 insertions(+), 13 deletions(-) + +diff --git a/pve-backup.c b/pve-backup.c +index 4e730aa3da..c4178758b3 100644 +--- a/pve-backup.c ++++ b/pve-backup.c +@@ -357,22 +357,23 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque) + qemu_co_mutex_unlock(&backup_state.backup_mutex); + } + +-static void pvebackup_complete_cb(void *opaque, int ret) ++static void cleanup_snapshot_access(PVEBackupDevInfo *di) + { +- PVEBackupDevInfo *di = opaque; +- di->completed_ret = ret; +- +- /* +- * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work +- * won't be done as a coroutine anyways: +- * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would +- * just spawn a BH calling bdrv_unref(). +- * - For cbw, draining would need to spawn a BH. +- */ + if (di->fleecing.snapshot_access) { + bdrv_unref(di->fleecing.snapshot_access); + di->fleecing.snapshot_access = NULL; + } ++ if (di->fleecing.cbw) { ++ bdrv_cbw_drop(di->fleecing.cbw); ++ di->fleecing.cbw = NULL; ++ } ++} ++ ++static void pvebackup_complete_cb(void *opaque, int ret) ++{ ++ PVEBackupDevInfo *di = opaque; ++ di->completed_ret = ret; ++ + if (di->fleecing.cbw) { + /* + * With fleecing, failure for cbw does not fail the guest write, but only sets the snapshot +@@ -383,10 +384,17 @@ static void pvebackup_complete_cb(void *opaque, int ret) + if (di->completed_ret == -EACCES && snapshot_error) { + di->completed_ret = snapshot_error; + } +- bdrv_cbw_drop(di->fleecing.cbw); +- di->fleecing.cbw = NULL; + } + ++ /* ++ * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work ++ * won't be done as a coroutine anyways: ++ * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would ++ * just spawn a BH calling bdrv_unref(). ++ * - For cbw, draining would need to spawn a BH. ++ */ ++ cleanup_snapshot_access(di); ++ + /* + * Needs to happen outside of coroutine, because it takes the graph write lock. + */ +@@ -587,6 +595,7 @@ static void create_backup_jobs_bh(void *opaque) { + if (!di->fleecing.cbw) { + error_setg(errp, "appending cbw node for fleecing failed: %s", + local_err ? error_get_pretty(local_err) : "unknown error"); ++ bdrv_drained_end(di->bs); + break; + } + +@@ -599,6 +608,8 @@ static void create_backup_jobs_bh(void *opaque) { + if (!di->fleecing.snapshot_access) { + error_setg(errp, "setting up snapshot access for fleecing failed: %s", + local_err ? error_get_pretty(local_err) : "unknown error"); ++ cleanup_snapshot_access(di); ++ bdrv_drained_end(di->bs); + break; + } + source_bs = di->fleecing.snapshot_access; +@@ -637,6 +648,7 @@ static void create_backup_jobs_bh(void *opaque) { + } + + if (!job || local_err) { ++ cleanup_snapshot_access(di); + error_setg(errp, "backup_job_create failed: %s", + local_err ? error_get_pretty(local_err) : "null"); + break; diff --git a/debian/patches/series b/debian/patches/series index 4c53d28..8407fe6 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -88,3 +88,4 @@ pve/0044-copy-before-write-allow-specifying-minimum-cluster-s.patch pve/0045-backup-add-minimum-cluster-size-to-performance-optio.patch pve/0046-PVE-backup-add-fleecing-option.patch pve/0047-PVE-backup-improve-error-when-copy-before-write-fail.patch +pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch