PVE backup: improve error handling for fleecing

See Fiona's inner commit for details. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
2025-10-04 11:59:19 +00:00 · 2024-11-12 16:47:40 +01:00 · 2024-11-12 16:47:40 +01:00 · 9e8ef15831
commit 9e8ef15831
parent 531db7df01
2 changed files with 104 additions and 0 deletions
--- a/debian/patches/pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch
+++ b/debian/patches/pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch
@ -0,0 +1,103 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 7 Nov 2024 17:51:14 +0100
+Subject: [PATCH] PVE backup: fixup error handling for fleecing
+
+The drained section needs to be terminated before breaking out of the
+loop in the error scenarios. Otherwise, guest IO on the drive would
+become stuck.
+
+If the job is created successfully, then the job completion callback
+will clean up the snapshot access block nodes. In case failure
+happened before the job is created, there was no cleanup for the
+snapshot access block nodes yet. Add it.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ pve-backup.c | 38 +++++++++++++++++++++++++-------------
+ 1 file changed, 25 insertions(+), 13 deletions(-)
+
+diff --git a/pve-backup.c b/pve-backup.c
+index 4e730aa3da..c4178758b3 100644
+--- a/pve-backup.c
+++ b/pve-backup.c
+@@ -357,22 +357,23 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
+     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+ }
+ 
+-static void pvebackup_complete_cb(void *opaque, int ret)
+static void cleanup_snapshot_access(PVEBackupDevInfo *di)
+ {
+-    PVEBackupDevInfo *di = opaque;
+-    di->completed_ret = ret;
+-
+-    /*
+-     * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
+-     * won't be done as a coroutine anyways:
+-     * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
+-     *   just spawn a BH calling bdrv_unref().
+-     * - For cbw, draining would need to spawn a BH.
+-     */
+     if (di->fleecing.snapshot_access) {
+         bdrv_unref(di->fleecing.snapshot_access);
+         di->fleecing.snapshot_access = NULL;
+     }
+    if (di->fleecing.cbw) {
+        bdrv_cbw_drop(di->fleecing.cbw);
+        di->fleecing.cbw = NULL;
+    }
+}
+
+static void pvebackup_complete_cb(void *opaque, int ret)
+{
+    PVEBackupDevInfo *di = opaque;
+    di->completed_ret = ret;
+
+     if (di->fleecing.cbw) {
+         /*
+          * With fleecing, failure for cbw does not fail the guest write, but only sets the snapshot
+@@ -383,10 +384,17 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+         if (di->completed_ret == -EACCES && snapshot_error) {
+             di->completed_ret = snapshot_error;
+         }
+-        bdrv_cbw_drop(di->fleecing.cbw);
+-        di->fleecing.cbw = NULL;
+     }
+ 
+    /*
+     * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
+     * won't be done as a coroutine anyways:
+     * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
+     *   just spawn a BH calling bdrv_unref().
+     * - For cbw, draining would need to spawn a BH.
+     */
+    cleanup_snapshot_access(di);
+
+     /*
+      * Needs to happen outside of coroutine, because it takes the graph write lock.
+      */
+@@ -587,6 +595,7 @@ static void create_backup_jobs_bh(void *opaque) {
+             if (!di->fleecing.cbw) {
+                 error_setg(errp, "appending cbw node for fleecing failed: %s",
+                            local_err ? error_get_pretty(local_err) : "unknown error");
+                bdrv_drained_end(di->bs);
+                 break;
+             }
+ 
+@@ -599,6 +608,8 @@ static void create_backup_jobs_bh(void *opaque) {
+             if (!di->fleecing.snapshot_access) {
+                 error_setg(errp, "setting up snapshot access for fleecing failed: %s",
+                            local_err ? error_get_pretty(local_err) : "unknown error");
+                cleanup_snapshot_access(di);
+                bdrv_drained_end(di->bs);
+                 break;
+             }
+             source_bs = di->fleecing.snapshot_access;
+@@ -637,6 +648,7 @@ static void create_backup_jobs_bh(void *opaque) {
+         }
+ 
+         if (!job || local_err) {
+            cleanup_snapshot_access(di);
+             error_setg(errp, "backup_job_create failed: %s",
+                        local_err ? error_get_pretty(local_err) : "null");
+             break;
--- a/debian/patches/series
+++ b/debian/patches/series
@ -88,3 +88,4 @@ pve/0044-copy-before-write-allow-specifying-minimum-cluster-s.patch
 pve/0045-backup-add-minimum-cluster-size-to-performance-optio.patch
 pve/0046-PVE-backup-add-fleecing-option.patch
 pve/0047-PVE-backup-improve-error-when-copy-before-write-fail.patch
+pve/0048-PVE-backup-fixup-error-handling-for-fleecing.patch