From 3e24733bdf9bef232be8e5205bf5e73338e65bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= Date: Thu, 23 May 2019 21:22:22 +0200 Subject: [PATCH] vm_resume: correctly honor $nocheck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for both vm_mon_cmd calls. under certain circumstances, the following sequence of events can otherwise fail when live-migrating under load: S...source node T...target node 0: migration is complete, handover from S to T starts 1: S: logically move VM config file from S to T via rename() 2: S: rename returns, config file is (visibly) moved on S 3: S: trigger resume on T via mtunnel 4a: T: call vm_resume while config file move is not yet visible on T 4b: T: call vm_resume while config file move is already visible on T 4a instead of 4b means vm_mon_cmd will die in check_running unless vm_mon_cmd_nocheck is used. under heavy pmxcfs load and a slow cluster/corosync network, there can be a few seconds of delay between 1 and 2, with a subsequent race ending in 4a instead of 4b. this issue was reported to occur on bulk migrations. Signed-off-by: Fabian Grünbichler --- PVE/QemuServer.pm | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm index c342addc..1a22fb47 100644 --- a/PVE/QemuServer.pm +++ b/PVE/QemuServer.pm @@ -5766,8 +5766,8 @@ sub vm_resume { my ($vmid, $skiplock, $nocheck) = @_; PVE::QemuConfig->lock_config($vmid, sub { - - my $res = vm_mon_cmd($vmid, 'query-status'); + my $vm_mon_cmd = $nocheck ? \&vm_mon_cmd_nocheck : \&vm_mon_cmd; + my $res = $vm_mon_cmd->($vmid, 'query-status'); my $resume_cmd = 'cont'; if ($res->{status} && $res->{status} eq 'suspended') { @@ -5780,12 +5780,9 @@ sub vm_resume { PVE::QemuConfig->check_lock($conf) if !($skiplock || PVE::QemuConfig->has_lock($conf, 'backup')); - - vm_mon_cmd($vmid, $resume_cmd); - - } else { - vm_mon_cmd_nocheck($vmid, $resume_cmd); } + + $vm_mon_cmd->($vmid, $resume_cmd); }); }