From 0783c3c2714094290d5868b640b3f694a6524491 Mon Sep 17 00:00:00 2001 From: Fabian Ebner Date: Fri, 29 Jan 2021 16:11:43 +0100 Subject: [PATCH] migration: move finishing block jobs to phase2 for better/uniform error handling avoids the possibility to die during phase3_cleanup and instead of needing to duplicate the cleanup ourselves, benefit from phase2_cleanup doing so. The duplicate cleanup was also very incomplete: it didn't stop the remote kvm process (leading to 'VM already running' when trying to migrate again afterwards), but it removed its disks, and it didn't unlock the config, didn't close the tunnel and didn't cancel the block-dirty bitmaps. Since migrate_cancel should do nothing after the (non-storage) migrate process has completed, even that cleanup step is fine here. Since phase3 is empty at the moment, the order of operations is still the same. Also add a test, that would complain about finish_tunnel not being called before this patch. That test also checks that local disks are not already removed before finishing the block jobs. Signed-off-by: Fabian Ebner --- PVE/QemuMigrate.pm | 23 ++++++++---------- test/MigrationTest/QemuMigrateMock.pm | 6 +++++ test/run_qemu_migrate_tests.pl | 35 +++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/PVE/QemuMigrate.pm b/PVE/QemuMigrate.pm index b5036015..435c1f7c 100644 --- a/PVE/QemuMigrate.pm +++ b/PVE/QemuMigrate.pm @@ -1134,6 +1134,16 @@ sub phase2 { die "unable to parse migration status '$stat->{status}' - aborting\n"; } } + + if ($self->{storage_migration}) { + # finish block-job with block-job-cancel, to disconnect source VM from NBD + # to avoid it trying to re-establish it. 
We are in blockjob ready state, + # thus, this command changes it to blockjob complete (see qapi docs) + eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); }; + if (my $err = $@) { + die "Failed to complete storage migration: $err\n"; + } + } } sub phase2_cleanup { @@ -1209,19 +1219,6 @@ sub phase3_cleanup { my $tunnel = $self->{tunnel}; - if ($self->{storage_migration}) { - # finish block-job with block-job-cancel, to disconnect source VM from NBD - # to avoid it trying to re-establish it. We are in blockjob ready state, - # thus, this command changes to it to blockjob complete (see qapi docs) - eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); }; - - if (my $err = $@) { - eval { PVE::QemuServer::qemu_blockjobs_cancel($vmid, $self->{storage_migration_jobs}) }; - eval { PVE::QemuMigrate::cleanup_remotedisks($self) }; - die "Failed to complete storage migration: $err\n"; - } - } - if ($self->{volume_map}) { my $target_drives = $self->{target_drive}; diff --git a/test/MigrationTest/QemuMigrateMock.pm b/test/MigrationTest/QemuMigrateMock.pm index 2d424e01..8e0b7d09 100644 --- a/test/MigrationTest/QemuMigrateMock.pm +++ b/test/MigrationTest/QemuMigrateMock.pm @@ -139,6 +139,12 @@ $MigrationTest::Shared::qemu_server_module->mock( file_set_contents("${RUN_DIR_PATH}/nbd_info", to_json($nbd_info)); }, qemu_drive_mirror_monitor => sub { + my ($vmid, $vmiddst, $jobs, $completion, $qga) = @_; + + if ($fail_config->{qemu_drive_mirror_monitor} && + $fail_config->{qemu_drive_mirror_monitor} eq $completion) { + die "qemu_drive_mirror_monitor '$completion' error\n"; + } return; }, set_migration_caps => sub { diff --git a/test/run_qemu_migrate_tests.pl b/test/run_qemu_migrate_tests.pl index 4f7f021e..5edea7b5 100755 --- a/test/run_qemu_migrate_tests.pl +++ b/test/run_qemu_migrate_tests.pl @@ -1444,6 +1444,41 @@ my $tests = [ }, }, }, + { + name => 
'149_running_unused_block_job_cancel_fail', + target => 'pve1', + vmid => 149, + vm_status => { + running => 1, + runningmachine => 'pc-q35-5.0+pve0', + }, + opts => { + online => 1, + 'with-local-disks' => 1, + }, + config_patch => { + scsi1 => undef, + unused0 => 'local-dir:149/vm-149-disk-0.qcow2', + }, + expected_calls => {}, + expect_die => "qemu_drive_mirror_monitor 'cancel' error", + # note that 'cancel' is also used to finish and that's what this test is about + fail_config => { + 'qemu_drive_mirror_monitor' => 'cancel', + }, + expected => { + source_volids => local_volids_for_vm(149), + target_volids => {}, + vm_config => get_patched_config(149, { + scsi1 => undef, + unused0 => 'local-dir:149/vm-149-disk-0.qcow2', + }), + vm_status => { + running => 1, + runningmachine => 'pc-q35-5.0+pve0', + }, + }, + }, { name => '149_offline', target => 'pve1',