migration: move finishing block jobs to phase2 for better/uniform error handling

This avoids the possibility of dying during phase3_cleanup; instead of needing
to duplicate the cleanup ourselves, we benefit from phase2_cleanup doing it.

The duplicated cleanup was also very incomplete: it did not stop the remote kvm
process (leading to 'VM already running' when trying to migrate again
afterwards), yet it did remove that process's disks, and it neither unlocked the
config, closed the tunnel, nor cancelled the block-dirty bitmaps.

Since migrate_cancel should do nothing after the (non-storage) migrate process
has completed, even that cleanup step is fine here.

Since phase3 is empty at the moment, the order of operations is still the same.

Also add a test that, before this patch, would complain about finish_tunnel not
being called. The test also checks that the local disks are not removed before
the block jobs are finished.

Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
Fabian Ebner 2021-01-29 16:11:43 +01:00 committed by Thomas Lamprecht
parent a6be63ac9b
commit 0783c3c271
3 changed files with 51 additions and 13 deletions
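
To illustrate the control flow the commit message relies on, here is a minimal, self-contained Perl sketch (not the actual QemuMigrate code; the phase names and the cleanup steps are taken from the commit message above, everything else is illustrative): an error raised while finishing the block jobs in phase2 reaches the caller, which then runs phase2_cleanup, so no partial cleanup needs to be duplicated in phase3_cleanup.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Illustrative only: an error in phase2 reaches the caller, which runs the
    # uniform phase2_cleanup path instead of a duplicated, partial cleanup.
    sub finish_block_jobs { die "mirror job failed\n" }    # simulate a failure

    sub phase2 {
        my ($self, $vmid) = @_;
        # ... RAM migration has finished at this point ...
        if ($self->{storage_migration}) {
            eval { finish_block_jobs($vmid) };
            die "Failed to complete storage migration: $@" if $@;
        }
    }

    sub phase2_cleanup {
        my ($self, $vmid) = @_;
        # steps the commit message lists as missing from the old ad-hoc cleanup
        print "cancel block jobs and remove remote disks\n";
        print "stop remote kvm process and unlock the config\n";
        print "close the tunnel and cancel block-dirty bitmaps\n";
    }

    my $self = { storage_migration => 1 };
    eval { phase2($self, 149) };
    phase2_cleanup($self, 149) if $@;    # caller triggers the uniform cleanup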


@@ -1134,6 +1134,16 @@ sub phase2 {
             die "unable to parse migration status '$stat->{status}' - aborting\n";
         }
     }
+    if ($self->{storage_migration}) {
+        # finish block-job with block-job-cancel, to disconnect source VM from NBD
+        # to avoid it trying to re-establish it. We are in blockjob ready state,
+        # thus, this command changes it to blockjob complete (see qapi docs)
+        eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); };
+        if (my $err = $@) {
+            die "Failed to complete storage migration: $err\n";
+        }
+    }
 }
 sub phase2_cleanup {
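
The comment in the added block refers to the QAPI-documented behaviour of block-job-cancel: once a drive-mirror job has reported BLOCK_JOB_READY, cancelling it ends the mirror with a BLOCK_JOB_COMPLETED event and without pivoting to the target, which is exactly what is wanted here to disconnect the source from NBD. Below is a hedged, standalone sketch of issuing that command over QMP from Perl; the socket path and the job name are assumptions, and a robust client would also skip interleaved asynchronous events.

    #!/usr/bin/perl
    use strict;
    use warnings;
    use IO::Socket::UNIX;
    use JSON qw(encode_json decode_json);

    # Connect to the VM's QMP monitor socket (path is an assumption).
    my $sock = IO::Socket::UNIX->new(Peer => '/var/run/qemu-server/149.qmp')
        or die "cannot connect to QMP socket: $!\n";

    # Send one QMP command and read one reply line (events are ignored here).
    sub qmp {
        my ($cmd, $args) = @_;
        print $sock encode_json({ execute => $cmd, ($args ? (arguments => $args) : ()) }), "\n";
        return decode_json(scalar <$sock>);
    }

    <$sock>;                    # discard the QMP greeting banner
    qmp('qmp_capabilities');    # enter command mode

    # Cancel a mirror job that is already in the 'ready' state; per the QAPI
    # docs this completes the job (BLOCK_JOB_COMPLETED) without switching the
    # VM to the target image. 'drive-scsi0' is a hypothetical job name.
    my $res = qmp('block-job-cancel', { device => 'drive-scsi0' });
    die "block-job-cancel failed: $res->{error}->{desc}\n" if $res->{error};
    print "mirror job for drive-scsi0 finished without pivot\n";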
@@ -1209,19 +1219,6 @@ sub phase3_cleanup {
     my $tunnel = $self->{tunnel};
-    if ($self->{storage_migration}) {
-        # finish block-job with block-job-cancel, to disconnect source VM from NBD
-        # to avoid it trying to re-establish it. We are in blockjob ready state,
-        # thus, this command changes it to blockjob complete (see qapi docs)
-        eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); };
-        if (my $err = $@) {
-            eval { PVE::QemuServer::qemu_blockjobs_cancel($vmid, $self->{storage_migration_jobs}) };
-            eval { PVE::QemuMigrate::cleanup_remotedisks($self) };
-            die "Failed to complete storage migration: $err\n";
-        }
-    }
     if ($self->{volume_map}) {
         my $target_drives = $self->{target_drive};

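The error path removed above used a common idiom in this code base: each best-effort cleanup step runs in its own eval so that one failing step does not prevent the others, and the original error is reported afterwards. A small self-contained sketch of that idiom (the step names are made up for illustration):

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Each best-effort step gets its own eval; failures are collected rather
    # than aborting the remaining steps.
    my @steps = (
        [ 'cancel block jobs'   => sub { die "job already gone\n" } ],
        [ 'remove remote disks' => sub { return 1 } ],
    );

    my @errors;
    for my $step (@steps) {
        my ($desc, $code) = @$step;
        eval { $code->() };
        push @errors, "$desc: $@" if $@;
    }

    warn "cleanup problems:\n", @errors if @errors;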

@@ -139,6 +139,12 @@ $MigrationTest::Shared::qemu_server_module->mock(
         file_set_contents("${RUN_DIR_PATH}/nbd_info", to_json($nbd_info));
     },
     qemu_drive_mirror_monitor => sub {
+        my ($vmid, $vmiddst, $jobs, $completion, $qga) = @_;
+        if ($fail_config->{qemu_drive_mirror_monitor} &&
+            $fail_config->{qemu_drive_mirror_monitor} eq $completion) {
+            die "qemu_drive_mirror_monitor '$completion' error\n";
+        }
         return;
     },
     set_migration_caps => sub {

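For readers unfamiliar with the test harness, the hunk above injects a failure into a mocked sub based on a shared fail_config hash. A standalone sketch of that pattern follows; it assumes Test::MockModule-style mocking, and the package and sub names are made up for illustration.

    #!/usr/bin/perl
    use strict;
    use warnings;
    use Test::MockModule;

    # A stand-in for the module under test; $VERSION keeps Test::MockModule
    # from trying to load it from disk.
    package My::Worker;
    our $VERSION = '0.01';
    sub drive_mirror_monitor {
        my ($vmid, $completion) = @_;
        return "real implementation for $vmid/$completion";
    }

    package main;

    # The test selects which completion mode should fail.
    my $fail_config = { drive_mirror_monitor => 'cancel' };

    my $mock = Test::MockModule->new('My::Worker');
    $mock->mock(drive_mirror_monitor => sub {
        my ($vmid, $completion) = @_;
        if ($fail_config->{drive_mirror_monitor}
            && $fail_config->{drive_mirror_monitor} eq $completion) {
            die "drive_mirror_monitor '$completion' error\n";
        }
        return;
    });

    # 'complete' passes, 'cancel' is configured to fail.
    for my $mode (qw(complete cancel)) {
        eval { My::Worker::drive_mirror_monitor(149, $mode) };
        print "$mode: ", ($@ ? "died with: $@" : "ok\n");
    }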

@@ -1444,6 +1444,41 @@ my $tests = [
             },
         },
     },
+    {
+        name => '149_running_unused_block_job_cancel_fail',
+        target => 'pve1',
+        vmid => 149,
+        vm_status => {
+            running => 1,
+            runningmachine => 'pc-q35-5.0+pve0',
+        },
+        opts => {
+            online => 1,
+            'with-local-disks' => 1,
+        },
+        config_patch => {
+            scsi1 => undef,
+            unused0 => 'local-dir:149/vm-149-disk-0.qcow2',
+        },
+        expected_calls => {},
+        expect_die => "qemu_drive_mirror_monitor 'cancel' error",
+        # note that 'cancel' is also used to finish and that's what this test is about
+        fail_config => {
+            'qemu_drive_mirror_monitor' => 'cancel',
+        },
+        expected => {
+            source_volids => local_volids_for_vm(149),
+            target_volids => {},
+            vm_config => get_patched_config(149, {
+                scsi1 => undef,
+                unused0 => 'local-dir:149/vm-149-disk-0.qcow2',
+            }),
+            vm_status => {
+                running => 1,
+                runningmachine => 'pc-q35-5.0+pve0',
+            },
+        },
+    },
     {
         name => '149_offline',
         target => 'pve1',