mirror of
https://git.proxmox.com/git/qemu-server
synced 2025-07-23 13:39:55 +00:00
migration: avoid crash with heavy IO on local VM disk
There is a possibility that the drive-mirror job is not yet done when the migration wants to inactivate the source's block drives, which makes QEMU abort with:

> bdrv_co_write_req_prepare: Assertion `!(bs->open_flags & BDRV_O_INACTIVE)' failed.

This can be prevented by using the 'write-blocking' copy mode (also called active mode) for the mirror. However, with active mode, the guest write speed is limited by the synchronous writes to the mirror target. For this reason, a way to start out in the faster 'background' mode and later switch to active mode was introduced in QEMU 8.2.

The switch is done once the mirror jobs for all drives are ready to be completed, to reduce the time during which guest IO is limited.

The loop waiting for actively-synced to become true is not an endless loop: Once the remaining dirty parts have been mirrored by the background iteration, the actively-synced flag will be set. Because the 'block-job-change' QMP command already succeeded, new writes will be done synchronously to the target and thus not lead to new dirty parts. If the job fails or vanishes (shouldn't actually happen, because auto-dismiss is false), the loop will be exited and the error propagated.

The crash has been reported rarely, but steadily over the years:
- https://forum.proxmox.com/threads/78954/post-353651
- https://forum.proxmox.com/threads/78954/post-380015
- https://forum.proxmox.com/threads/100020/post-431660
- https://forum.proxmox.com/threads/111831/post-482425
- https://forum.proxmox.com/threads/111831/post-499807
- https://forum.proxmox.com/threads/137849/

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
This commit is contained in:
parent
7b4fac1275
commit
60e1b142fb
@ -1139,6 +1139,14 @@ sub phase2 {
|
||||
$self->log('info', "$drive: start migration to $nbd_uri");
|
||||
PVE::QemuServer::qemu_drive_mirror($vmid, $drive, $nbd_uri, $vmid, undef, $self->{storage_migration_jobs}, 'skip', undef, $bwlimit, $bitmap);
|
||||
}
|
||||
|
||||
if (PVE::QemuServer::Machine::runs_at_least_qemu_version($vmid, 8, 2)) {
|
||||
$self->log('info', "switching mirror jobs to actively synced mode");
|
||||
PVE::QemuServer::qemu_drive_mirror_switch_to_active_mode(
|
||||
$vmid,
|
||||
$self->{storage_migration_jobs},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
$self->log('info', "starting online/live migration on $migrate_uri");
|
||||
|
@ -8186,6 +8186,57 @@ sub qemu_blockjobs_cancel {
|
||||
}
|
||||
}
|
||||
|
||||
# Callers should version guard this (only available with a binary >= QEMU 8.2)
|
||||
# Ask QEMU to change the copy mode of every mirror job listed in $jobs to 'write-blocking' and
# then poll 'query-block-jobs' once per second until each of them reports 'actively-synced'.
#
# Parameters:
#   $vmid - ID of the VM, passed through to every monitor command.
#   $jobs - hash reference whose keys are the block job IDs; the values are not looked at here.
#
# Dies when:
#   - the 'block-job-change' command fails for a job,
#   - a job is missing from the 'query-block-jobs' result while waiting,
#   - a job is found in the 'concluded' state while waiting.
sub qemu_drive_mirror_switch_to_active_mode {
    my ($vmid, $jobs) = @_;

    # Job IDs for which the mode change was accepted but 'actively-synced' was not seen yet.
    my $pending = {};

    for my $job (sort keys $jobs->%*) {
        print "$job: switching to actively synced mode\n";

        my @change_args = (
            id => $job,
            type => 'mirror',
            'copy-mode' => 'write-blocking',
        );

        eval {
            mon_cmd($vmid, "block-job-change", @change_args);
            $pending->{$job} = 1;
        };
        if ($@) {
            die "could not switch mirror job $job to active mode - $@\n";
        }
    }

    while (1) {
        my $job_list = mon_cmd($vmid, "query-block-jobs");

        # Index the reported jobs by their 'device' field for direct lookup below.
        my $job_by_device = { map { ($_->{device} => $_) } $job_list->@* };

        # 'sort keys' yields a copy of the key list, so removing entries inside the loop is fine.
        for my $job (sort keys $pending->%*) {
            my $info = $job_by_device->{$job}
                or die "$job: vanished while switching to active mode\n";

            if ($info->{status} eq 'concluded') {
                qemu_handle_concluded_blockjob($vmid, $job, $info);
                # The 'concluded' state should occur here if and only if the job failed, so the
                # 'die' below should be unreachable, but play it safe.
                die "$job: expected job to have failed, but no error was set\n";
            }

            next if !$info->{'actively-synced'};

            print "$job: successfully switched to actively synced mode\n";
            delete $pending->{$job};
        }

        # Nothing left to wait for; leave without another sleep.
        last if !keys $pending->%*;

        sleep 1;
    }
}
|
||||
|
||||
# Check for bug #4525: drive-mirror will open the target drive with the same aio setting as the
|
||||
# source, but some storages have problems with io_uring, sometimes even leading to crashes.
|
||||
my sub clone_disk_check_io_uring {
|
||||
|
@ -152,6 +152,9 @@ $MigrationTest::Shared::qemu_server_module->mock(
|
||||
}
|
||||
return;
|
||||
},
|
||||
qemu_drive_mirror_switch_to_active_mode => sub {
|
||||
return;
|
||||
},
|
||||
set_migration_caps => sub {
|
||||
return;
|
||||
},
|
||||
@ -185,6 +188,9 @@ $qemu_server_machine_module->mock(
|
||||
if !defined($vm_status->{runningmachine});
|
||||
return $vm_status->{runningmachine};
|
||||
},
|
||||
runs_at_least_qemu_version => sub {
|
||||
return 1;
|
||||
},
|
||||
);
|
||||
|
||||
my $ssh_info_module = Test::MockModule->new("PVE::SSHInfo");
|
||||
|
Loading…
Reference in New Issue
Block a user