mirror of
https://git.proxmox.com/git/qemu-server
synced 2025-04-29 20:18:39 +00:00
fix #5440: vzdump: better cleanup fleecing images after hard errors
By recording the allocated fleecing images in the VM config, they are not immediately orphaned, should a hard error occur during backup that prevents cleanup. Cleaning them up is attempted during the next backup run. In the cleanup helper, check if fleecing images are still attached in QEMU and detach them. This allows recovering from more failure scenarios. However, to avoid a deadlock, a left-over backup job needs to be canceled first. While canceling a left-over backup already happens when cleanup is done for a subsequent backup, it is also required for other cases, like cleanup before migration (to be added in a following commit). Suggested-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> Signed-off-by: Fiona Ebner <f.ebner@proxmox.com> Link: https://lore.proxmox.com/20250127112923.31703-15-f.ebner@proxmox.com
This commit is contained in:
parent
a82c4555e3
commit
a39866732f
@ -13,6 +13,7 @@ use PVE::QemuServer::Monitor qw(mon_cmd);
|
|||||||
use PVE::QemuServer;
|
use PVE::QemuServer;
|
||||||
use PVE::QemuServer::Machine;
|
use PVE::QemuServer::Machine;
|
||||||
use PVE::QemuServer::Memory qw(get_current_memory);
|
use PVE::QemuServer::Memory qw(get_current_memory);
|
||||||
|
use PVE::RESTEnvironment qw(log_warn);
|
||||||
use PVE::Storage;
|
use PVE::Storage;
|
||||||
use PVE::Tools;
|
use PVE::Tools;
|
||||||
use PVE::Format qw(render_bytes render_duration);
|
use PVE::Format qw(render_bytes render_duration);
|
||||||
@ -578,4 +579,85 @@ sub has_cloudinit {
|
|||||||
return $found;
|
return $found;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Record the given fleecing volume IDs in the 'fleecing' special section of the VM configuration.
#
# $vmid   - ID of the VM whose configuration is updated
# $volids - array reference with the volume IDs to record
#
# The volume IDs are stored comma-separated under the 'fleecing-images' key, replacing whatever
# value was there before. Because of that, the caller is expected to deal with volumes from an
# already existing 'fleecing' special section in the configuration first.
#
# Returns early, without touching the configuration, when the list is empty.
sub record_fleecing_images {
    my ($vmid, $volids) = @_;

    # nothing to record, so do not load or rewrite the configuration at all
    return if !scalar($volids->@*);

    # load, modify and write back - executed via lock_config() below
    my $update_config = sub {
        my $conf = PVE::QemuConfig->load_config($vmid);
        my $image_list = join(',', $volids->@*);
        $conf->{'special-sections'}->{fleecing}->{'fleecing-images'} = $image_list;
        PVE::QemuConfig->write_config($vmid, $conf);
    };

    PVE::QemuConfig->lock_config($vmid, $update_config);
}
|
||||||
|
|
||||||
|
# Remove the fleecing images recorded in the 'fleecing' special section of the VM configuration.
#
# $vmid     - ID of the VM
# $storecfg - storage configuration, passed on to PVE::Storage::vdisk_free()
# $log_func - optional callback invoked as $log_func->($level, $message); if not given, 'info'
#             messages are printed to STDOUT and all other levels are passed to log_warn()
#
# Will also cancel a running backup job inside QEMU. Not doing so can lead to a deadlock when
# attempting to detach the fleecing image.
#
# Volumes that could not be freed are recorded in the configuration again via
# record_fleecing_images().
sub cleanup_fleecing_images {
    my ($vmid, $storecfg, $log_func) = @_;

    # fallback logger for callers that do not provide one
    $log_func ||= sub {
        my ($level, $line) = @_;
        chomp($line);
        if ($level ne 'info') {
            log_warn($line);
        } else {
            print "$line\n";
        }
    };

    # cancel left-over backup job and detach any left-over images from a running VM
    if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
        eval {
            my $status = mon_cmd($vmid, 'query-backup');
            my $state = $status ? $status->{status} : undef;
            if ($state && $state eq 'active') {
                $log_func->('warn', "left-over backup job still running inside QEMU - canceling now");
                mon_cmd($vmid, 'backup-cancel');
            }
        };
        if (my $err = $@) {
            # failure to check/cancel is only logged, detaching is still attempted below
            $log_func->('warn', "checking/canceling old backup job failed - $err");
        }

        # not wrapped in eval: an error from this query propagates to the caller
        my $blockdevs = mon_cmd($vmid, "query-block");
        for my $blockdev ($blockdevs->@*) {
            my $name = $blockdev->{device};
            next unless $name =~ m/-fleecing$/;

            $log_func->('info', "detaching (old) fleecing image for '$name'");

            my $drive_id = ($name =~ s/^drive-//r); # re-added by qemu_drivedel()
            eval { PVE::QemuServer::qemu_drivedel($vmid, $drive_id) };
            $log_func->('warn', "error detaching (old) fleecing image '$drive_id' - $@") if $@;
        }
    }

    # take the recorded volume IDs out of the configuration while holding the config lock
    my $recorded = [];
    PVE::QemuConfig->lock_config($vmid, sub {
        my $conf = PVE::QemuConfig->load_config($vmid);
        my $sections = $conf->{'special-sections'};
        my $fleecing = $sections->{fleecing};
        return if !$fleecing;

        $recorded = [PVE::Tools::split_list($fleecing->{'fleecing-images'})];
        delete $fleecing->{'fleecing-images'};
        # drop the whole section if no other key is left in it
        delete $sections->{fleecing} if scalar(keys $fleecing->%*) == 0;
        PVE::QemuConfig->write_config($vmid, $conf);
    });

    # free the volumes, remembering those where freeing failed
    my $failed = [];
    for my $volid ($recorded->@*) {
        $log_func->('info', "removing (old) fleecing image '$volid'");
        eval { PVE::Storage::vdisk_free($storecfg, $volid); };
        next if !$@;

        $log_func->('warn', "error removing fleecing image '$volid' - $@");
        push $failed->@*, $volid;
    }

    # re-record the failed volumes so they do not become orphaned
    record_fleecing_images($vmid, $failed);
}
|
||||||
|
|
||||||
1;
|
1;
|
||||||
|
@ -533,15 +533,25 @@ sub get_and_check_pbs_encryption_config {
|
|||||||
die "internal error - unhandled case for getting & checking PBS encryption ($keyfile, $master_keyfile)!";
|
die "internal error - unhandled case for getting & checking PBS encryption ($keyfile, $master_keyfile)!";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Helper is intended to be called from allocate_fleecing_images() only. Otherwise, fleecing volids
|
||||||
|
# have already been recorded in the configuration and PVE::QemuConfig::cleanup_fleecing_images()
|
||||||
|
# should be used instead.
|
||||||
my sub cleanup_fleecing_images {
|
my sub cleanup_fleecing_images {
|
||||||
my ($self, $disks) = @_;
|
my ($self, $vmid, $disks) = @_;
|
||||||
|
|
||||||
|
my $failed = [];
|
||||||
|
|
||||||
for my $di ($disks->@*) {
|
for my $di ($disks->@*) {
|
||||||
if (my $volid = $di->{'fleece-volid'}) {
|
if (my $volid = $di->{'fleece-volid'}) {
|
||||||
eval { PVE::Storage::vdisk_free($self->{storecfg}, $volid); };
|
eval { PVE::Storage::vdisk_free($self->{storecfg}, $volid); };
|
||||||
$self->log('warn', "error removing fleecing image '$volid' - $@") if $@;
|
if (my $err = $@) {
|
||||||
|
$self->log('warn', "error removing fleecing image '$volid' - $err");
|
||||||
|
push $failed->@*, $volid;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PVE::QemuConfig::record_fleecing_images($vmid, $failed);
|
||||||
}
|
}
|
||||||
|
|
||||||
my sub allocate_fleecing_images {
|
my sub allocate_fleecing_images {
|
||||||
@ -549,8 +559,7 @@ my sub allocate_fleecing_images {
|
|||||||
|
|
||||||
die "internal error - no fleecing storage specified\n" if !$fleecing_storeid;
|
die "internal error - no fleecing storage specified\n" if !$fleecing_storeid;
|
||||||
|
|
||||||
# TODO what about potential left-over images from a failed attempt? Just
|
my $fleece_volids = [];
|
||||||
# auto-remove? While unlikely, could conflict with manually created image from user...
|
|
||||||
|
|
||||||
eval {
|
eval {
|
||||||
my $n = 0; # counter for fleecing image names
|
my $n = 0; # counter for fleecing image names
|
||||||
@ -567,6 +576,8 @@ my sub allocate_fleecing_images {
|
|||||||
$di->{'fleece-volid'} = PVE::Storage::vdisk_alloc(
|
$di->{'fleece-volid'} = PVE::Storage::vdisk_alloc(
|
||||||
$self->{storecfg}, $fleecing_storeid, $vmid, $format, $name, $size);
|
$self->{storecfg}, $fleecing_storeid, $vmid, $format, $name, $size);
|
||||||
|
|
||||||
|
push $fleece_volids->@*, $di->{'fleece-volid'};
|
||||||
|
|
||||||
$n++;
|
$n++;
|
||||||
} else {
|
} else {
|
||||||
die "implement me (type '$di->{type}')";
|
die "implement me (type '$di->{type}')";
|
||||||
@ -574,9 +585,11 @@ my sub allocate_fleecing_images {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
if (my $err = $@) {
|
if (my $err = $@) {
|
||||||
cleanup_fleecing_images($self, $disks);
|
cleanup_fleecing_images($self, $vmid, $disks);
|
||||||
die $err;
|
die $err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PVE::QemuConfig::record_fleecing_images($vmid, $fleece_volids);
|
||||||
}
|
}
|
||||||
|
|
||||||
my sub detach_fleecing_images {
|
my sub detach_fleecing_images {
|
||||||
@ -636,6 +649,13 @@ my sub check_and_prepare_fleecing {
|
|||||||
$use_fleecing = 0;
|
$use_fleecing = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# clean up potential left-overs from a previous attempt
|
||||||
|
eval {
|
||||||
|
PVE::QemuConfig::cleanup_fleecing_images(
|
||||||
|
$vmid, $self->{storecfg}, sub { $self->log($_[0], $_[1]); });
|
||||||
|
};
|
||||||
|
$self->log('warn', "attempt to clean up left-over fleecing images failed - $@") if $@;
|
||||||
|
|
||||||
if ($use_fleecing) {
|
if ($use_fleecing) {
|
||||||
my ($default_format, $valid_formats) = PVE::Storage::storage_default_format(
|
my ($default_format, $valid_formats) = PVE::Storage::storage_default_format(
|
||||||
$self->{storecfg}, $fleecing_opts->{storage});
|
$self->{storecfg}, $fleecing_opts->{storage});
|
||||||
@ -1132,7 +1152,11 @@ sub cleanup {
|
|||||||
# If VM was started only for backup, it is already stopped now.
|
# If VM was started only for backup, it is already stopped now.
|
||||||
if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
|
if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
|
||||||
$detach_tpmstate_drive->($task, $vmid);
|
$detach_tpmstate_drive->($task, $vmid);
|
||||||
detach_fleecing_images($task->{disks}, $vmid) if $task->{'use-fleecing'};
|
if ($task->{'use-fleecing'}) {
|
||||||
|
detach_fleecing_images($task->{disks}, $vmid);
|
||||||
|
PVE::QemuConfig::cleanup_fleecing_images(
|
||||||
|
$vmid, $self->{storecfg}, sub { $self->log($_[0], $_[1]); });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_fleecing_images($self, $task->{disks}) if $task->{'use-fleecing'};
|
cleanup_fleecing_images($self, $task->{disks}) if $task->{'use-fleecing'};
|
||||||
|
Loading…
Reference in New Issue
Block a user