pve-manager/PVE/API2/Replication.pm
Fabian Ebner b70458b6dd fix #4111: replication: don't send mail when fail count is zero
which can happen when failing to obtain the guest's migration lock.
This led to a lot of mails being sent during migration (timeout for
obtaining lock is only 2 seconds and we run it in a loop).

One could argue that obtaining the lock should increase the fail
count, but without the lock, the job state should not be touched and
even the first three mails upon migration could be considered spam.

Fixes: fa4bb659 ("replication: sent always mail for first three tries and move helper")
Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
2022-06-17 17:20:06 +02:00

417 lines
10 KiB
Perl

package PVE::API2::Replication;
use warnings;
use strict;
use PVE::JSONSchema qw(get_standard_option);
use PVE::RPCEnvironment;
use PVE::Format qw(render_timestamp);
use PVE::ProcFSTools;
use PVE::ReplicationConfig;
use PVE::ReplicationState;
use PVE::Replication;
use PVE::QemuConfig;
use PVE::QemuServer;
use PVE::LXC::Config;
use PVE::LXC;
use PVE::RESTHandler;
use base qw(PVE::RESTHandler);
# Path of the lock file that run_single_job() and run_jobs() hand to PVE::Tools::lock_file()
# (each with a 60 second timeout) around their replication work.
our $pvesr_lock_path = "/var/lock/pvesr.lck";
# Translate a guest type string into the name of the config class used for that guest type.
#
# $vmtype - 'qemu' or 'lxc'
#
# Returns the class name as a string; dies for any other guest type.
our $lookup_guest_class = sub {
    my ($vmtype) = @_;

    return 'PVE::QemuConfig' if $vmtype eq 'qemu';
    return 'PVE::LXC::Config' if $vmtype eq 'lxc';

    die "unknown guest type '$vmtype' - internal error";
};
# Run a single replication job right away while holding the pvesr lock.
#
# $jobid   - ID of the job, looked up in the replication configuration
# $now     - optional timestamp used instead of time(); passing it is useful for regression
#            testing
# $logfunc - handed through unchanged to PVE::Replication::run_replication()
#
# Dies if the job does not exist, is not of type 'local', is disabled, if its guest is unknown
# or not located on the local node, or if the job's target is the local node itself. Errors
# reported by PVE::Tools::lock_file() via $@ are re-thrown.
sub run_single_job {
    my ($jobid, $now, $logfunc) = @_;

    my $local_node = PVE::INotify::nodename();

    my $locked_work = sub {
        $now //= time();

        my $repl_cfg = PVE::ReplicationConfig->new();
        my $jobcfg = $repl_cfg->{ids}->{$jobid};

        if (!$jobcfg) {
            die "no such job '$jobid'\n";
        }
        if ($jobcfg->{type} ne 'local') {
            die "internal error - not implemented";
        }
        if ($jobcfg->{disable}) {
            die "job '$jobid' is disabled\n";
        }

        my $vmlist = PVE::Cluster::get_vmlist();
        my $vmid = $jobcfg->{guest};

        if (!$vmlist->{ids}->{$vmid}) {
            die "no such guest '$vmid'\n";
        }
        if ($vmlist->{ids}->{$vmid}->{node} ne $local_node) {
            die "guest '$vmid' is not on local node\n";
        }
        if ($jobcfg->{target} eq $local_node) {
            die "unable to sync to local node\n";
        }

        my $guest_class = $lookup_guest_class->($vmlist->{ids}->{$vmid}->{type});

        # the same timestamp serves as both time arguments of run_replication()
        PVE::Replication::run_replication($guest_class, $jobcfg, $now, $now, $logfunc);
    };

    # the return value is not used; failures are reported through $@
    my $lock_result = PVE::Tools::lock_file($pvesr_lock_path, 60, $locked_work);
    die $@ if $@;
}
# TODO: the two helpers below should probably be part of the general job framework/plugin system
# Decide whether a failure mail should be sent for the given fail count.
#
# $fail_count - number of failures recorded for the job
#
# Returns a false value for a count of 0, a true value for counts up to 3, and beyond that a
# true value only when the count equals 48 doubled zero or more times (48, 96, 192, ...).
my sub _should_mail_at_failcount {
    my ($fail_count) = @_;

    if ($fail_count == 0) {
        # avoid spam during migration (bug #4111): when failing to obtain the guest's migration
        # lock, fail_count will be 0
        return 0;
    }

    if ($fail_count <= 3) {
        # always send the first few for better visibility of the issue
        return 1;
    }

    # a failing job is re-tried every half hour, so 48 tries make up one day; aim for one mail
    # after 1, 2, 4, 8, etc. days
    my $threshold = 48;
    $threshold *= 2 while $threshold < $fail_count;

    return $threshold == $fail_count;
}
# Report a failed replication job: always emit a warning and, if requested, mail root about it.
#
# $job  - job hash; the id, target, schedule and next_sync entries are read here
# $err  - error text of the failed run
# $mail - when false, only the warning is emitted
#
# A mail is only sent when _should_mail_at_failcount() approves the job's current fail count.
# A failure to send the mail is reported as a warning and never propagated to the caller.
my sub _handle_job_err {
    my ($job, $err, $mail) = @_;

    warn "$job->{id}: got unexpected replication job error - $err";
    return if !$mail;

    my $state = PVE::ReplicationState::read_state();
    my $jobstate = PVE::ReplicationState::extract_job_state($state, $job);

    # a missing counter is treated like zero, so the numeric comparisons below never see undef
    my $fail_count = $jobstate->{fail_count} // 0;
    return if !_should_mail_at_failcount($fail_count);

    my $msg = "Replication job $job->{id} with target '$job->{target}' and schedule";
    $msg .= " '$job->{schedule}' failed!\n";

    $msg .= " Last successful sync: ";
    if (my $last_sync = $jobstate->{last_sync}) {
        $msg .= render_timestamp($last_sync) ."\n";
    } else {
        $msg .= "None/Unknown\n";
    }

    # not yet updated, so $job->next_sync here is actually the current one.
    # NOTE: Copied from PVE::ReplicationState::job_status()
    # The retry delay is 5 minutes per failure for the first three failures, 30 minutes after.
    my $next_sync = $job->{next_sync} + 60 * ($fail_count <= 3 ? 5 * $fail_count : 30);
    $msg .= " Next sync try: " . render_timestamp($next_sync) ."\n";
    $msg .= " Failure count: $fail_count\n";

    if ($fail_count == 3) {
        $msg .= "\nNote: The system will now reduce the frequency of error reports,";
        $msg .= " as the job appears to be stuck.\n";
    }

    $msg .= "\nError:\n$err";

    eval {
        PVE::Tools::sendmail('root', "Replication Job: $job->{id} failed", $msg);
    };
    if (my $mail_err = $@) {
        # name the job and the failed action, so the log line can be understood on its own
        warn "$job->{id}: unable to send replication failure mail - $mail_err";
    }

    return;
}
# Run the replication jobs handed out by PVE::ReplicationState::get_next_job(), one after the
# other, while holding the pvesr lock.
#
# $now     - optional timestamp used instead of time(); useful for regression testing
# $logfunc - handed through unchanged to PVE::Replication::run_replication()
# $verbose - handed through unchanged to PVE::Replication::run_replication(); useful for
#            regression testing
# $mail    - handed to _handle_job_err() for every job that failed
#
# An error of a single job is passed to _handle_job_err() and does not stop the loop. Errors
# reported by PVE::Tools::lock_file() via $@ are re-thrown.
sub run_jobs {
    my ($now, $logfunc, $verbose, $mail) = @_;

    # fixed for the whole run, unlike the per-job start time below
    my $iteration = $now // time();

    my $locked_work = sub {
        my $start_time = $now // time();

        PVE::ReplicationState::purge_old_states();

        while (1) {
            my $jobcfg = PVE::ReplicationState::get_next_job($iteration, $start_time);
            last if !$jobcfg;

            my $guest_class = $lookup_guest_class->($jobcfg->{vmtype});

            eval {
                PVE::Replication::run_replication(
                    $guest_class, $jobcfg, $iteration, $start_time, $logfunc, $verbose);
            };
            my $err = $@;
            _handle_job_err($jobcfg, $err, $mail) if $err;

            # refresh the start time before asking for the next job
            $start_time = $now // time();
        }
    };

    # the return value is not used; failures are reported through $@
    my $lock_result = PVE::Tools::lock_file($pvesr_lock_path, 60, $locked_work);
    die $@ if $@;
}
# Flatten a job entry for API output.
#
# $jobcfg - job hash; it is modified in place and also returned
# $jobid  - stored as 'id' in the result
#
# The nested 'state' entry is removed from the job hash. Its defined last_sync, last_try,
# fail_count, error and duration values are copied to the top level. 'pid' is only copied when
# the state holds both pid and ptime and PVE::ProcFSTools::check_process_running() confirms
# that process.
my $extract_job_status = sub {
    my ($jobcfg, $jobid) = @_;

    # Note: we modify $jobcfg
    my $state = delete $jobcfg->{state};

    $jobcfg->{id} = $jobid;

    for my $field (qw(last_sync last_try fail_count error duration)) {
        next if !defined($state->{$field});
        $jobcfg->{$field} = $state->{$field};
    }

    my $is_running = $state->{pid}
        && $state->{ptime}
        && PVE::ProcFSTools::check_process_running($state->{pid}, $state->{ptime});
    $jobcfg->{pid} = $state->{pid} if $is_running;

    return $jobcfg;
};
# API method 'status' (GET, path ''): list the replication jobs returned by
# PVE::ReplicationState::job_status(), optionally restricted to one guest, and limited to the
# guests on which the calling user holds VM.Audit.
__PACKAGE__->register_method ({
    name => 'status',
    path => '',
    method => 'GET',
    description => "List status of all replication jobs on this node.",
    permissions => {
        description => "Requires the VM.Audit permission on /vms/<vmid>.",
        user => 'all',
    },
    protected => 1,
    proxyto => 'node',
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            guest => get_standard_option('pve-vmid', {
                optional => 1,
                description => "Only list replication jobs for this guest.",
            }),
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {
                id => { type => 'string' },
            },
        },
        links => [ { rel => 'child', href => "{id}" } ],
    },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();
        my $authuser = $rpcenv->get_user();

        my $jobs = PVE::ReplicationState::job_status(1);

        my $res = [];
        foreach my $id (sort keys %$jobs) {
            my $data = $extract_job_status->($jobs->{$id}, $id);
            my $guest = $data->{guest};

            next if defined($param->{guest}) && $guest != $param->{guest};

            # Entries the user may not audit are skipped silently. The trailing 1 asks check()
            # for a boolean answer, so a single inaccessible guest cannot abort the listing.
            # NOTE(review): relies on the 4th argument of PVE::RPCEnvironment::check() being
            # its "noerr" flag - confirm against pve-access-control.
            next if !$rpcenv->check($authuser, "/vms/$guest", [ 'VM.Audit' ], 1);

            push @$res, $data;
        }

        return $res;
    }});
# API method 'index' (GET, path '{id}'): static directory listing naming the child entries
# 'schedule_now', 'log' and 'status' of a replication job.
__PACKAGE__->register_method ({
    name => 'index',
    path => '{id}',
    method => 'GET',
    description => "Directory index.",
    permissions => { user => 'all' },
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {},
        },
        links => [ { rel => 'child', href => "{name}" } ],
    },
    code => sub {
        my ($param) = @_;

        # one { name => ... } entry per child, in this fixed order
        my @children = qw(schedule_now log status);

        return [ map { +{ name => $_ } } @children ];
    }});
# API method 'job_status' (GET, path '{id}/status'): return the flattened status of a single
# replication job. Dies if the job is unknown; requires VM.Audit on the job's guest.
__PACKAGE__->register_method ({
    name => 'job_status',
    path => '{id}/status',
    method => 'GET',
    description => "Get replication job status.",
    permissions => {
        description => "Requires the VM.Audit permission on /vms/<vmid>.",
        user => 'all',
    },
    protected => 1,
    proxyto => 'node',
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
        },
    },
    returns => {
        type => "object",
        properties => {},
    },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();
        my $authuser = $rpcenv->get_user();

        my $all_jobs = PVE::ReplicationState::job_status();
        my $jobid = $param->{id};

        my $jobcfg = $all_jobs->{$jobid};
        if (!defined($jobcfg)) {
            die "no such replication job '$jobid'\n";
        }

        my $status = $extract_job_status->($jobcfg, $jobid);
        my $guest = $status->{guest};

        # NOTE(review): no 'use' line visible in this file imports raise_perm_exc - confirm
        # where it is expected to come from.
        if (!$rpcenv->check($authuser, "/vms/$guest", [ 'VM.Audit' ])) {
            raise_perm_exc();
        }

        return $status;
    }});
# API method 'read_job_log' (GET, path '{id}/log'): return lines of a replication job's log
# file, optionally windowed by 'start' and 'limit'. Dies if the job is unknown. Access needs
# VM.Audit on the job's guest or Sys.Audit on the node. The line count reported by
# PVE::Tools::dump_logfile() is published as the 'total' result attribute.
__PACKAGE__->register_method({
    name => 'read_job_log',
    path => '{id}/log',
    method => 'GET',
    permissions => {
        description => "Requires the VM.Audit permission on /vms/<vmid>, or 'Sys.Audit' on '/nodes/<node>'",
        user => 'all',
    },
    protected => 1,
    description => "Read replication job log.",
    proxyto => 'node',
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
            start => {
                type => 'integer',
                minimum => 0,
                optional => 1,
            },
            limit => {
                type => 'integer',
                minimum => 0,
                optional => 1,
            },
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {
                n => {
                    description=> "Line number",
                    type=> 'integer',
                },
                t => {
                    description=> "Line text",
                    type => 'string',
                }
            }
        }
    },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();
        my $authuser = $rpcenv->get_user();

        my $jobid = $param->{id};
        my $filename = PVE::ReplicationState::job_logfile_name($jobid);

        my $cfg = PVE::ReplicationConfig->new();
        my $data = $cfg->{ids}->{$jobid};
        die "no such replication job '$jobid'\n" if !defined($data);

        my $node = $param->{node};
        my $vmid = $data->{guest};

        # Either privilege is sufficient. The guest check gets the trailing 1 so that a user
        # without VM.Audit still has the Sys.Audit alternative evaluated instead of being
        # rejected by the first check alone.
        # NOTE(review): relies on the 4th argument of PVE::RPCEnvironment::check() being its
        # "noerr" flag - confirm against pve-access-control.
        my $allowed = $rpcenv->check($authuser, "/vms/$vmid", [ 'VM.Audit' ], 1)
            || $rpcenv->check($authuser, "/nodes/$node", [ 'Sys.Audit' ]);
        raise_perm_exc() if !$allowed;

        my ($count, $lines) = PVE::Tools::dump_logfile($filename, $param->{start}, $param->{limit});

        $rpcenv->set_result_attrib('total', $count);

        return $lines;
    }});
# API method 'schedule_now' (POST, path '{id}/schedule_now'): look the job up in the
# replication configuration and hand it to PVE::ReplicationState::schedule_job_now(). Dies if
# the job is unknown. The declared permission check is Datastore.Allocate on /storage.
__PACKAGE__->register_method ({
    name => 'schedule_now',
    path => '{id}/schedule_now',
    method => 'POST',
    description => "Schedule replication job to start as soon as possible.",
    protected => 1,
    proxyto => 'node',
    permissions => {
        check => ['perm', '/storage', ['Datastore.Allocate']],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            id => get_standard_option('pve-replication-id'),
            node => get_standard_option('pve-node'),
        },
    },
    returns => {
        type => 'string',
    },
    code => sub {
        my ($param) = @_;

        my $jobid = $param->{id};

        my $repl_cfg = PVE::ReplicationConfig->new();
        my $jobcfg = $repl_cfg->{ids}->{$jobid};
        if (!defined($jobcfg)) {
            die "no such replication job '$jobid'\n";
        }

        # whatever schedule_job_now() returns is the result of this API call
        return PVE::ReplicationState::schedule_job_now($jobcfg);
    }});
1;