notifications: overhaul fence notification

- try to make template variable names more clear (in preparation
  for #6143)
- add common template variables (fqdn, hostname, cluster-name)
- Instead of dumping the status-data variable as a JSON blob we
  add template variables for the most useful information and
  render it in a structured manner

Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
This commit is contained in:
Lukas Wagner 2025-03-28 11:19:15 +01:00 committed by Thomas Lamprecht
parent e3622b0f11
commit 66eddda945
5 changed files with 111 additions and 29 deletions

View File

@ -3,6 +3,8 @@ package PVE::HA::NodeStatus;
use strict;
use warnings;
use PVE::Notify;
use JSON;
my $fence_delay = 60;
@ -195,15 +197,38 @@ my $send_fence_state_email = sub {
my $haenv = $self->{haenv};
my $status = $haenv->read_manager_status();
my $template_data = {
"status-data" => {
manager_status => $status,
node_status => $self->{status}
},
"node" => $node,
"subject-prefix" => $subject_prefix,
"subject" => $subject,
};
my $template_data = PVE::Notify::common_template_data();
# Those two are needed for the expected output for test cases,
# see src/PVE/HA/Sim/Env.pm
$template_data->{"fence-status"} = $subject;
$template_data->{"fence-prefix"} = $subject_prefix;
$template_data->{"is-success"} = 1 ? $subject_prefix eq "SUCCEED" : 0;
$template_data->{"failed-node"} = $node;
$template_data->{"master-node"} = $status->{master_node};
# There is a handlebars helper 'timestamp', we should not
# name a variable the same way.
$template_data->{"fence-timestamp"} = $status->{timestamp};
$template_data->{"nodes"} = [];
for my $key (sort keys $status->{node_status}->%*) {
push $template_data->{"nodes"}->@*, {
node => $key,
status => $status->{node_status}->{$key}
};
}
$template_data->{"resources"} = [];
for my $key (sort keys $status->{service_status}->%*) {
my $resource_status = $status->{service_status}->{$key};
push $template_data->{"resources"}->@*, {
resource => $key,
state => $resource_status->{state},
node => $resource_status->{node},
running => $resource_status->{running},
};
}
my $metadata_fields = {
type => 'fencing',

View File

@ -299,12 +299,12 @@ sub log {
sub send_notification {
my ($self, $template_name, $properties) = @_;
# The template for the subject is "{{subject-prefix}}: {{subject}}"
# The subject expected by the test cases is "{{fence-prefix}}: {{fence-status}}"
# We have to perform poor-man's template rendering to pass the test cases.
my $subject = "{{subject-prefix}}: {{subject}}";
$subject = $subject =~ s/\{\{subject-prefix}}/$properties->{"subject-prefix"}/r;
$subject = $subject =~ s/\{\{subject}}/$properties->{"subject"}/r;
my $subject = "{{fence-prefix}}: {{fence-status}}";
$subject = $subject =~ s/\{\{fence-prefix}}/$properties->{"fence-prefix"}/r;
$subject = $subject =~ s/\{\{fence-status}}/$properties->{"fence-status"}/r;
# only log subject, do not spam the logs
$self->log('email', $subject);

View File

@ -1,14 +1,43 @@
<html>
<body>
The node '{{node}}' failed and needs manual intervention.<br/><br/>
The node '{{failed-node}}' in cluster '{{cluster-name}}' failed and
needs manual intervention.<br/><br/>
The PVE HA manager tries to fence it and recover the configured HA resources to
a healthy node if possible.<br/><br/>
{{#if is-success~}}
The PVE HA manager successfully fenced '{{failed-node}}'.<br/><br/>
{{else}}
The PVE HA manager will now fence '{{failed-node}}'.<br/><br/>
{{/if}}
Current fence status: {{subject-prefix}}<br/>
{{subject}}<br/>
<b>Status:</b> {{fence-status}}<br/>
<b>Timestamp:</b> {{timestamp fence-timestamp}}<br/>
<h2 style="font-size: 1em">Overall Cluster status:</h2>
{{object status-data}}
<h2 style="font-size: 1em">Cluster Node Status:</h2>
<ul>
{{#each nodes}}
<li>
{{this.node}}: {{this.status}} {{#if (eq this.node ../master-node)}}[master]{{/if}}
</li>
{{/each}}
</ul>
<h2 style="font-size: 1em">HA Resources:</h2>
The following HA resources were running on the failed node and will be
recovered to a healthy node if possible:
<ul>
{{#each resources}}
{{#if (eq this.node ../failed-node)}}
<li>{{this.resource}} [{{this.node}}]: {{this.state}}</li>
{{/if}}
{{/each}}
</ul>
The other HA resources in this cluster are:
<ul>
{{#each resources}}
{{#if (ne this.node ../failed-node)}}
<li>{{this.resource}} [{{this.node}}]: {{this.state}}</li>
{{/if}}
{{/each}}
</ul>
</body>
</html>

View File

@ -1,11 +1,35 @@
The node '{{node}}' failed and needs manual intervention.
The node '{{failed-node}}' in cluster '{{cluster-name}}' failed
and needs manual intervention.
The PVE HA manager tries to fence it and recover the configured HA resources to
a healthy node if possible.
{{#if is-success~}}
The PVE HA manager successfully fenced '{{failed-node}}'.
{{else~}}
The PVE HA manager will now fence '{{failed-node}}'.
{{/if}}
Status: {{fence-status}}
Timestamp: {{timestamp fence-timestamp}}
Current fence status: {{subject-prefix}}
{{subject}}
Cluster Node Status:
--------------------
{{#each nodes~}}
- {{this.node}}: {{this.status}} {{#if (eq this.node ../master-node)}}[master]{{/if}}
{{/each}}
HA Resources:
-------------
The following HA resources were running on the failed node and will be
recovered to a healthy node if possible:
{{#each resources~}}
{{#if (eq this.node ../failed-node)~}}
- {{this.resource}} [{{this.node}}]: {{this.state}}
{{/if~}}
{{/each}}
The other HA resources in this cluster are:
{{#each resources~}}
{{#if (ne this.node ../failed-node)~}}
- {{this.resource}} [{{this.node}}]: {{this.state}}
{{/if~}}
{{/each~}}
Overall Cluster status:
-----------------------
{{object status-data}}

View File

@ -1 +1,5 @@
{{subject-prefix}}: {{subject}}
{{#if is-success~}}
Successfully fenced node '{{failed-node}}'
{{else}}
Trying to fence node '{{failed-node}}'
{{/if}}