pve-manager/PVE/API2/Cluster/MetricServer.pm
Lukas Wagner 073b53ae71 metrics: add /cluster/metrics/export endpoint
This new endpoint returns node, storage and guest metrics in JSON
format. The endpoint supports history/start-time parameters, allowing
the caller to query the recent metric history as recorded by the
PVE::PullMetric module.

The returned data format is quite simple, being an array of
metric records, including a value, a metric name, an id to identify
the object (e.g. qemu/100, node/foo), a timestamp and a type
('gauge', 'derive', ...). The latter property makes the format
self-describing and aids the metric collector in choosing a
representation for storing the metric data.

    [
        ...
        {
            "metric": "cpu_avg1",
            "value": 0.12,
            "timestamp": 170053205,
            "id": "node/foo",
            "type": "gauge"
        },
        ...
    ]

Some experiments were made in regards to making the format
more 'efficient', e.g. by grouping based on timestamps/ids, resulting
in a much more nested/complicated data format. While that
certainly reduces the size of the raw JSON response by quite a bit,
after GZIP compression the differences are negligible (the
simple, flat data format as described above compresses by a factor
of 25 for large clusters!). Also, the slightly increased CPU load
of compressing the larger amount of data when e.g. polling once a
minute is so small that it's indistinguishable from noise in relation
to a usual hypervisor workload. Thus the simpler format was
chosen. One benefit of this format is that it is more or less already
the exact same format as the one Prometheus uses, but in JSON format -
so adding a Prometheus metric scraping endpoint should not be much
work at all.

The API endpoint collects metrics for the whole cluster by calling
the same endpoint for all cluster nodes. To avoid endless request
recursion, the 'local-only' request parameter is provided. If this
parameter is set, the endpoint implementation will only return metrics
for the local node, avoiding a loop.

Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
[WB: remove unused $start_time leftover from benchmarks]
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
2024-08-14 14:18:48 +02:00

475 lines
11 KiB
Perl

package PVE::API2::Cluster::MetricServer;
use warnings;
use strict;
use PVE::Tools qw(extract_param extract_sensitive_params);
use PVE::Exception qw(raise_perm_exc raise_param_exc);
use PVE::JSONSchema qw(get_standard_option);
use PVE::INotify;
use PVE::RPCEnvironment;
use PVE::ExtMetric;
use PVE::PullMetric;
use PVE::SafeSyslog;
use PVE::RESTHandler;
use base qw(PVE::RESTHandler);
# Directory index of this API module.
#
# Returns one object per child path registered in this module so that
# clients can discover them through the 'child' link relation. Takes no
# parameters and is available to every authenticated user.
__PACKAGE__->register_method ({
    name => 'index',
    path => '',
    method => 'GET',
    description => "Metrics index.",
    permissions => { user => 'all' },
    parameters => {
        additionalProperties => 0,
        properties => {},
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {},
        },
        links => [ { rel => 'child', href => "{name}" } ],
    },
    code => sub {
        my ($param) = @_;

        # Keep this list in sync with the top-level paths registered in
        # this module ('server' and 'export').
        my $result = [
            { name => 'server' },
            { name => 'export' },
        ];

        return $result;
    }
});
# List all metric servers configured in 'status.cfg'.
#
# Returns an array with one summary object per configured entry (id,
# disable flag, plugin type, server and port), ordered by entry ID.
# Requires Sys.Audit on '/'.
__PACKAGE__->register_method ({
    name => 'server_index',
    path => 'server',
    method => 'GET',
    description => "List configured metric servers.",
    permissions => {
        check => ['perm', '/', ['Sys.Audit']],
    },
    parameters => {
        additionalProperties => 0,
        properties => {},
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {
                id => {
                    description => "The ID of the entry.",
                    type => 'string'
                },
                disable => {
                    description => "Flag to disable the plugin.",
                    type => 'boolean',
                },
                type => {
                    description => "Plugin type.",
                    type => 'string',
                },
                server => {
                    description => "Server dns name or IP address",
                    type => 'string',
                },
                port => {
                    description => "Server network port",
                    type => 'integer',
                },
            },
        },
        links => [ { rel => 'child', href => "{id}" } ],
    },
    code => sub {
        my ($param) = @_;

        my $status_cfg = PVE::Cluster::cfs_read_file('status.cfg');

        # Build one summary hash per entry; an unset 'disable' flag is
        # reported as 0.
        my @servers = map {
            my $entry = $status_cfg->{ids}->{$_};
            +{
                id => $_,
                disable => $entry->{disable} // 0,
                type => $entry->{type},
                server => $entry->{server},
                port => $entry->{port},
            };
        } sort keys %{$status_cfg->{ids}};

        return \@servers;
    }});
# Read the stored configuration of a single metric server.
#
# Parameters: id - ID of the entry in 'status.cfg'.
# Returns the entry's configuration hash as stored in the parsed config.
# Dies if no entry with that ID exists. Requires Sys.Audit on '/'.
__PACKAGE__->register_method ({
    name => 'read',
    path => 'server/{id}',
    method => 'GET',
    description => "Read metric server configuration.",
    permissions => {
        check => ['perm', '/', ['Sys.Audit']],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            id => {
                type => 'string',
                format => 'pve-configid',
            },
        },
    },
    returns => { type => 'object' },
    code => sub {
        my ($param) = @_;

        my $id = $param->{id};
        my $entry = PVE::Cluster::cfs_read_file('status.cfg')->{ids}->{$id};

        # Guard clause: unknown IDs are reported to the caller.
        die "status server entry '$id' does not exist\n" if !defined($entry);

        return $entry;
    }});
# Create a new metric server entry in 'status.cfg'.
#
# The parameter schema comes from PVE::Status::Plugin->createSchema().
# While holding the lock on 'status.cfg' the handler:
#   1. refuses IDs that already exist,
#   2. validates the options with the plugin's check_config(),
#   3. calls the plugin's on_add_hook() with the sensitive parameters,
#   4. tests the connection; on failure on_delete_hook() is called to undo
#      step 3 and the original error is re-thrown,
#   5. writes the configuration.
# Returns nothing. Requires Sys.Modify on '/'.
__PACKAGE__->register_method ({
    name => 'create',
    path => 'server/{id}',
    protected => 1,
    method => 'POST',
    description => "Create a new external metric server config",
    permissions => {
        check => ['perm', '/', ['Sys.Modify']],
    },
    parameters => PVE::Status::Plugin->createSchema(),
    returns => { type => 'null' },
    code => sub {
        my ($param) = @_;

        my $type = extract_param($param, 'type');
        my $plugin = PVE::Status::Plugin->lookup($type);
        my $id = extract_param($param, 'id');

        # 'token' is split off from the regular options and only handed
        # to the plugin hook.
        my $sensitive_params = extract_sensitive_params($param, ['token'], []);

        my $add_entry = sub {
            my $cfg = PVE::Cluster::cfs_read_file('status.cfg');

            if ($cfg->{ids}->{$id}) {
                die "Metric server '$id' already exists\n";
            }

            my $opts = $plugin->check_config($id, $param, 1, 1);
            $cfg->{ids}->{$id} = $opts;

            $plugin->on_add_hook($id, $opts, $sensitive_params);

            eval {
                $plugin->test_connection($opts, $id);
            };
            if (my $test_err = $@) {
                # Undo the add hook; a failure of the cleanup itself is
                # only warned about so that the original error wins.
                eval { $plugin->on_delete_hook($id, $opts) };
                warn "$@\n" if $@;

                die $test_err;
            }

            PVE::Cluster::cfs_write_file('status.cfg', $cfg);
        };

        PVE::Cluster::cfs_lock_file('status.cfg', undef, $add_entry);
        die $@ if $@;

        return;
    }});
# Update an existing metric server entry in 'status.cfg'.
#
# The parameter schema comes from PVE::Status::Plugin->updateSchema().
# 'digest' guards against concurrent modification, 'delete' is a list of
# option names to remove. While holding the lock on 'status.cfg' the
# handler merges the validated options into the stored entry, removes the
# requested options, calls the plugin's on_update_hook(), tests the
# connection and finally writes the configuration.
# Dies if the entry does not exist, if an option to delete is unknown,
# required or fixed, or if an option is set and deleted at the same time.
# Returns nothing. Requires Sys.Modify on '/'.
__PACKAGE__->register_method ({
    name => 'update',
    protected => 1,
    path => 'server/{id}',
    method => 'PUT',
    description => "Update metric server configuration.",
    permissions => {
        check => ['perm', '/', ['Sys.Modify']],
    },
    parameters => PVE::Status::Plugin->updateSchema(),
    returns => { type => 'null' },
    code => sub {
        my ($param) = @_;

        my $id = extract_param($param, 'id');
        my $digest = extract_param($param, 'digest');
        my $delete = extract_param($param, 'delete');
        $delete = [PVE::Tools::split_list($delete)] if $delete;

        my $sensitive_params = extract_sensitive_params($param, ['token'], $delete);

        my $apply_update = sub {
            my $cfg = PVE::Cluster::cfs_read_file('status.cfg');

            PVE::SectionConfig::assert_if_modified($cfg, $digest);

            my $data = $cfg->{ids}->{$id};
            die "no such server '$id'\n" if !$data;

            my $plugin = PVE::Status::Plugin->lookup($data->{type});
            my $opts = $plugin->check_config($id, $param, 0, 1);

            # Overlay the validated new values onto the stored entry.
            $data->{$_} = $opts->{$_} for keys %$opts;

            if ($delete) {
                my $options = $plugin->private()->{options}->{$data->{type}};
                for my $key (@$delete) {
                    my $d = $options->{$key} || die "no such option '$key'\n";
                    die "unable to delete required option '$key'\n" if !$d->{optional};
                    die "unable to delete fixed option '$key'\n" if $d->{fixed};
                    die "cannot set and delete property '$key' at the same time!\n"
                        if defined($opts->{$key});

                    delete $data->{$key};
                }
            }

            $plugin->on_update_hook($id, $data, $sensitive_params);

            # A failing connection test aborts before anything is written.
            $plugin->test_connection($data, $id);

            PVE::Cluster::cfs_write_file('status.cfg', $cfg);
        };

        PVE::Cluster::cfs_lock_file('status.cfg', undef, $apply_update);
        die $@ if $@;

        return;
    }});
# Remove a metric server entry from 'status.cfg'.
#
# Parameters: id - ID of the entry to remove.
# While holding the lock on 'status.cfg' the handler looks up the entry,
# calls the plugin's on_delete_hook(), removes the entry and writes the
# configuration.
# Dies with "no such server '<id>'" if the entry does not exist.
# Returns nothing. Requires Sys.Modify on '/'.
__PACKAGE__->register_method ({
    name => 'delete',
    protected => 1,
    path => 'server/{id}',
    method => 'DELETE',
    description => "Remove Metric server.",
    permissions => {
        check => ['perm', '/', ['Sys.Modify']],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            id => {
                type => 'string',
                format => 'pve-configid',
            },
        }
    },
    returns => { type => 'null' },
    code => sub {
        my ($param) = @_;

        PVE::Cluster::cfs_lock_file('status.cfg', undef, sub {
            my $cfg = PVE::Cluster::cfs_read_file('status.cfg');

            my $id = $param->{id};
            my $plugin_cfg = $cfg->{ids}->{$id};

            # Fail early with a clear message instead of passing an
            # undefined plugin type to the plugin lookup below. Same
            # wording as the update handler.
            die "no such server '$id'\n" if !$plugin_cfg;

            my $plugin = PVE::Status::Plugin->lookup($plugin_cfg->{type});

            $plugin->on_delete_hook($id, $plugin_cfg);

            delete $cfg->{ids}->{$id};
            PVE::Cluster::cfs_write_file('status.cfg', $cfg);
        });
        die $@ if $@;

        return;
    }});
# Export node, storage and guest metrics as a flat list of records.
#
# Parameters:
#   local-only - only return metrics of the node handling the request
#   start-time - only return metrics with a timestamp > start-time
#   history    - also return historic values (all available ones unless
#                start-time is given)
#
# Unless 'local-only' is set, the handler calls this same endpoint on
# every other online cluster member (always with 'local-only' set, to
# avoid request recursion) and merges the results. Nodes that cannot be
# queried are skipped with a syslog warning.
#
# Returns { data => [...] } with the records sorted by timestamp.
# Requires Sys.Audit on '/'.
__PACKAGE__->register_method ({
    name => 'export',
    path => 'export',
    method => 'GET',
    protected => 1,
    description => "Retrieve metrics of the cluster.",
    permissions => {
        check => ['perm', '/', ['Sys.Audit']],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            'local-only' => {
                type => 'boolean',
                description =>
                    'Only return metrics for the current node instead of the whole cluster',
                optional => 1,
                default => 0,
            },
            'start-time' => {
                type => 'integer',
                description => 'Only include metrics with a timestamp > start-time.',
                optional => 1,
                default => 0,
            },
            'history' => {
                type => 'boolean',
                description => 'Also return historic values.'
                    . ' Returns full available metric history unless `start-time` is also set',
                optional => 1,
                default => 0,
            },
        },
    },
    returns => {
        type => 'object',
        additionalProperties => 0,
        properties => {
            data => {
                type => 'array',
                description => 'Array of system metrics. Metrics are sorted by their timestamp.',
                items => {
                    type => 'object',
                    additionalProperties => 0,
                    properties => {
                        timestamp => {
                            type => 'integer',
                            description => 'Time at which this metric was observed',
                        },
                        id => {
                            type => 'string',
                            description => "Unique identifier for this metric object,"
                                . " for instance 'node/<nodename>' or"
                                . " 'qemu/<vmid>'."
                        },
                        metric => {
                            type => 'string',
                            description => "Name of the metric.",
                        },
                        value => {
                            type => 'number',
                            description => 'Metric value.',
                        },
                        type => {
                            type => 'string',
                            description => 'Type of the metric.',
                            enum => [qw(gauge counter derive)],
                        }
                    }
                },
            },
        }
    },
    code => sub {
        my ($param) = @_;

        my $local_only = $param->{'local-only'} // 0;
        my $start = $param->{'start-time'};
        my $history = $param->{'history'} // 0;

        my $now = time();

        # Number of historic generations to request from the local metric
        # cache; 0 means "only the most recent values".
        my $generations = 0;
        if ($history) {
            my $max_generations = PVE::PullMetric::max_generations();

            if (defined($start)) {
                # Assuming update loop time of pvestatd of 10 seconds.
                $generations = int(($now - $start) / 10);

                # Keep the count within 0..max_generations: a start-time in
                # the future would otherwise yield a negative count, and a
                # very old one an arbitrarily large count. The timestamp
                # filter below still decides which records are returned.
                $generations = 0 if $generations < 0;
                $generations = $max_generations if $generations > $max_generations;
            } else {
                $generations = $max_generations;
            }
        }

        my @metrics = @{PVE::PullMetric::get_local_metrics($generations)};

        if (defined($start)) {
            @metrics = grep { $_->{timestamp} > $start } @metrics;
        }

        my $nodename = PVE::INotify::nodename();

        # Fan out to cluster members
        # Do NOT remove this check
        if (!$local_only) {
            my $members = PVE::Cluster::get_members();

            my $rpcenv = PVE::RPCEnvironment::get();
            my $authuser = $rpcenv->get_user();

            my ($user, undef) = PVE::AccessControl::split_tokenid($authuser, 1);

            # Theoretically, we might now bypass token privilege separation, since
            # we use the regular user instead of the token, but
            # since we already passed the permission check for this handler,
            # this should be fine.
            my $ticket = PVE::AccessControl::assemble_ticket($user || $authuser);

            for my $name (keys %$members) {
                # Skip own node, for that one we already have the metrics
                next if $name eq $nodename;

                next if !$members->{$name}->{online};

                my $status = eval {
                    my $fingerprint = PVE::Cluster::get_node_fingerprint($name);
                    my $ip = scalar(PVE::Cluster::remote_node_ip($name));

                    my $conn_args = {
                        protocol => 'https',
                        host => $ip,
                        port => 8006,
                        ticket => $ticket,
                        timeout => 5,
                    };

                    $conn_args->{cached_fingerprints} = { $fingerprint => 1 };
                    my $api_client = PVE::APIClient::LWP->new(%$conn_args);

                    my $params = {
                        # Do NOT remove 'local-only' - potential for request recursion!
                        'local-only' => 1,
                        history => $history,
                    };
                    $params->{'start-time'} = $start if defined($start);

                    $api_client->get('/cluster/metrics/export', $params);
                };

                # Capture $@ right away, and pass the values as format
                # arguments so that '%' sequences in an error message are
                # not interpreted by syslog.
                if (my $err = $@) {
                    syslog('warning', "could not fetch metrics from %s: %s", $name, $err);
                } elsif (ref($status) eq 'HASH' && ref($status->{data}) eq 'ARRAY') {
                    push @metrics, $status->{data}->@*;
                } else {
                    # A malformed reply from one node must not abort the
                    # request for the whole cluster.
                    syslog('warning', "unexpected metrics response from %s", $name);
                }
            }
        }

        my @sorted = sort { $a->{timestamp} <=> $b->{timestamp} } @metrics;

        return {
            data => \@sorted,
        };
    },
});
1;