api: add /status/metrics API

This one is modelled exactly after the one in PVE (where it
is available under /cluster/metrics/export).

The returned data format is quite simple: an array of metric
records, each consisting of a value, a metric name, an id
identifying the object it belongs to (e.g. datastore/foo, host),
a timestamp and a type ('gauge', 'derive', ...). The latter
property makes the format self-describing and helps the metric
collector choose a suitable representation for storing the
metric data.

[
    ...
    {
	"metric": "cpu_avg1",
	"value": 0.12,
	"timestamp": 170053205,
	"id": "host",
	"type": "gauge"
    },
    ...
]
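
For illustration, a consumer of this endpoint could map each record onto
a small struct along these lines (a sketch using serde; the field names
simply mirror the JSON keys above and are not necessarily identical to
the MetricDataPoint type defined in pbs_api_types):

use serde::Deserialize;

/// Illustrative consumer-side record matching the JSON shape above.
#[derive(Debug, Deserialize)]
struct MetricRecord {
    /// Metric name, e.g. "cpu_avg1".
    metric: String,
    /// Sampled value.
    value: f64,
    /// Unix epoch timestamp of the sample.
    timestamp: i64,
    /// Object the metric belongs to, e.g. "host" or "datastore/foo".
    id: String,
    /// Metric type: "gauge", "derive", ...
    #[serde(rename = "type")]
    ty: String,
}

// Assuming serde_json is available, one record could then be parsed with:
// let point: MetricRecord = serde_json::from_str(json)?;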

In terms of permissions, the new endpoint requires Sys.Audit
on /system/status for metrics of the 'host' object,
and Datastore.Audit on /datastore/{store} for 'datastore/{store}'
metric objects.

Via the 'history' and 'start-time' parameters one can query up to
the last 30 minutes of metric history. If these parameters are not
provided, only the most recent metric generation is returned.

Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>

Lukas Wagner, 2024-10-15 10:46:36 +02:00, committed by Wolfgang Bumiller
commit c804763bdf (parent da12adb1f9)
4 changed files with 131 additions and 6 deletions

@@ -0,0 +1,73 @@
use anyhow::Error;

use pbs_api_types::{Authid, MetricDataPoint, Metrics, PRIV_DATASTORE_AUDIT, PRIV_SYS_AUDIT};
use pbs_config::CachedUserInfo;
use proxmox_router::{Permission, Router, RpcEnvironment};
use proxmox_schema::api;

use crate::server::metric_collection::pull_metrics;

pub const ROUTER: Router = Router::new().get(&API_METHOD_GET_METRICS);

#[api(
    input: {
        properties: {
            "start-time": {
                optional: true,
                default: 0,
                description: "Only return values with a timestamp > start-time. Only has an effect if 'history' is also set",
            },
            "history": {
                optional: true,
                default: false,
                description: "Include historic values (last 30 minutes)",
            }
        },
    },
    access: {
        description: "Users need Sys.Audit on /system/status for host metrics and Datastore.Audit on /datastore/{store} for datastore metrics",
        permission: &Permission::Anybody,
    },
)]
/// Return backup server metrics.
pub fn get_metrics(
    start_time: i64,
    history: bool,
    rpcenv: &mut dyn RpcEnvironment,
) -> Result<Metrics, Error> {
    let metrics = if history {
        pull_metrics::get_all_metrics(start_time)?
    } else {
        pull_metrics::get_most_recent_metrics()?
    };

    let auth_id: Authid = rpcenv.get_auth_id().unwrap().parse()?;
    let user_info = CachedUserInfo::new()?;

    let filter_by_privs = |point: &MetricDataPoint| {
        let elements: Vec<&str> = point.id.as_str().split('/').collect();

        match elements.as_slice() {
            ["host"] => {
                let user_privs =
                    CachedUserInfo::lookup_privs(&user_info, &auth_id, &["system", "status"]);
                (user_privs & PRIV_SYS_AUDIT) != 0
            }
            ["datastore", datastore_id] => {
                let user_privs = CachedUserInfo::lookup_privs(
                    &user_info,
                    &auth_id,
                    &["datastore", datastore_id],
                );
                (user_privs & PRIV_DATASTORE_AUDIT) != 0
            }
            _ => {
                log::error!("invalid metric object id: {}", point.id);
                false
            }
        }
    };

    Ok(Metrics {
        data: metrics.into_iter().filter(filter_by_privs).collect(),
    })
}

@@ -7,6 +7,7 @@ use proxmox_router::list_subdirs_api_method;
 use proxmox_router::{ApiMethod, Permission, Router, RpcEnvironment, SubdirMap};
 use proxmox_rrd_api_types::{RrdMode, RrdTimeframe};
 use proxmox_schema::api;
+use proxmox_sortable_macro::sortable;
 
 use pbs_api_types::{
     Authid, DataStoreStatusListItem, Operation, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_BACKUP,
@@ -20,6 +21,8 @@ use crate::tools::statistics::linear_regression;
 
 use crate::backup::can_access_any_namespace;
 
+pub mod metrics;
+
 #[api(
     returns: {
         description: "Lists the Status of the Datastores.",
@@ -137,10 +140,14 @@ pub async fn datastore_status(
     Ok(list)
 }
 
-const SUBDIRS: SubdirMap = &[(
-    "datastore-usage",
-    &Router::new().get(&API_METHOD_DATASTORE_STATUS),
-)];
+#[sortable]
+const SUBDIRS: SubdirMap = &sorted!([
+    (
+        "datastore-usage",
+        &Router::new().get(&API_METHOD_DATASTORE_STATUS),
+    ),
+    ("metrics", &metrics::ROUTER),
+]);
 
 pub const ROUTER: Router = Router::new()
     .get(&list_subdirs_api_method!(SUBDIRS))

@@ -17,8 +17,8 @@ use proxmox_sys::{
 use crate::tools::disks::{zfs_dataset_stats, BlockDevStat, DiskManage};
 
 mod metric_server;
-mod pull_metrics;
-pub mod rrd;
+pub(crate) mod pull_metrics;
+pub(crate) mod rrd;
 
 const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(10);

@@ -39,6 +39,51 @@ pub(super) fn init() -> Result<(), Error> {
     Ok(())
 }
 
+/// Return most recent metrics
+///
+/// If the metric collection loop has not produced any metrics yet, an empty
+/// `Vec` is returned. Returns an error if the cache could not be accessed.
+pub fn get_most_recent_metrics() -> Result<Vec<MetricDataPoint>, Error> {
+    let cached_datapoints: Option<MetricDataPoints> = get_cache()?.get()?;
+    let mut points = cached_datapoints.map(|r| r.datapoints).unwrap_or_default();
+
+    points.sort_unstable_by_key(|p| p.timestamp);
+
+    Ok(points)
+}
+
+/// Return all cached metrics with a `timestamp > start_time`
+///
+/// If the metric collection loop has not produced any metrics yet, an empty
+/// `Vec` is returned. Returns an error if the cache could not be accessed.
+pub fn get_all_metrics(start_time: i64) -> Result<Vec<MetricDataPoint>, Error> {
+    let now = proxmox_time::epoch_i64();
+
+    let delta = now - start_time;
+    if delta < 0 {
+        // start-time in the future, no metrics for you
+        return Ok(Vec::new());
+    }
+
+    let generations = delta / (METRIC_COLLECTION_INTERVAL.as_secs() as i64);
+    let generations = generations.clamp(0, STORED_METRIC_GENERATIONS as i64);
+
+    let cached_datapoints: Vec<MetricDataPoints> = get_cache()?.get_last(generations as u32)?;
+
+    let mut points = Vec::new();
+
+    for gen in cached_datapoints {
+        if gen.timestamp > start_time {
+            points.extend(gen.datapoints);
+        }
+    }
+
+    points.sort_unstable_by_key(|p| p.timestamp);
+
+    Ok(points)
+}
+
 /// Convert `DiskStat` `HostStat` into a universal metric data point and cache
 /// them for a later retrieval.
 pub(super) fn update_metrics(
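
For reference, the generation arithmetic in get_all_metrics() can be
illustrated in isolation. This is a sketch rather than code from the
patch: the 10-second interval matches METRIC_COLLECTION_INTERVAL above,
while the value 180 for the stored-generation limit is an assumption
that would correspond to the 30-minute history window mentioned in the
commit message.

/// Standalone sketch of the generation calculation used by get_all_metrics().
/// Assumes a 10-second collection interval and 180 stored generations
/// (an assumption; it would cover the 30-minute history window).
fn generations_to_fetch(now: i64, start_time: i64) -> u32 {
    const INTERVAL_SECS: i64 = 10;
    const STORED_GENERATIONS: i64 = 180;

    let delta = now - start_time;
    if delta < 0 {
        // start-time lies in the future: nothing to fetch
        return 0;
    }

    (delta / INTERVAL_SECS).clamp(0, STORED_GENERATIONS) as u32
}

// Example: a start-time 5 minutes in the past yields 300 / 10 = 30 generations,
// while anything older than 30 minutes is clamped to the full 180 generations.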