mirror of
https://git.proxmox.com/git/mirror_corosync
synced 2025-08-13 12:52:31 +00:00
stats: Add stats for scheduler misses
This patch add a stats.schedmiss.* set of entries that are a record of the last 10 times corosync was not scheduled in time. These entries are keypt in reverse order (so stats.schedmiss.0.* is always the latest one kept) and the values, including the timestamp, are in milliseconds. It's also possible to use a cmap tracker to follow these events, which might be useful. Signed-off-by: Christine Caulfield <ccaulfie@redhat.com> Reviewed-by: Jan Friesse <jfriesse@redhat.com>
This commit is contained in:
parent
8ce65bf951
commit
48b6894ef4
@ -835,6 +835,8 @@ static void timer_function_scheduler_timeout (void *data)
|
||||
log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms "
|
||||
"(threshold is %0.4f ms). Consider token timeout increase.",
|
||||
(float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC);
|
||||
|
||||
stats_add_schedmiss_event(tv_current / 1000, (float)tv_diff / QB_TIME_NS_IN_MSEC);
|
||||
}
|
||||
|
||||
/*
|
||||
|
113
exec/stats.c
113
exec/stats.c
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2017 Red Hat, Inc.
|
||||
* Copyright (c) 2017-2020 Red Hat, Inc.
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
@ -60,9 +60,20 @@ LOGSYS_DECLARE_SUBSYS ("STATS");
|
||||
|
||||
static qb_map_t *stats_map;
|
||||
|
||||
/* Structure of an element in the schedmiss array */
|
||||
struct schedmiss_entry {
|
||||
uint64_t timestamp;
|
||||
float delay;
|
||||
};
|
||||
#define MAX_SCHEDMISS_EVENTS 10
|
||||
static struct schedmiss_entry schedmiss_event[MAX_SCHEDMISS_EVENTS];
|
||||
static unsigned int highest_schedmiss_event;
|
||||
|
||||
#define SCHEDMISS_PREFIX "stats.schedmiss"
|
||||
|
||||
/* Convert iterator number to text and a stats pointer */
|
||||
struct cs_stats_conv {
|
||||
enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG} type;
|
||||
enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG, STAT_SCHEDMISS} type;
|
||||
const char *name;
|
||||
const size_t offset;
|
||||
const icmap_value_types_t value_type;
|
||||
@ -190,6 +201,10 @@ struct cs_stats_conv cs_ipcs_global_stats[] = {
|
||||
{ STAT_IPCSG, "global.active", offsetof(struct ipcs_global_stats, active), ICMAP_VALUETYPE_UINT64},
|
||||
{ STAT_IPCSG, "global.closed", offsetof(struct ipcs_global_stats, closed), ICMAP_VALUETYPE_UINT64},
|
||||
};
|
||||
struct cs_stats_conv cs_schedmiss_stats[] = {
|
||||
{ STAT_SCHEDMISS, "timestamp", offsetof(struct schedmiss_entry, timestamp), ICMAP_VALUETYPE_UINT64},
|
||||
{ STAT_SCHEDMISS, "delay", offsetof(struct schedmiss_entry, delay), ICMAP_VALUETYPE_FLOAT},
|
||||
};
|
||||
|
||||
#define NUM_PG_STATS (sizeof(cs_pg_stats) / sizeof(struct cs_stats_conv))
|
||||
#define NUM_SRP_STATS (sizeof(cs_srp_stats) / sizeof(struct cs_stats_conv))
|
||||
@ -286,7 +301,7 @@ cs_error_t stats_map_init(const struct corosync_api_v1 *corosync_api)
|
||||
stats_add_entry(param, &cs_ipcs_global_stats[i]);
|
||||
}
|
||||
|
||||
/* KNET and IPCS stats are added when appropriate */
|
||||
/* KNET, IPCS & SCHEDMISS stats are added when appropriate */
|
||||
return CS_OK;
|
||||
}
|
||||
|
||||
@ -307,6 +322,8 @@ cs_error_t stats_map_get(const char *key_name,
|
||||
int link_no;
|
||||
int service_id;
|
||||
uint32_t pid;
|
||||
unsigned int sm_event;
|
||||
char *sm_type;
|
||||
void *conn_ptr;
|
||||
|
||||
item = qb_map_get(stats_map, key_name);
|
||||
@ -363,17 +380,85 @@ cs_error_t stats_map_get(const char *key_name,
|
||||
cs_ipcs_get_global_stats(&ipcs_global_stats);
|
||||
stats_map_set_value(statinfo, &ipcs_global_stats, value, value_len, type);
|
||||
break;
|
||||
case STAT_SCHEDMISS:
|
||||
if (sscanf(key_name, SCHEDMISS_PREFIX ".%d", &sm_event) != 1) {
|
||||
return CS_ERR_NOT_EXIST;
|
||||
}
|
||||
|
||||
sm_type = strrchr(key_name, '.');
|
||||
if (sm_type == NULL) {
|
||||
return CS_ERR_NOT_EXIST;
|
||||
}
|
||||
sm_type++;
|
||||
|
||||
if (strcmp(sm_type, "timestamp") == 0) {
|
||||
memcpy(value, &schedmiss_event[sm_event].timestamp, sizeof(uint64_t));
|
||||
*value_len = sizeof(uint64_t);
|
||||
*type = ICMAP_VALUETYPE_UINT64;
|
||||
}
|
||||
if (strcmp(sm_type, "delay") == 0) {
|
||||
memcpy(value, &schedmiss_event[sm_event].delay, sizeof(float));
|
||||
*value_len = sizeof(float);
|
||||
*type = ICMAP_VALUETYPE_FLOAT;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return CS_ERR_LIBRARY;
|
||||
}
|
||||
return CS_OK;
|
||||
}
|
||||
|
||||
#define STATS_CLEAR "stats.clear."
|
||||
#define STATS_CLEAR_KNET "stats.clear.knet"
|
||||
#define STATS_CLEAR_IPC "stats.clear.ipc"
|
||||
#define STATS_CLEAR_TOTEM "stats.clear.totem"
|
||||
#define STATS_CLEAR_ALL "stats.clear.all"
|
||||
static void schedmiss_clear_stats(void)
|
||||
{
|
||||
int i;
|
||||
char param[ICMAP_KEYNAME_MAXLEN];
|
||||
|
||||
for (i=0; i<MAX_SCHEDMISS_EVENTS; i++) {
|
||||
if (i < highest_schedmiss_event) {
|
||||
sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", i);
|
||||
stats_rm_entry(param);
|
||||
sprintf(param, SCHEDMISS_PREFIX ".%i.delay", i);
|
||||
stats_rm_entry(param);
|
||||
}
|
||||
schedmiss_event[i].timestamp = (uint64_t)0LL;
|
||||
schedmiss_event[i].delay = 0.0f;
|
||||
}
|
||||
highest_schedmiss_event = 0;
|
||||
}
|
||||
|
||||
/* Called from main.c */
|
||||
void stats_add_schedmiss_event(uint64_t timestamp, float delay)
|
||||
{
|
||||
char param[ICMAP_KEYNAME_MAXLEN];
|
||||
int i;
|
||||
|
||||
/* Move 'em all up */
|
||||
for (i=MAX_SCHEDMISS_EVENTS-2; i>=0; i--) {
|
||||
schedmiss_event[i+1].timestamp = schedmiss_event[i].timestamp;
|
||||
schedmiss_event[i+1].delay = schedmiss_event[i].delay;
|
||||
}
|
||||
|
||||
/* New entries are always at the front */
|
||||
schedmiss_event[0].timestamp = timestamp;
|
||||
schedmiss_event[0].delay = delay;
|
||||
|
||||
/* If we've not run off the end then add an entry in the trie for the new 'end' one */
|
||||
if (highest_schedmiss_event < MAX_SCHEDMISS_EVENTS) {
|
||||
sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", highest_schedmiss_event);
|
||||
stats_add_entry(param, &cs_schedmiss_stats[0]);
|
||||
sprintf(param, SCHEDMISS_PREFIX ".%i.delay", highest_schedmiss_event);
|
||||
stats_add_entry(param, &cs_schedmiss_stats[1]);
|
||||
highest_schedmiss_event++;
|
||||
}
|
||||
/* Notifications get sent by the stats_updater */
|
||||
}
|
||||
|
||||
#define STATS_CLEAR "stats.clear."
|
||||
#define STATS_CLEAR_KNET "stats.clear.knet"
|
||||
#define STATS_CLEAR_IPC "stats.clear.ipc"
|
||||
#define STATS_CLEAR_TOTEM "stats.clear.totem"
|
||||
#define STATS_CLEAR_ALL "stats.clear.all"
|
||||
#define STATS_CLEAR_SCHEDMISS "stats.clear.schedmiss"
|
||||
|
||||
cs_error_t stats_map_set(const char *key_name,
|
||||
const void *value,
|
||||
@ -394,9 +479,14 @@ cs_error_t stats_map_set(const char *key_name,
|
||||
totempg_stats_clear(TOTEMPG_STATS_CLEAR_TOTEM);
|
||||
cleared = 1;
|
||||
}
|
||||
if (strncmp(key_name, STATS_CLEAR_SCHEDMISS, strlen(STATS_CLEAR_SCHEDMISS)) == 0) {
|
||||
schedmiss_clear_stats();
|
||||
cleared = 1;
|
||||
}
|
||||
if (strncmp(key_name, STATS_CLEAR_ALL, strlen(STATS_CLEAR_ALL)) == 0) {
|
||||
totempg_stats_clear(TOTEMPG_STATS_CLEAR_TRANSPORT | TOTEMPG_STATS_CLEAR_TOTEM);
|
||||
cs_ipcs_clear_stats();
|
||||
schedmiss_clear_stats();
|
||||
cleared = 1;
|
||||
}
|
||||
if (!cleared) {
|
||||
@ -500,6 +590,11 @@ static void stats_map_notify_fn(uint32_t event, char *key, void *old_value, void
|
||||
return ;
|
||||
}
|
||||
|
||||
/* Ignore schedmiss trackers as the values are read from the circular buffer */
|
||||
if (strncmp(key, SCHEDMISS_PREFIX, strlen(SCHEDMISS_PREFIX)) == 0 ) {
|
||||
return ;
|
||||
}
|
||||
|
||||
new_val.data = new_value;
|
||||
if (stats_map_get(key,
|
||||
&new_value,
|
||||
@ -556,7 +651,7 @@ cs_error_t stats_map_track_add(const char *key_name,
|
||||
}
|
||||
/* Get initial value */
|
||||
if (stats_map_get(tracker->key_name,
|
||||
&tracker->old_value, &value_len, &type) == CS_OK) {
|
||||
&tracker->old_value, &value_len, &type) != CS_OK) {
|
||||
tracker->old_value = 0ULL;
|
||||
}
|
||||
} else {
|
||||
|
@ -69,3 +69,5 @@ void stats_trigger_trackers(void);
|
||||
void stats_ipcs_add_connection(int service_id, uint32_t pid, void *ptr);
|
||||
void stats_ipcs_del_connection(int service_id, uint32_t pid, void *ptr);
|
||||
cs_error_t cs_ipcs_get_conn_stats(int service_id, uint32_t pid, void *conn_ptr, struct ipcs_conn_stats *ipcs_stats);
|
||||
|
||||
void stats_add_schedmiss_event(uint64_t, float delay);
|
||||
|
@ -1,5 +1,5 @@
|
||||
.\"/*
|
||||
.\" * Copyright (c) 2012-2018 Red Hat, Inc.
|
||||
.\" * Copyright (c) 2012-2020 Red Hat, Inc.
|
||||
.\" *
|
||||
.\" * All rights reserved.
|
||||
.\" *
|
||||
@ -357,6 +357,27 @@ contains the total number of interrupted sends.
|
||||
.B service_id
|
||||
contains the ID of service which the IPC is connected to.
|
||||
|
||||
|
||||
.TP
|
||||
stats.schedmiss.<n>.*
|
||||
If corosync is not scheduled after the required period of time it will
|
||||
log this event and also write an entry to the stats cmap under this key.
|
||||
There can be up to 10 entries (0..9) in here, when an 11th event happens
|
||||
the earliest will be removed.
|
||||
|
||||
These events will always be in reverse order, so stats.schedmiss.0.* will
|
||||
always be the latest event kept and 9 the oldest. If you want to listen
|
||||
for notifications then you are recommended to listen for changes
|
||||
to stats.schedmiss.0.timestamp or stats.schedmiss.0.delay.
|
||||
|
||||
.B timestamp
|
||||
The time of the event in ms since the Epoch (ie time_t * 1000 but with
|
||||
valid milliseconds).
|
||||
|
||||
.B delay
|
||||
The time that corosync was paused (in ms, float value).
|
||||
|
||||
|
||||
.TP
|
||||
stats.clear.*
|
||||
These are write-only keys used to clear the stats for various subsystems
|
||||
@ -370,6 +391,9 @@ Clears the knet stats
|
||||
.B ipc
|
||||
Clears the ipc stats
|
||||
|
||||
.B schedmiss
|
||||
Clears the schedmiss stats
|
||||
|
||||
.B all
|
||||
Clears all of the above stats
|
||||
|
||||
|
@ -115,7 +115,7 @@ static int print_help(void)
|
||||
printf(" about the networking and IPC traffic in some detail.\n");
|
||||
printf("\n");
|
||||
printf("Clear stats:\n");
|
||||
printf(" corosync-cmapctl -C [knet|ipc|totem|all]\n");
|
||||
printf(" corosync-cmapctl -C [knet|ipc|totem|schedmiss|all]\n");
|
||||
printf(" The 'stats' map is implied\n");
|
||||
printf("\n");
|
||||
printf("Load settings from a file:\n");
|
||||
@ -849,6 +849,7 @@ int main(int argc, char *argv[])
|
||||
if (strcmp(optarg, "knet") == 0 ||
|
||||
strcmp(optarg, "totem") == 0 ||
|
||||
strcmp(optarg, "ipc") == 0 ||
|
||||
strcmp(optarg, "schedmiss") == 0 ||
|
||||
strcmp(optarg, "all") == 0) {
|
||||
action = ACTION_CLEARSTATS;
|
||||
clear_opt = optarg;
|
||||
@ -857,7 +858,7 @@ int main(int argc, char *argv[])
|
||||
map = CMAP_MAP_STATS;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc' or 'all'\n");
|
||||
fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc', 'schedmiss' or 'all'\n");
|
||||
return (EXIT_FAILURE);
|
||||
}
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user