stats: Add stats for scheduler misses

This patch add a stats.schedmiss.* set of entries that
are a record of the last 10 times corosync was not scheduled
in time.

These entries are keypt in reverse order (so stats.schedmiss.0.* is
always the latest one kept) and the values, including the timestamp,
are in milliseconds.

It's also possible to use a cmap tracker to follow these events, which
might be useful.

Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
This commit is contained in:
Christine Caulfield 2020-01-17 14:22:16 +00:00 committed by Jan Friesse
parent 8ce65bf951
commit 48b6894ef4
5 changed files with 136 additions and 12 deletions

View File

@ -835,6 +835,8 @@ static void timer_function_scheduler_timeout (void *data)
log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms "
"(threshold is %0.4f ms). Consider token timeout increase.",
(float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC);
stats_add_schedmiss_event(tv_current / 1000, (float)tv_diff / QB_TIME_NS_IN_MSEC);
}
/*

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017 Red Hat, Inc.
* Copyright (c) 2017-2020 Red Hat, Inc.
*
* All rights reserved.
*
@ -60,9 +60,20 @@ LOGSYS_DECLARE_SUBSYS ("STATS");
static qb_map_t *stats_map;
/* Structure of an element in the schedmiss array */
struct schedmiss_entry {
uint64_t timestamp;
float delay;
};
#define MAX_SCHEDMISS_EVENTS 10
static struct schedmiss_entry schedmiss_event[MAX_SCHEDMISS_EVENTS];
static unsigned int highest_schedmiss_event;
#define SCHEDMISS_PREFIX "stats.schedmiss"
/* Convert iterator number to text and a stats pointer */
struct cs_stats_conv {
enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG} type;
enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG, STAT_SCHEDMISS} type;
const char *name;
const size_t offset;
const icmap_value_types_t value_type;
@ -190,6 +201,10 @@ struct cs_stats_conv cs_ipcs_global_stats[] = {
{ STAT_IPCSG, "global.active", offsetof(struct ipcs_global_stats, active), ICMAP_VALUETYPE_UINT64},
{ STAT_IPCSG, "global.closed", offsetof(struct ipcs_global_stats, closed), ICMAP_VALUETYPE_UINT64},
};
struct cs_stats_conv cs_schedmiss_stats[] = {
{ STAT_SCHEDMISS, "timestamp", offsetof(struct schedmiss_entry, timestamp), ICMAP_VALUETYPE_UINT64},
{ STAT_SCHEDMISS, "delay", offsetof(struct schedmiss_entry, delay), ICMAP_VALUETYPE_FLOAT},
};
#define NUM_PG_STATS (sizeof(cs_pg_stats) / sizeof(struct cs_stats_conv))
#define NUM_SRP_STATS (sizeof(cs_srp_stats) / sizeof(struct cs_stats_conv))
@ -286,7 +301,7 @@ cs_error_t stats_map_init(const struct corosync_api_v1 *corosync_api)
stats_add_entry(param, &cs_ipcs_global_stats[i]);
}
/* KNET and IPCS stats are added when appropriate */
/* KNET, IPCS & SCHEDMISS stats are added when appropriate */
return CS_OK;
}
@ -307,6 +322,8 @@ cs_error_t stats_map_get(const char *key_name,
int link_no;
int service_id;
uint32_t pid;
unsigned int sm_event;
char *sm_type;
void *conn_ptr;
item = qb_map_get(stats_map, key_name);
@ -363,17 +380,85 @@ cs_error_t stats_map_get(const char *key_name,
cs_ipcs_get_global_stats(&ipcs_global_stats);
stats_map_set_value(statinfo, &ipcs_global_stats, value, value_len, type);
break;
case STAT_SCHEDMISS:
if (sscanf(key_name, SCHEDMISS_PREFIX ".%d", &sm_event) != 1) {
return CS_ERR_NOT_EXIST;
}
sm_type = strrchr(key_name, '.');
if (sm_type == NULL) {
return CS_ERR_NOT_EXIST;
}
sm_type++;
if (strcmp(sm_type, "timestamp") == 0) {
memcpy(value, &schedmiss_event[sm_event].timestamp, sizeof(uint64_t));
*value_len = sizeof(uint64_t);
*type = ICMAP_VALUETYPE_UINT64;
}
if (strcmp(sm_type, "delay") == 0) {
memcpy(value, &schedmiss_event[sm_event].delay, sizeof(float));
*value_len = sizeof(float);
*type = ICMAP_VALUETYPE_FLOAT;
}
break;
default:
return CS_ERR_LIBRARY;
}
return CS_OK;
}
#define STATS_CLEAR "stats.clear."
#define STATS_CLEAR_KNET "stats.clear.knet"
#define STATS_CLEAR_IPC "stats.clear.ipc"
#define STATS_CLEAR_TOTEM "stats.clear.totem"
#define STATS_CLEAR_ALL "stats.clear.all"
static void schedmiss_clear_stats(void)
{
int i;
char param[ICMAP_KEYNAME_MAXLEN];
for (i=0; i<MAX_SCHEDMISS_EVENTS; i++) {
if (i < highest_schedmiss_event) {
sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", i);
stats_rm_entry(param);
sprintf(param, SCHEDMISS_PREFIX ".%i.delay", i);
stats_rm_entry(param);
}
schedmiss_event[i].timestamp = (uint64_t)0LL;
schedmiss_event[i].delay = 0.0f;
}
highest_schedmiss_event = 0;
}
/* Called from main.c */
void stats_add_schedmiss_event(uint64_t timestamp, float delay)
{
char param[ICMAP_KEYNAME_MAXLEN];
int i;
/* Move 'em all up */
for (i=MAX_SCHEDMISS_EVENTS-2; i>=0; i--) {
schedmiss_event[i+1].timestamp = schedmiss_event[i].timestamp;
schedmiss_event[i+1].delay = schedmiss_event[i].delay;
}
/* New entries are always at the front */
schedmiss_event[0].timestamp = timestamp;
schedmiss_event[0].delay = delay;
/* If we've not run off the end then add an entry in the trie for the new 'end' one */
if (highest_schedmiss_event < MAX_SCHEDMISS_EVENTS) {
sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", highest_schedmiss_event);
stats_add_entry(param, &cs_schedmiss_stats[0]);
sprintf(param, SCHEDMISS_PREFIX ".%i.delay", highest_schedmiss_event);
stats_add_entry(param, &cs_schedmiss_stats[1]);
highest_schedmiss_event++;
}
/* Notifications get sent by the stats_updater */
}
#define STATS_CLEAR "stats.clear."
#define STATS_CLEAR_KNET "stats.clear.knet"
#define STATS_CLEAR_IPC "stats.clear.ipc"
#define STATS_CLEAR_TOTEM "stats.clear.totem"
#define STATS_CLEAR_ALL "stats.clear.all"
#define STATS_CLEAR_SCHEDMISS "stats.clear.schedmiss"
cs_error_t stats_map_set(const char *key_name,
const void *value,
@ -394,9 +479,14 @@ cs_error_t stats_map_set(const char *key_name,
totempg_stats_clear(TOTEMPG_STATS_CLEAR_TOTEM);
cleared = 1;
}
if (strncmp(key_name, STATS_CLEAR_SCHEDMISS, strlen(STATS_CLEAR_SCHEDMISS)) == 0) {
schedmiss_clear_stats();
cleared = 1;
}
if (strncmp(key_name, STATS_CLEAR_ALL, strlen(STATS_CLEAR_ALL)) == 0) {
totempg_stats_clear(TOTEMPG_STATS_CLEAR_TRANSPORT | TOTEMPG_STATS_CLEAR_TOTEM);
cs_ipcs_clear_stats();
schedmiss_clear_stats();
cleared = 1;
}
if (!cleared) {
@ -500,6 +590,11 @@ static void stats_map_notify_fn(uint32_t event, char *key, void *old_value, void
return ;
}
/* Ignore schedmiss trackers as the values are read from the circular buffer */
if (strncmp(key, SCHEDMISS_PREFIX, strlen(SCHEDMISS_PREFIX)) == 0 ) {
return ;
}
new_val.data = new_value;
if (stats_map_get(key,
&new_value,
@ -556,7 +651,7 @@ cs_error_t stats_map_track_add(const char *key_name,
}
/* Get initial value */
if (stats_map_get(tracker->key_name,
&tracker->old_value, &value_len, &type) == CS_OK) {
&tracker->old_value, &value_len, &type) != CS_OK) {
tracker->old_value = 0ULL;
}
} else {

View File

@ -69,3 +69,5 @@ void stats_trigger_trackers(void);
void stats_ipcs_add_connection(int service_id, uint32_t pid, void *ptr);
void stats_ipcs_del_connection(int service_id, uint32_t pid, void *ptr);
cs_error_t cs_ipcs_get_conn_stats(int service_id, uint32_t pid, void *conn_ptr, struct ipcs_conn_stats *ipcs_stats);
void stats_add_schedmiss_event(uint64_t, float delay);

View File

@ -1,5 +1,5 @@
.\"/*
.\" * Copyright (c) 2012-2018 Red Hat, Inc.
.\" * Copyright (c) 2012-2020 Red Hat, Inc.
.\" *
.\" * All rights reserved.
.\" *
@ -357,6 +357,27 @@ contains the total number of interrupted sends.
.B service_id
contains the ID of service which the IPC is connected to.
.TP
stats.schedmiss.<n>.*
If corosync is not scheduled after the required period of time it will
log this event and also write an entry to the stats cmap under this key.
There can be up to 10 entries (0..9) in here, when an 11th event happens
the earliest will be removed.
These events will always be in reverse order, so stats.schedmiss.0.* will
always be the latest event kept and 9 the oldest. If you want to listen
for notifications then you are recommended to listen for changes
to stats.schedmiss.0.timestamp or stats.schedmiss.0.delay.
.B timestamp
The time of the event in ms since the Epoch (ie time_t * 1000 but with
valid milliseconds).
.B delay
The time that corosync was paused (in ms, float value).
.TP
stats.clear.*
These are write-only keys used to clear the stats for various subsystems
@ -370,6 +391,9 @@ Clears the knet stats
.B ipc
Clears the ipc stats
.B schedmiss
Clears the schedmiss stats
.B all
Clears all of the above stats

View File

@ -115,7 +115,7 @@ static int print_help(void)
printf(" about the networking and IPC traffic in some detail.\n");
printf("\n");
printf("Clear stats:\n");
printf(" corosync-cmapctl -C [knet|ipc|totem|all]\n");
printf(" corosync-cmapctl -C [knet|ipc|totem|schedmiss|all]\n");
printf(" The 'stats' map is implied\n");
printf("\n");
printf("Load settings from a file:\n");
@ -849,6 +849,7 @@ int main(int argc, char *argv[])
if (strcmp(optarg, "knet") == 0 ||
strcmp(optarg, "totem") == 0 ||
strcmp(optarg, "ipc") == 0 ||
strcmp(optarg, "schedmiss") == 0 ||
strcmp(optarg, "all") == 0) {
action = ACTION_CLEARSTATS;
clear_opt = optarg;
@ -857,7 +858,7 @@ int main(int argc, char *argv[])
map = CMAP_MAP_STATS;
}
else {
fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc' or 'all'\n");
fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc', 'schedmiss' or 'all'\n");
return (EXIT_FAILURE);
}
break;