mirror of
https://git.proxmox.com/git/mirror_frr
synced 2025-08-06 10:54:47 +00:00
lib: Differentiate between real and cpu bound processes
When generating SLOW_THREAD warnings let's differentiate between a cpu bound process and a wall bound process. Effectively a slow thread can now be a process in FRR doing lots of work( cpu bound ) or wall bound ( the cpu is heavy load and a FRR process may be pre-empted and never scheduled ). Signed-off-by: Donald Sharp <sharpd@nvidia.com>
This commit is contained in:
parent
694df37daf
commit
039d547f6f
@ -45,9 +45,15 @@ static struct log_ref ferr_lib_warn[] = {
|
||||
.suggestion = "Gather log data and open an Issue. restart FRR",
|
||||
},
|
||||
{
|
||||
.code = EC_LIB_SLOW_THREAD,
|
||||
.title = "The Event subsystem has detected a slow process",
|
||||
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof.",
|
||||
.code = EC_LIB_SLOW_THREAD_CPU,
|
||||
.title = "The Event subsystem has detected a slow cpu time process",
|
||||
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof. In this case total CPU time was over 5 seconds. Which indicates that FRR is very busy doing some work and should be addressed",
|
||||
.suggestion = "Gather log data and open an Issue",
|
||||
},
|
||||
{
|
||||
.code = EC_LIB_SLOW_THREAD_WALL,
|
||||
.title = "The Event subsystem has detected a slow wall time process",
|
||||
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination therof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying",
|
||||
.suggestion = "Gather log data and open an Issue",
|
||||
},
|
||||
{
|
||||
|
@ -44,7 +44,8 @@ enum lib_log_refs {
|
||||
EC_LIB_SNMP,
|
||||
EC_LIB_STREAM,
|
||||
EC_LIB_LINUX_NS,
|
||||
EC_LIB_SLOW_THREAD,
|
||||
EC_LIB_SLOW_THREAD_CPU,
|
||||
EC_LIB_SLOW_THREAD_WALL,
|
||||
EC_LIB_NO_THREAD,
|
||||
EC_LIB_RMAP_RECURSION_LIMIT,
|
||||
EC_LIB_BACKUP_CONFIG,
|
||||
|
22
lib/thread.c
22
lib/thread.c
@ -1850,15 +1850,29 @@ void thread_call(struct thread *thread)
|
||||
memory_order_seq_cst);
|
||||
|
||||
#ifdef CONSUMED_TIME_CHECK
|
||||
if (realtime > CONSUMED_TIME_CHECK) {
|
||||
if (cputime > CONSUMED_TIME_CHECK) {
|
||||
/*
|
||||
* We have a CPU Hog on our hands.
|
||||
* We have a CPU Hog on our hands. The time FRR
|
||||
* has spent doing actual work ( not sleeping )
|
||||
* is greater than 5 seconds.
|
||||
* Whinge about it now, so we're aware this is yet another task
|
||||
* to fix.
|
||||
*/
|
||||
flog_warn(
|
||||
EC_LIB_SLOW_THREAD,
|
||||
"SLOW THREAD: task %s (%lx) ran for %lums (cpu time %lums)",
|
||||
EC_LIB_SLOW_THREAD_CPU,
|
||||
"CPU HOG: task %s (%lx) ran for %lums (cpu time %lums)",
|
||||
thread->xref->funcname, (unsigned long)thread->func,
|
||||
realtime / 1000, cputime / 1000);
|
||||
} else if (realtime > CONSUMED_TIME_CHECK) {
|
||||
/*
|
||||
* The runtime for a task is greater than 5 seconds, but
|
||||
* the cpu time is under 5 seconds. Let's whine
|
||||
* about this because this could imply some sort of
|
||||
* scheduling issue.
|
||||
*/
|
||||
flog_warn(
|
||||
EC_LIB_SLOW_THREAD_WALL,
|
||||
"STARVATION: task %s (%lx) ran for %lums (cpu time %lums)",
|
||||
thread->xref->funcname, (unsigned long)thread->func,
|
||||
realtime / 1000, cputime / 1000);
|
||||
}
|
||||
|
12
lib/vty.c
12
lib/vty.c
@ -512,13 +512,19 @@ static int vty_command(struct vty *vty, char *buf)
|
||||
|
||||
#ifdef CONSUMED_TIME_CHECK
|
||||
GETRUSAGE(&after);
|
||||
if ((realtime = thread_consumed_time(&after, &before, &cputime))
|
||||
> CONSUMED_TIME_CHECK)
|
||||
realtime = thread_consumed_time(&after, &before, &cputime);
|
||||
if (cputime > CONSUMED_TIME_CHECK) {
|
||||
/* Warn about CPU hog that must be fixed. */
|
||||
flog_warn(
|
||||
EC_LIB_SLOW_THREAD,
|
||||
EC_LIB_SLOW_THREAD_CPU,
|
||||
"SLOW COMMAND: command took %lums (cpu time %lums): %s",
|
||||
realtime / 1000, cputime / 1000, buf);
|
||||
} else if (realtime > CONSUMED_TIME_CHECK) {
|
||||
flog_warn(
|
||||
EC_LIB_SLOW_THREAD_WALL,
|
||||
"SLOW COMMAND: command took %lums (cpu time %lums): %s",
|
||||
realtime / 1000, cputime / 1000, buf);
|
||||
}
|
||||
}
|
||||
#endif /* CONSUMED_TIME_CHECK */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user