mirror of
https://git.proxmox.com/git/mirror_frr
synced 2025-08-12 11:18:59 +00:00
Merge pull request #8003 from donaldsharp/timings
lib: Differentiate between real and cpu bound processes
This commit is contained in:
commit
5bb91468c5
@ -45,9 +45,15 @@ static struct log_ref ferr_lib_warn[] = {
|
|||||||
.suggestion = "Gather log data and open an Issue. restart FRR",
|
.suggestion = "Gather log data and open an Issue. restart FRR",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
.code = EC_LIB_SLOW_THREAD,
|
.code = EC_LIB_SLOW_THREAD_CPU,
|
||||||
.title = "The Event subsystem has detected a slow process",
|
.title = "The Event subsystem has detected a slow cpu time process",
|
||||||
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof.",
|
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof. In this case total CPU time was over 5 seconds. Which indicates that FRR is very busy doing some work and should be addressed",
|
||||||
|
.suggestion = "Gather log data and open an Issue",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.code = EC_LIB_SLOW_THREAD_WALL,
|
||||||
|
.title = "The Event subsystem has detected a slow wall time process",
|
||||||
|
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination therof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying",
|
||||||
.suggestion = "Gather log data and open an Issue",
|
.suggestion = "Gather log data and open an Issue",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -44,7 +44,8 @@ enum lib_log_refs {
|
|||||||
EC_LIB_SNMP,
|
EC_LIB_SNMP,
|
||||||
EC_LIB_STREAM,
|
EC_LIB_STREAM,
|
||||||
EC_LIB_LINUX_NS,
|
EC_LIB_LINUX_NS,
|
||||||
EC_LIB_SLOW_THREAD,
|
EC_LIB_SLOW_THREAD_CPU,
|
||||||
|
EC_LIB_SLOW_THREAD_WALL,
|
||||||
EC_LIB_NO_THREAD,
|
EC_LIB_NO_THREAD,
|
||||||
EC_LIB_RMAP_RECURSION_LIMIT,
|
EC_LIB_RMAP_RECURSION_LIMIT,
|
||||||
EC_LIB_BACKUP_CONFIG,
|
EC_LIB_BACKUP_CONFIG,
|
||||||
|
45
lib/thread.c
45
lib/thread.c
@ -124,11 +124,12 @@ static void cpu_record_hash_free(void *a)
|
|||||||
static void vty_out_cpu_thread_history(struct vty *vty,
|
static void vty_out_cpu_thread_history(struct vty *vty,
|
||||||
struct cpu_thread_history *a)
|
struct cpu_thread_history *a)
|
||||||
{
|
{
|
||||||
vty_out(vty, "%5zu %10zu.%03zu %9zu %8zu %9zu %8zu %9zu",
|
vty_out(vty, "%5zu %10zu.%03zu %9zu %8zu %9zu %8zu %9zu %9zu %9zu",
|
||||||
a->total_active, a->cpu.total / 1000, a->cpu.total % 1000,
|
a->total_active, a->cpu.total / 1000, a->cpu.total % 1000,
|
||||||
a->total_calls, (a->cpu.total / a->total_calls), a->cpu.max,
|
a->total_calls, (a->cpu.total / a->total_calls), a->cpu.max,
|
||||||
(a->real.total / a->total_calls), a->real.max);
|
(a->real.total / a->total_calls), a->real.max,
|
||||||
vty_out(vty, " %c%c%c%c%c %s\n",
|
a->total_cpu_warn, a->total_wall_warn);
|
||||||
|
vty_out(vty, " %c%c%c%c%c %s\n",
|
||||||
a->types & (1 << THREAD_READ) ? 'R' : ' ',
|
a->types & (1 << THREAD_READ) ? 'R' : ' ',
|
||||||
a->types & (1 << THREAD_WRITE) ? 'W' : ' ',
|
a->types & (1 << THREAD_WRITE) ? 'W' : ' ',
|
||||||
a->types & (1 << THREAD_TIMER) ? 'T' : ' ',
|
a->types & (1 << THREAD_TIMER) ? 'T' : ' ',
|
||||||
@ -149,6 +150,10 @@ static void cpu_record_hash_print(struct hash_bucket *bucket, void *args[])
|
|||||||
atomic_load_explicit(&a->total_active, memory_order_seq_cst);
|
atomic_load_explicit(&a->total_active, memory_order_seq_cst);
|
||||||
copy.total_calls =
|
copy.total_calls =
|
||||||
atomic_load_explicit(&a->total_calls, memory_order_seq_cst);
|
atomic_load_explicit(&a->total_calls, memory_order_seq_cst);
|
||||||
|
copy.total_cpu_warn =
|
||||||
|
atomic_load_explicit(&a->total_cpu_warn, memory_order_seq_cst);
|
||||||
|
copy.total_wall_warn =
|
||||||
|
atomic_load_explicit(&a->total_wall_warn, memory_order_seq_cst);
|
||||||
copy.cpu.total =
|
copy.cpu.total =
|
||||||
atomic_load_explicit(&a->cpu.total, memory_order_seq_cst);
|
atomic_load_explicit(&a->cpu.total, memory_order_seq_cst);
|
||||||
copy.cpu.max = atomic_load_explicit(&a->cpu.max, memory_order_seq_cst);
|
copy.cpu.max = atomic_load_explicit(&a->cpu.max, memory_order_seq_cst);
|
||||||
@ -165,6 +170,8 @@ static void cpu_record_hash_print(struct hash_bucket *bucket, void *args[])
|
|||||||
vty_out_cpu_thread_history(vty, ©);
|
vty_out_cpu_thread_history(vty, ©);
|
||||||
totals->total_active += copy.total_active;
|
totals->total_active += copy.total_active;
|
||||||
totals->total_calls += copy.total_calls;
|
totals->total_calls += copy.total_calls;
|
||||||
|
totals->total_cpu_warn += copy.total_cpu_warn;
|
||||||
|
totals->total_wall_warn += copy.total_wall_warn;
|
||||||
totals->real.total += copy.real.total;
|
totals->real.total += copy.real.total;
|
||||||
if (totals->real.max < copy.real.max)
|
if (totals->real.max < copy.real.max)
|
||||||
totals->real.max = copy.real.max;
|
totals->real.max = copy.real.max;
|
||||||
@ -202,7 +209,7 @@ static void cpu_record_print(struct vty *vty, uint8_t filter)
|
|||||||
vty_out(vty,
|
vty_out(vty,
|
||||||
"Active Runtime(ms) Invoked Avg uSec Max uSecs");
|
"Active Runtime(ms) Invoked Avg uSec Max uSecs");
|
||||||
vty_out(vty, " Avg uSec Max uSecs");
|
vty_out(vty, " Avg uSec Max uSecs");
|
||||||
vty_out(vty, " Type Thread\n");
|
vty_out(vty, " CPU_Warn Wall_Warn Type Thread\n");
|
||||||
|
|
||||||
if (m->cpu_record->count)
|
if (m->cpu_record->count)
|
||||||
hash_iterate(
|
hash_iterate(
|
||||||
@ -223,7 +230,7 @@ static void cpu_record_print(struct vty *vty, uint8_t filter)
|
|||||||
vty_out(vty, "%30s %18s %18s\n", "",
|
vty_out(vty, "%30s %18s %18s\n", "",
|
||||||
"CPU (user+system):", "Real (wall-clock):");
|
"CPU (user+system):", "Real (wall-clock):");
|
||||||
vty_out(vty, "Active Runtime(ms) Invoked Avg uSec Max uSecs");
|
vty_out(vty, "Active Runtime(ms) Invoked Avg uSec Max uSecs");
|
||||||
vty_out(vty, " Avg uSec Max uSecs");
|
vty_out(vty, " Avg uSec Max uSecs CPU_Warn Wall_Warn");
|
||||||
vty_out(vty, " Type Thread\n");
|
vty_out(vty, " Type Thread\n");
|
||||||
|
|
||||||
if (tmp.total_calls > 0)
|
if (tmp.total_calls > 0)
|
||||||
@ -1850,15 +1857,33 @@ void thread_call(struct thread *thread)
|
|||||||
memory_order_seq_cst);
|
memory_order_seq_cst);
|
||||||
|
|
||||||
#ifdef CONSUMED_TIME_CHECK
|
#ifdef CONSUMED_TIME_CHECK
|
||||||
if (realtime > CONSUMED_TIME_CHECK) {
|
if (cputime > CONSUMED_TIME_CHECK) {
|
||||||
/*
|
/*
|
||||||
* We have a CPU Hog on our hands.
|
* We have a CPU Hog on our hands. The time FRR
|
||||||
|
* has spent doing actual work ( not sleeping )
|
||||||
|
* is greater than 5 seconds.
|
||||||
* Whinge about it now, so we're aware this is yet another task
|
* Whinge about it now, so we're aware this is yet another task
|
||||||
* to fix.
|
* to fix.
|
||||||
*/
|
*/
|
||||||
|
atomic_fetch_add_explicit(&thread->hist->total_cpu_warn,
|
||||||
|
1, memory_order_seq_cst);
|
||||||
flog_warn(
|
flog_warn(
|
||||||
EC_LIB_SLOW_THREAD,
|
EC_LIB_SLOW_THREAD_CPU,
|
||||||
"SLOW THREAD: task %s (%lx) ran for %lums (cpu time %lums)",
|
"CPU HOG: task %s (%lx) ran for %lums (cpu time %lums)",
|
||||||
|
thread->xref->funcname, (unsigned long)thread->func,
|
||||||
|
realtime / 1000, cputime / 1000);
|
||||||
|
} else if (realtime > CONSUMED_TIME_CHECK) {
|
||||||
|
/*
|
||||||
|
* The runtime for a task is greater than 5 seconds, but
|
||||||
|
* the cpu time is under 5 seconds. Let's whine
|
||||||
|
* about this because this could imply some sort of
|
||||||
|
* scheduling issue.
|
||||||
|
*/
|
||||||
|
atomic_fetch_add_explicit(&thread->hist->total_wall_warn,
|
||||||
|
1, memory_order_seq_cst);
|
||||||
|
flog_warn(
|
||||||
|
EC_LIB_SLOW_THREAD_WALL,
|
||||||
|
"STARVATION: task %s (%lx) ran for %lums (cpu time %lums)",
|
||||||
thread->xref->funcname, (unsigned long)thread->func,
|
thread->xref->funcname, (unsigned long)thread->func,
|
||||||
realtime / 1000, cputime / 1000);
|
realtime / 1000, cputime / 1000);
|
||||||
}
|
}
|
||||||
|
@ -119,6 +119,8 @@ struct thread {
|
|||||||
|
|
||||||
struct cpu_thread_history {
|
struct cpu_thread_history {
|
||||||
int (*func)(struct thread *);
|
int (*func)(struct thread *);
|
||||||
|
atomic_size_t total_cpu_warn;
|
||||||
|
atomic_size_t total_wall_warn;
|
||||||
atomic_size_t total_calls;
|
atomic_size_t total_calls;
|
||||||
atomic_size_t total_active;
|
atomic_size_t total_active;
|
||||||
struct time_stats {
|
struct time_stats {
|
||||||
|
14
lib/vty.c
14
lib/vty.c
@ -515,13 +515,19 @@ static int vty_command(struct vty *vty, char *buf)
|
|||||||
|
|
||||||
#ifdef CONSUMED_TIME_CHECK
|
#ifdef CONSUMED_TIME_CHECK
|
||||||
GETRUSAGE(&after);
|
GETRUSAGE(&after);
|
||||||
if ((realtime = thread_consumed_time(&after, &before, &cputime))
|
realtime = thread_consumed_time(&after, &before, &cputime);
|
||||||
> CONSUMED_TIME_CHECK)
|
if (cputime > CONSUMED_TIME_CHECK) {
|
||||||
/* Warn about CPU hog that must be fixed. */
|
/* Warn about CPU hog that must be fixed. */
|
||||||
flog_warn(
|
flog_warn(
|
||||||
EC_LIB_SLOW_THREAD,
|
EC_LIB_SLOW_THREAD_CPU,
|
||||||
"SLOW COMMAND: command took %lums (cpu time %lums): %s",
|
"CPU HOG: command took %lums (cpu time %lums): %s",
|
||||||
realtime / 1000, cputime / 1000, buf);
|
realtime / 1000, cputime / 1000, buf);
|
||||||
|
} else if (realtime > CONSUMED_TIME_CHECK) {
|
||||||
|
flog_warn(
|
||||||
|
EC_LIB_SLOW_THREAD_WALL,
|
||||||
|
"STARVATION: command took %lums (cpu time %lums): %s",
|
||||||
|
realtime / 1000, cputime / 1000, buf);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif /* CONSUMED_TIME_CHECK */
|
#endif /* CONSUMED_TIME_CHECK */
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user