mirror of
https://git.proxmox.com/git/mirror_frr
synced 2025-07-03 04:11:47 +00:00
Merge pull request #8011 from donaldsharp/starvation
lib: Figure out if we are being starved for cpu
This commit is contained in:
commit
cdaa204eff
@ -56,6 +56,12 @@ static struct log_ref ferr_lib_warn[] = {
|
||||
.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination thereof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying",
|
||||
.suggestion = "Gather log data and open an Issue",
|
||||
},
|
||||
{
|
||||
.code = EC_LIB_STARVE_THREAD,
|
||||
.title = "The Event subsystem has detected a thread starvation issue",
|
||||
.description = "The event subsystem has detected a thread starvation issue. This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion",
|
||||
.suggestion = "Gather log data and open an Issue",
|
||||
},
|
||||
{
|
||||
.code = EC_LIB_NO_THREAD,
|
||||
.title = "The Event subsystem has detected an internal FD problem",
|
||||
|
@ -46,6 +46,7 @@ enum lib_log_refs {
|
||||
EC_LIB_LINUX_NS,
|
||||
EC_LIB_SLOW_THREAD_CPU,
|
||||
EC_LIB_SLOW_THREAD_WALL,
|
||||
EC_LIB_STARVE_THREAD,
|
||||
EC_LIB_NO_THREAD,
|
||||
EC_LIB_RMAP_RECURSION_LIMIT,
|
||||
EC_LIB_BACKUP_CONFIG,
|
||||
|
20
lib/thread.c
20
lib/thread.c
@ -787,6 +787,7 @@ static struct thread *thread_get(struct thread_master *m, uint8_t type,
|
||||
thread->arg = arg;
|
||||
thread->yield = THREAD_YIELD_TIME_SLOT; /* default */
|
||||
thread->ref = NULL;
|
||||
thread->ignore_timer_late = false;
|
||||
|
||||
/*
|
||||
* So if the passed in funcname is not what we have
|
||||
@ -1651,12 +1652,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num)
|
||||
static unsigned int thread_process_timers(struct thread_master *m,
|
||||
struct timeval *timenow)
|
||||
{
|
||||
struct timeval prev = *timenow;
|
||||
bool displayed = false;
|
||||
struct thread *thread;
|
||||
unsigned int ready = 0;
|
||||
|
||||
while ((thread = thread_timer_list_first(&m->timer))) {
|
||||
if (timercmp(timenow, &thread->u.sands, <))
|
||||
break;
|
||||
prev = thread->u.sands;
|
||||
prev.tv_sec += 4;
|
||||
/*
|
||||
* If the timer would have popped 4 seconds in the
|
||||
* past then we are in a situation where we are
|
||||
* really getting behind on handling of events.
|
||||
* Let's log it and do the right thing with it.
|
||||
*/
|
||||
if (!displayed && !thread->ignore_timer_late &&
|
||||
timercmp(timenow, &prev, >)) {
|
||||
flog_warn(
|
||||
EC_LIB_STARVE_THREAD,
|
||||
"Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
|
||||
thread);
|
||||
displayed = true;
|
||||
}
|
||||
|
||||
thread_timer_list_pop(&m->timer);
|
||||
thread->type = THREAD_READY;
|
||||
thread_list_add_tail(&m->ready, thread);
|
||||
|
@ -126,6 +126,7 @@ struct thread {
|
||||
unsigned long yield; /* yield time in microseconds */
|
||||
const struct xref_threadsched *xref; /* origin location */
|
||||
pthread_mutex_t mtx; /* mutex for thread.c functions */
|
||||
bool ignore_timer_late;
|
||||
};
|
||||
|
||||
#ifdef _FRR_ATTRIBUTE_PRINTFRR
|
||||
@ -285,6 +286,11 @@ extern bool thread_is_scheduled(struct thread *thread);
|
||||
/* Debug signal mask */
|
||||
void debug_signals(const sigset_t *sigs);
|
||||
|
||||
/*
 * Mark a timer thread as exempt from the "Thread Starvation" warning.
 *
 * thread_process_timers() only logs a late-popping timer when the
 * thread's ignore_timer_late flag is false; this helper sets that flag
 * to true.  The pointer is dereferenced unconditionally, so the caller
 * must pass a non-NULL thread.
 */
static inline void thread_ignore_late_timer(struct thread *thread)
|
||||
{
|
||||
thread->ignore_timer_late = true;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -135,10 +135,11 @@ static int work_queue_schedule(struct work_queue *wq, unsigned int delay)
|
||||
/* Schedule timer if there's a delay, otherwise just schedule
|
||||
* as an 'event'
|
||||
*/
|
||||
if (delay > 0)
|
||||
if (delay > 0) {
|
||||
thread_add_timer_msec(wq->master, work_queue_run, wq,
|
||||
delay, &wq->thread);
|
||||
else
|
||||
thread_ignore_late_timer(wq->thread);
|
||||
} else
|
||||
thread_add_event(wq->master, work_queue_run, wq, 0,
|
||||
&wq->thread);
|
||||
|
||||
|
@ -91,9 +91,12 @@ static int if_zebra_speed_update(struct thread *thread)
|
||||
changed = true;
|
||||
}
|
||||
|
||||
if (changed || new_speed == UINT32_MAX)
|
||||
if (changed || new_speed == UINT32_MAX) {
|
||||
thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 5,
|
||||
&zif->speed_update);
|
||||
thread_ignore_late_timer(zif->speed_update);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -187,6 +190,8 @@ static int if_zebra_new_hook(struct interface *ifp)
|
||||
*/
|
||||
thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 15,
|
||||
&zebra_if->speed_update);
|
||||
thread_ignore_late_timer(zebra_if->speed_update);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1074,6 +1079,7 @@ void if_up(struct interface *ifp)
|
||||
|
||||
thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 0,
|
||||
&zif->speed_update);
|
||||
thread_ignore_late_timer(zif->speed_update);
|
||||
}
|
||||
|
||||
/* Interface goes down. We have to manage different behavior of based
|
||||
|
Loading…
Reference in New Issue
Block a user