diff --git a/lib/lib_errors.c b/lib/lib_errors.c index a139b9a14c..acc9a05c33 100644 --- a/lib/lib_errors.c +++ b/lib/lib_errors.c @@ -56,6 +56,12 @@ static struct log_ref ferr_lib_warn[] = { .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination thereof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying", .suggestion = "Gather log data and open an Issue", }, + { + .code = EC_LIB_STARVE_THREAD, + .title = "The Event subsystem has detected a thread starvation issue", + .description = "The event subsystem has detected a thread starvation issue. This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion", + .suggestion = "Gather log data and open an Issue", + }, { .code = EC_LIB_NO_THREAD, .title = "The Event subsystem has detected an internal FD problem", diff --git a/lib/lib_errors.h b/lib/lib_errors.h index 9f0f58d20b..64ac6c1ceb 100644 --- a/lib/lib_errors.h +++ b/lib/lib_errors.h @@ -46,6 +46,7 @@ enum lib_log_refs { EC_LIB_LINUX_NS, EC_LIB_SLOW_THREAD_CPU, EC_LIB_SLOW_THREAD_WALL, + EC_LIB_STARVE_THREAD, EC_LIB_NO_THREAD, EC_LIB_RMAP_RECURSION_LIMIT, EC_LIB_BACKUP_CONFIG, diff --git a/lib/thread.c b/lib/thread.c index 77e34f48f3..73e0e4887c 100644 --- a/lib/thread.c +++ b/lib/thread.c @@ -1651,12 +1651,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num) static unsigned int thread_process_timers(struct thread_master *m, struct timeval *timenow) { + struct timeval prev = *timenow; + bool displayed = false; struct thread *thread; unsigned int ready = 0; while ((thread = thread_timer_list_first(&m->timer))) { if (timercmp(timenow, &thread->u.sands, <)) break; + prev = thread->u.sands; + prev.tv_sec += 4; + /* + * If the timer would have popped 4 seconds in the + * past then we are in a situation where we are + * really getting behind on handling of events. + * Let's log it and do the right thing with it. + */ + if (timercmp(timenow, &prev, >)) { + if (!displayed) + flog_warn( + EC_LIB_STARVE_THREAD, + "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago", + thread); + displayed = true; + } + thread_timer_list_pop(&m->timer); thread->type = THREAD_READY; thread_list_add_tail(&m->ready, thread);