mirror of
https://git.proxmox.com/git/mirror_frr
synced 2025-08-14 16:04:49 +00:00
Merge pull request #18094 from opensourcerouting/event-tardy-bfd
lib (+bfd): improve late timer warnings
This commit is contained in:
commit
9623612fc0
@ -107,6 +107,7 @@ void sbfd_init_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
tv_normalize(&tv);
|
||||
|
||||
event_add_timer_tv(master, sbfd_init_xmt_cb, bs, &tv, &bs->xmttimer_ev);
|
||||
event_set_tardy_threshold(bs->xmttimer_ev, bs->xmt_TO / 2);
|
||||
}
|
||||
|
||||
void sbfd_echo_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
@ -123,6 +124,7 @@ void sbfd_echo_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
tv_normalize(&tv);
|
||||
|
||||
event_add_timer_tv(master, sbfd_echo_xmt_cb, bs, &tv, &bs->echo_xmttimer_ev);
|
||||
event_set_tardy_threshold(bs->echo_xmttimer_ev, bs->echo_xmt_TO / 2);
|
||||
}
|
||||
|
||||
void bfd_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
@ -140,6 +142,7 @@ void bfd_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
tv_normalize(&tv);
|
||||
|
||||
event_add_timer_tv(master, bfd_xmt_cb, bs, &tv, &bs->xmttimer_ev);
|
||||
event_set_tardy_threshold(bs->xmttimer_ev, bs->xmt_TO / 2);
|
||||
}
|
||||
|
||||
void bfd_echo_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
@ -158,6 +161,7 @@ void bfd_echo_xmttimer_update(struct bfd_session *bs, uint64_t jitter)
|
||||
|
||||
event_add_timer_tv(master, bfd_echo_xmt_cb, bs, &tv,
|
||||
&bs->echo_xmttimer_ev);
|
||||
event_set_tardy_threshold(bs->echo_xmttimer_ev, bs->echo_xmt_TO / 2);
|
||||
}
|
||||
|
||||
void bfd_recvtimer_delete(struct bfd_session *bs)
|
||||
|
85
lib/event.c
85
lib/event.c
@ -571,6 +571,11 @@ struct event_loop *event_master_create(const char *name)
|
||||
rv->spin = true;
|
||||
rv->handle_signals = true;
|
||||
|
||||
/* tardy event warnings */
|
||||
monotime(&rv->last_tardy_warning);
|
||||
rv->last_tardy_warning.tv_sec -= (TARDY_WARNING_INTERVAL + TIMER_SECOND_MICRO - 1) /
|
||||
TIMER_SECOND_MICRO;
|
||||
|
||||
/* Set pthread owner, should be updated by actual owner */
|
||||
rv->owner = pthread_self();
|
||||
rv->cancel_req = list_new();
|
||||
@ -770,13 +775,13 @@ static struct event *thread_get(struct event_loop *m, uint8_t type,
|
||||
thread->master = m;
|
||||
thread->arg = arg;
|
||||
thread->yield = EVENT_YIELD_TIME_SLOT; /* default */
|
||||
thread->tardy_threshold = 0;
|
||||
/* thread->ref is zeroed either by XCALLOC above or by memset before
|
||||
* being put on the "unuse" list by thread_add_unuse().
|
||||
* Setting it here again makes coverity complain about a missing
|
||||
* lock :(
|
||||
*/
|
||||
/* thread->ref = NULL; */
|
||||
thread->ignore_timer_late = false;
|
||||
|
||||
/*
|
||||
* So if the passed in funcname is not what we have
|
||||
@ -1023,6 +1028,8 @@ static void _event_add_timer_timeval(const struct xref_eventsched *xref,
|
||||
return;
|
||||
|
||||
thread = thread_get(m, EVENT_TIMER, func, arg, xref);
|
||||
/* default lateness warning: 4s */
|
||||
thread->tardy_threshold = TARDY_DEFAULT_THRESHOLD;
|
||||
|
||||
frr_with_mutex (&thread->mtx) {
|
||||
thread->u.sands = t;
|
||||
@ -1685,34 +1692,12 @@ static void thread_process_io(struct event_loop *m, unsigned int num)
|
||||
static unsigned int thread_process_timers(struct event_loop *m,
|
||||
struct timeval *timenow)
|
||||
{
|
||||
struct timeval prev = *timenow;
|
||||
bool displayed = false;
|
||||
struct event *thread;
|
||||
unsigned int ready = 0;
|
||||
|
||||
while ((thread = event_timer_list_first(&m->timer))) {
|
||||
if (timercmp(timenow, &thread->u.sands, <))
|
||||
break;
|
||||
prev = thread->u.sands;
|
||||
prev.tv_sec += 4;
|
||||
/*
|
||||
* If the timer would have popped 4 seconds in the
|
||||
* past then we are in a situation where we are
|
||||
* really getting behind on handling of events.
|
||||
* Let's log it and do the right thing with it.
|
||||
*/
|
||||
if (timercmp(timenow, &prev, >)) {
|
||||
atomic_fetch_add_explicit(
|
||||
&thread->hist->total_starv_warn, 1,
|
||||
memory_order_seq_cst);
|
||||
if (!displayed && !thread->ignore_timer_late) {
|
||||
flog_warn(
|
||||
EC_LIB_STARVE_THREAD,
|
||||
"Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
|
||||
thread);
|
||||
displayed = true;
|
||||
}
|
||||
}
|
||||
|
||||
event_timer_list_pop(&m->timer);
|
||||
thread->type = EVENT_READY;
|
||||
@ -1946,6 +1931,29 @@ void event_getrusage(RUSAGE_T *r)
|
||||
#endif
|
||||
}
|
||||
|
||||
static void event_tardy_warn(struct event *thread, unsigned long since_us)
|
||||
{
|
||||
char buf[64];
|
||||
struct fbuf fb = { .buf = buf, .pos = buf, .len = sizeof(buf) };
|
||||
double loadavg[3];
|
||||
int rv;
|
||||
|
||||
rv = getloadavg(loadavg, array_size(loadavg));
|
||||
if (rv < 0)
|
||||
bprintfrr(&fb, "not available");
|
||||
else {
|
||||
for (int i = 0; i < rv; i++) {
|
||||
bprintfrr(&fb, "%.2f", loadavg[i]);
|
||||
if (i < rv - 1)
|
||||
bputs(&fb, ", ");
|
||||
}
|
||||
}
|
||||
|
||||
flog_warn(EC_LIB_STARVE_THREAD,
|
||||
"CPU starvation: %pTHD getting executed %lums late, warning threshold %lums. System load: %pFB",
|
||||
thread, (since_us + 999) / 1000, (thread->tardy_threshold + 999) / 1000, &fb);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call a thread.
|
||||
*
|
||||
@ -1962,6 +1970,33 @@ void event_call(struct event *thread)
|
||||
RUSAGE_T before, after;
|
||||
bool suppress_warnings = EVENT_ARG(thread);
|
||||
|
||||
if (thread->tardy_threshold) {
|
||||
int64_t timer_late_us = monotime_since(&thread->u.sands, NULL);
|
||||
|
||||
/* Timers have a tardiness warning defaulting to 4s.
|
||||
* It can be customized with event_set_tardy_threshold()
|
||||
* (bfdd does that since the protocol has really short timers)
|
||||
*
|
||||
* If we are more than that threshold late, print a warning
|
||||
* since we're running behind in calling timers (probably due
|
||||
* to high system load.)
|
||||
*/
|
||||
if (timer_late_us > (int64_t)thread->tardy_threshold) {
|
||||
int64_t since_last_warning;
|
||||
struct timeval *tw;
|
||||
|
||||
atomic_fetch_add_explicit(&thread->hist->total_starv_warn, 1,
|
||||
memory_order_seq_cst);
|
||||
|
||||
tw = &thread->master->last_tardy_warning;
|
||||
since_last_warning = monotime_since(tw, NULL);
|
||||
if (since_last_warning > TARDY_WARNING_INTERVAL) {
|
||||
event_tardy_warn(thread, timer_late_us);
|
||||
monotime(tw);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if the thread being called is the CLI, it may change cputime_enabled
|
||||
* ("service cputime-stats" command), which can result in nonsensical
|
||||
* and very confusing warnings
|
||||
@ -2145,9 +2180,9 @@ static ssize_t printfrr_thread_dbg(struct fbuf *buf, struct printfrr_eargs *ea,
|
||||
char info[16] = "";
|
||||
|
||||
if (!thread)
|
||||
return bputs(buf, "{(thread *)NULL}");
|
||||
return bputs(buf, "{(event *)NULL}");
|
||||
|
||||
rv += bprintfrr(buf, "{(thread *)%p arg=%p", thread, thread->arg);
|
||||
rv += bprintfrr(buf, "{(event *)%p arg=%p", thread, thread->arg);
|
||||
|
||||
if (thread->type < array_size(types) && types[thread->type])
|
||||
rv += bprintfrr(buf, " %-6s", types[thread->type]);
|
||||
|
@ -95,6 +95,7 @@ struct event_loop {
|
||||
|
||||
bool ready_run_loop;
|
||||
RUSAGE_T last_getrusage;
|
||||
struct timeval last_tardy_warning;
|
||||
};
|
||||
|
||||
/* Event types. */
|
||||
@ -126,11 +127,17 @@ struct event {
|
||||
struct timeval real;
|
||||
struct cpu_event_history *hist; /* cache pointer to cpu_history */
|
||||
unsigned long yield; /* yield time in microseconds */
|
||||
/* lateness warning threshold, usec. 0 if it's not a timer. */
|
||||
unsigned long tardy_threshold;
|
||||
const struct xref_eventsched *xref; /* origin location */
|
||||
pthread_mutex_t mtx; /* mutex for thread.c functions */
|
||||
bool ignore_timer_late;
|
||||
};
|
||||
|
||||
/* rate limit late timer warnings */
|
||||
#define TARDY_WARNING_INTERVAL 10 * TIMER_SECOND_MICRO
|
||||
/* default threshold for late timer warning */
|
||||
#define TARDY_DEFAULT_THRESHOLD 4 * TIMER_SECOND_MICRO
|
||||
|
||||
#ifdef _FRR_ATTRIBUTE_PRINTFRR
|
||||
#pragma FRR printfrr_ext "%pTH"(struct event *)
|
||||
#endif
|
||||
@ -305,9 +312,17 @@ static inline bool event_is_scheduled(struct event *thread)
|
||||
/* Debug signal mask */
|
||||
void debug_signals(const sigset_t *sigs);
|
||||
|
||||
/* getting called more than given microseconds late will print a warning.
|
||||
* Default if not called: 4s. Don't call this on non-timers.
|
||||
*/
|
||||
static inline void event_set_tardy_threshold(struct event *event, unsigned long thres)
|
||||
{
|
||||
event->tardy_threshold = thres;
|
||||
}
|
||||
|
||||
static inline void event_ignore_late_timer(struct event *event)
|
||||
{
|
||||
event->ignore_timer_late = true;
|
||||
event->tardy_threshold = 0;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
Loading…
Reference in New Issue
Block a user