mirror of
				https://git.proxmox.com/git/mirror_frr
				synced 2025-10-26 15:56:26 +00:00 
			
		
		
		
	Merge pull request #8011 from donaldsharp/starvation
lib: Figure out if we are being starved for cpu
This commit is contained in:
		
						commit
						cdaa204eff
					
				| @ -56,6 +56,12 @@ static struct log_ref ferr_lib_warn[] = { | |||||||
| 		.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner.  This can be either a misconfiguration, bug or some combination thereof.  In this case total WALL time was over 5 seconds.  Which indicates that FRR might be having trouble being scheduled or some system call is delaying", | 		.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner.  This can be either a misconfiguration, bug or some combination thereof.  In this case total WALL time was over 5 seconds.  Which indicates that FRR might be having trouble being scheduled or some system call is delaying", | ||||||
| 		.suggestion = "Gather log data and open an Issue", | 		.suggestion = "Gather log data and open an Issue", | ||||||
| 	}, | 	}, | ||||||
|  | 	{ | ||||||
|  | 		.code = EC_LIB_STARVE_THREAD, | ||||||
|  | 		.title = "The Event subsystem has detected a thread starvation issue", | ||||||
|  | 		.description = "The event subsystem has detected a thread starvation issue.  This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion", | ||||||
|  | 		.suggestion = "Gather log data and open an Issue", | ||||||
|  | 	}, | ||||||
| 	{ | 	{ | ||||||
| 		.code = EC_LIB_NO_THREAD, | 		.code = EC_LIB_NO_THREAD, | ||||||
| 		.title = "The Event subsystem has detected an internal FD problem", | 		.title = "The Event subsystem has detected an internal FD problem", | ||||||
|  | |||||||
| @ -46,6 +46,7 @@ enum lib_log_refs { | |||||||
| 	EC_LIB_LINUX_NS, | 	EC_LIB_LINUX_NS, | ||||||
| 	EC_LIB_SLOW_THREAD_CPU, | 	EC_LIB_SLOW_THREAD_CPU, | ||||||
| 	EC_LIB_SLOW_THREAD_WALL, | 	EC_LIB_SLOW_THREAD_WALL, | ||||||
|  | 	EC_LIB_STARVE_THREAD, | ||||||
| 	EC_LIB_NO_THREAD, | 	EC_LIB_NO_THREAD, | ||||||
| 	EC_LIB_RMAP_RECURSION_LIMIT, | 	EC_LIB_RMAP_RECURSION_LIMIT, | ||||||
| 	EC_LIB_BACKUP_CONFIG, | 	EC_LIB_BACKUP_CONFIG, | ||||||
|  | |||||||
							
								
								
									
										20
									
								
								lib/thread.c
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								lib/thread.c
									
									
									
									
									
								
							| @ -787,6 +787,7 @@ static struct thread *thread_get(struct thread_master *m, uint8_t type, | |||||||
| 	thread->arg = arg; | 	thread->arg = arg; | ||||||
| 	thread->yield = THREAD_YIELD_TIME_SLOT; /* default */ | 	thread->yield = THREAD_YIELD_TIME_SLOT; /* default */ | ||||||
| 	thread->ref = NULL; | 	thread->ref = NULL; | ||||||
|  | 	thread->ignore_timer_late = false; | ||||||
| 
 | 
 | ||||||
| 	/* | 	/* | ||||||
| 	 * So if the passed in funcname is not what we have | 	 * So if the passed in funcname is not what we have | ||||||
| @ -1651,12 +1652,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num) | |||||||
| static unsigned int thread_process_timers(struct thread_master *m, | static unsigned int thread_process_timers(struct thread_master *m, | ||||||
| 					  struct timeval *timenow) | 					  struct timeval *timenow) | ||||||
| { | { | ||||||
|  | 	struct timeval prev = *timenow; | ||||||
|  | 	bool displayed = false; | ||||||
| 	struct thread *thread; | 	struct thread *thread; | ||||||
| 	unsigned int ready = 0; | 	unsigned int ready = 0; | ||||||
| 
 | 
 | ||||||
| 	while ((thread = thread_timer_list_first(&m->timer))) { | 	while ((thread = thread_timer_list_first(&m->timer))) { | ||||||
| 		if (timercmp(timenow, &thread->u.sands, <)) | 		if (timercmp(timenow, &thread->u.sands, <)) | ||||||
| 			break; | 			break; | ||||||
|  | 		prev = thread->u.sands; | ||||||
|  | 		prev.tv_sec += 4; | ||||||
|  | 		/* | ||||||
|  | 		 * If the timer would have popped 4 seconds in the | ||||||
|  | 		 * past then we are in a situation where we are | ||||||
|  | 		 * really getting behind on handling of events. | ||||||
|  | 		 * Let's log it and do the right thing with it. | ||||||
|  | 		 */ | ||||||
|  | 		if (!displayed && !thread->ignore_timer_late && | ||||||
|  | 		    timercmp(timenow, &prev, >)) { | ||||||
|  | 			flog_warn( | ||||||
|  | 				EC_LIB_STARVE_THREAD, | ||||||
|  | 				"Thread Starvation: %pTHD was scheduled to pop greater than 4s ago", | ||||||
|  | 				thread); | ||||||
|  | 			displayed = true; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
| 		thread_timer_list_pop(&m->timer); | 		thread_timer_list_pop(&m->timer); | ||||||
| 		thread->type = THREAD_READY; | 		thread->type = THREAD_READY; | ||||||
| 		thread_list_add_tail(&m->ready, thread); | 		thread_list_add_tail(&m->ready, thread); | ||||||
|  | |||||||
| @ -126,6 +126,7 @@ struct thread { | |||||||
| 	unsigned long yield;		 /* yield time in microseconds */ | 	unsigned long yield;		 /* yield time in microseconds */ | ||||||
| 	const struct xref_threadsched *xref;   /* origin location */ | 	const struct xref_threadsched *xref;   /* origin location */ | ||||||
| 	pthread_mutex_t mtx;   /* mutex for thread.c functions */ | 	pthread_mutex_t mtx;   /* mutex for thread.c functions */ | ||||||
|  | 	bool ignore_timer_late; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| #ifdef _FRR_ATTRIBUTE_PRINTFRR | #ifdef _FRR_ATTRIBUTE_PRINTFRR | ||||||
| @ -285,6 +286,11 @@ extern bool thread_is_scheduled(struct thread *thread); | |||||||
| /* Debug signal mask */ | /* Debug signal mask */ | ||||||
| void debug_signals(const sigset_t *sigs); | void debug_signals(const sigset_t *sigs); | ||||||
| 
 | 
 | ||||||
|  | static inline void thread_ignore_late_timer(struct thread *thread) | ||||||
|  | { | ||||||
|  | 	thread->ignore_timer_late = true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  | |||||||
| @ -135,10 +135,11 @@ static int work_queue_schedule(struct work_queue *wq, unsigned int delay) | |||||||
| 		/* Schedule timer if there's a delay, otherwise just schedule | 		/* Schedule timer if there's a delay, otherwise just schedule | ||||||
| 		 * as an 'event' | 		 * as an 'event' | ||||||
| 		 */ | 		 */ | ||||||
| 		if (delay > 0) | 		if (delay > 0) { | ||||||
| 			thread_add_timer_msec(wq->master, work_queue_run, wq, | 			thread_add_timer_msec(wq->master, work_queue_run, wq, | ||||||
| 					      delay, &wq->thread); | 					      delay, &wq->thread); | ||||||
| 		else | 			thread_ignore_late_timer(wq->thread); | ||||||
|  | 		} else | ||||||
| 			thread_add_event(wq->master, work_queue_run, wq, 0, | 			thread_add_event(wq->master, work_queue_run, wq, 0, | ||||||
| 					 &wq->thread); | 					 &wq->thread); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -91,9 +91,12 @@ static int if_zebra_speed_update(struct thread *thread) | |||||||
| 		changed = true; | 		changed = true; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (changed || new_speed == UINT32_MAX) | 	if (changed || new_speed == UINT32_MAX) { | ||||||
| 		thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 5, | 		thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 5, | ||||||
| 				 &zif->speed_update); | 				 &zif->speed_update); | ||||||
|  | 		thread_ignore_late_timer(zif->speed_update); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	return 1; | 	return 1; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -187,6 +190,8 @@ static int if_zebra_new_hook(struct interface *ifp) | |||||||
| 	 */ | 	 */ | ||||||
| 	thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 15, | 	thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 15, | ||||||
| 			 &zebra_if->speed_update); | 			 &zebra_if->speed_update); | ||||||
|  | 	thread_ignore_late_timer(zebra_if->speed_update); | ||||||
|  | 
 | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -1074,6 +1079,7 @@ void if_up(struct interface *ifp) | |||||||
| 
 | 
 | ||||||
| 	thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 0, | 	thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 0, | ||||||
| 			 &zif->speed_update); | 			 &zif->speed_update); | ||||||
|  | 	thread_ignore_late_timer(zif->speed_update); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Interface goes down.  We have to manage different behavior of based | /* Interface goes down.  We have to manage different behavior of based | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Jafar Al-Gharaibeh
						Jafar Al-Gharaibeh