mirror of
				https://git.proxmox.com/git/mirror_frr
				synced 2025-10-25 00:58:30 +00:00 
			
		
		
		
	Merge pull request #8011 from donaldsharp/starvation
lib: Figure out if we are being starved for cpu
This commit is contained in:
		
						commit
						cdaa204eff
					
				| @ -56,6 +56,12 @@ static struct log_ref ferr_lib_warn[] = { | ||||
| 		.description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner.  This can be either a misconfiguration, bug or some combination thereof.  In this case total WALL time was over 5 seconds.  Which indicates that FRR might be having trouble being scheduled or some system call is delaying", | ||||
| 		.suggestion = "Gather log data and open an Issue", | ||||
| 	}, | ||||
| 	{ | ||||
| 		.code = EC_LIB_STARVE_THREAD, | ||||
| 		.title = "The Event subsystem has detected a thread starvation issue", | ||||
| 		.description = "The event subsystem has detected a thread starvation issue.  This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion", | ||||
| 		.suggestion = "Gather log data and open an Issue", | ||||
| 	}, | ||||
| 	{ | ||||
| 		.code = EC_LIB_NO_THREAD, | ||||
| 		.title = "The Event subsystem has detected an internal FD problem", | ||||
|  | ||||
| @ -46,6 +46,7 @@ enum lib_log_refs { | ||||
| 	EC_LIB_LINUX_NS, | ||||
| 	EC_LIB_SLOW_THREAD_CPU, | ||||
| 	EC_LIB_SLOW_THREAD_WALL, | ||||
| 	EC_LIB_STARVE_THREAD, | ||||
| 	EC_LIB_NO_THREAD, | ||||
| 	EC_LIB_RMAP_RECURSION_LIMIT, | ||||
| 	EC_LIB_BACKUP_CONFIG, | ||||
|  | ||||
							
								
								
									
										20
									
								
								lib/thread.c
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								lib/thread.c
									
									
									
									
									
								
							| @ -787,6 +787,7 @@ static struct thread *thread_get(struct thread_master *m, uint8_t type, | ||||
| 	thread->arg = arg; | ||||
| 	thread->yield = THREAD_YIELD_TIME_SLOT; /* default */ | ||||
| 	thread->ref = NULL; | ||||
| 	thread->ignore_timer_late = false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * So if the passed in funcname is not what we have | ||||
| @ -1651,12 +1652,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num) | ||||
| static unsigned int thread_process_timers(struct thread_master *m, | ||||
| 					  struct timeval *timenow) | ||||
| { | ||||
| 	struct timeval prev = *timenow; | ||||
| 	bool displayed = false; | ||||
| 	struct thread *thread; | ||||
| 	unsigned int ready = 0; | ||||
| 
 | ||||
| 	while ((thread = thread_timer_list_first(&m->timer))) { | ||||
| 		if (timercmp(timenow, &thread->u.sands, <)) | ||||
| 			break; | ||||
| 		prev = thread->u.sands; | ||||
| 		prev.tv_sec += 4; | ||||
| 		/*
 | ||||
| 		 * If the timer would have popped 4 seconds in the | ||||
| 		 * past then we are in a situation where we are | ||||
| 		 * really getting behind on handling of events. | ||||
| 		 * Let's log it and do the right thing with it. | ||||
| 		 */ | ||||
| 		if (!displayed && !thread->ignore_timer_late && | ||||
| 		    timercmp(timenow, &prev, >)) { | ||||
| 			flog_warn( | ||||
| 				EC_LIB_STARVE_THREAD, | ||||
| 				"Thread Starvation: %pTHD was scheduled to pop greater than 4s ago", | ||||
| 				thread); | ||||
| 			displayed = true; | ||||
| 		} | ||||
| 
 | ||||
| 		thread_timer_list_pop(&m->timer); | ||||
| 		thread->type = THREAD_READY; | ||||
| 		thread_list_add_tail(&m->ready, thread); | ||||
|  | ||||
| @ -126,6 +126,7 @@ struct thread { | ||||
| 	unsigned long yield;		 /* yield time in microseconds */ | ||||
| 	const struct xref_threadsched *xref;   /* origin location */ | ||||
| 	pthread_mutex_t mtx;   /* mutex for thread.c functions */ | ||||
| 	bool ignore_timer_late; | ||||
| }; | ||||
| 
 | ||||
| #ifdef _FRR_ATTRIBUTE_PRINTFRR | ||||
| @ -285,6 +286,11 @@ extern bool thread_is_scheduled(struct thread *thread); | ||||
| /* Debug signal mask */ | ||||
| void debug_signals(const sigset_t *sigs); | ||||
| 
 | ||||
| static inline void thread_ignore_late_timer(struct thread *thread) | ||||
| { | ||||
| 	thread->ignore_timer_late = true; | ||||
| } | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
|  | ||||
| @ -135,10 +135,11 @@ static int work_queue_schedule(struct work_queue *wq, unsigned int delay) | ||||
| 		/* Schedule timer if there's a delay, otherwise just schedule
 | ||||
| 		 * as an 'event' | ||||
| 		 */ | ||||
| 		if (delay > 0) | ||||
| 		if (delay > 0) { | ||||
| 			thread_add_timer_msec(wq->master, work_queue_run, wq, | ||||
| 					      delay, &wq->thread); | ||||
| 		else | ||||
| 			thread_ignore_late_timer(wq->thread); | ||||
| 		} else | ||||
| 			thread_add_event(wq->master, work_queue_run, wq, 0, | ||||
| 					 &wq->thread); | ||||
| 
 | ||||
|  | ||||
| @ -91,9 +91,12 @@ static int if_zebra_speed_update(struct thread *thread) | ||||
| 		changed = true; | ||||
| 	} | ||||
| 
 | ||||
| 	if (changed || new_speed == UINT32_MAX) | ||||
| 	if (changed || new_speed == UINT32_MAX) { | ||||
| 		thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 5, | ||||
| 				 &zif->speed_update); | ||||
| 		thread_ignore_late_timer(zif->speed_update); | ||||
| 	} | ||||
| 
 | ||||
| 	return 1; | ||||
| } | ||||
| 
 | ||||
| @ -187,6 +190,8 @@ static int if_zebra_new_hook(struct interface *ifp) | ||||
| 	 */ | ||||
| 	thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 15, | ||||
| 			 &zebra_if->speed_update); | ||||
| 	thread_ignore_late_timer(zebra_if->speed_update); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| @ -1074,6 +1079,7 @@ void if_up(struct interface *ifp) | ||||
| 
 | ||||
| 	thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 0, | ||||
| 			 &zif->speed_update); | ||||
| 	thread_ignore_late_timer(zif->speed_update); | ||||
| } | ||||
| 
 | ||||
| /* Interface goes down.  We have to manage different behavior of based
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Jafar Al-Gharaibeh
						Jafar Al-Gharaibeh