From a2032324643e70aa10e57c6de9a4cf9357d51592 Mon Sep 17 00:00:00 2001 From: Rafael Zalamena Date: Fri, 17 Jul 2020 09:19:29 -0300 Subject: [PATCH 1/4] zebra,fpm: fix dead lock on close during startup Serialize the `fpm_reconnect` function by only allowing one part of our code to call it, then make sure all zebra threads executions are done before attempting to close and reset the output stream. Signed-off-by: Rafael Zalamena --- zebra/dplane_fpm_nl.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index ef208bdc83..47d359149f 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -149,8 +149,14 @@ enum fpm_nl_events { FNE_RESET_COUNTERS, /* Toggle next hop group feature. */ FNE_TOGGLE_NHG, + /* Reconnect request by our own code to avoid races. */ + FNE_INTERNAL_RECONNECT, }; +#define FPM_RECONNECT(fnc) \ + thread_add_event((fnc)->fthread->master, fpm_process_event, (fnc), \ + FNE_INTERNAL_RECONNECT, &(fnc)->t_event) + /* * Prototypes. */ @@ -428,7 +434,18 @@ static int fpm_connect(struct thread *t); static void fpm_reconnect(struct fpm_nl_ctx *fnc) { - /* Grab the lock to empty the stream and stop the zebra thread. */ + /* Cancel all zebra threads first. */ + thread_cancel_async(zrouter.master, &fnc->t_nhgreset, NULL); + thread_cancel_async(zrouter.master, &fnc->t_nhgwalk, NULL); + thread_cancel_async(zrouter.master, &fnc->t_ribreset, NULL); + thread_cancel_async(zrouter.master, &fnc->t_ribwalk, NULL); + thread_cancel_async(zrouter.master, &fnc->t_rmacreset, NULL); + thread_cancel_async(zrouter.master, &fnc->t_rmacwalk, NULL); + + /* + * Grab the lock to empty the streams (data plane might try to + * enqueue updates while we are closing). + */ frr_mutex_lock_autounlock(&fnc->obuf_mutex); /* Avoid calling close on `-1`. */ @@ -442,13 +459,6 @@ static void fpm_reconnect(struct fpm_nl_ctx *fnc) THREAD_OFF(fnc->t_read); THREAD_OFF(fnc->t_write); - thread_cancel_async(zrouter.master, &fnc->t_nhgreset, NULL); - thread_cancel_async(zrouter.master, &fnc->t_nhgwalk, NULL); - thread_cancel_async(zrouter.master, &fnc->t_ribreset, NULL); - thread_cancel_async(zrouter.master, &fnc->t_ribwalk, NULL); - thread_cancel_async(zrouter.master, &fnc->t_rmacreset, NULL); - thread_cancel_async(zrouter.master, &fnc->t_rmacwalk, NULL); - /* FPM is disabled, don't attempt to connect. */ if (fnc->disabled) return; @@ -472,7 +482,7 @@ static int fpm_read(struct thread *t) if (IS_ZEBRA_DEBUG_FPM) zlog_debug("%s: connection closed", __func__); - fpm_reconnect(fnc); + FPM_RECONNECT(fnc); return 0; } if (rv == -1) { @@ -484,7 +494,7 @@ static int fpm_read(struct thread *t) memory_order_relaxed); zlog_warn("%s: connection failure: %s", __func__, strerror(errno)); - fpm_reconnect(fnc); + FPM_RECONNECT(fnc); return 0; } stream_reset(fnc->ibuf); @@ -525,7 +535,7 @@ static int fpm_write(struct thread *t) &fnc->counters.connection_errors, 1, memory_order_relaxed); - fpm_reconnect(fnc); + FPM_RECONNECT(fnc); return 0; } @@ -589,8 +599,9 @@ static int fpm_write(struct thread *t) memory_order_relaxed); zlog_warn("%s: connection failure: %s", __func__, strerror(errno)); - fpm_reconnect(fnc); - break; + + FPM_RECONNECT(fnc); + return 0; } /* Account all bytes sent. */ @@ -1174,6 +1185,10 @@ static int fpm_process_event(struct thread *t) fpm_reconnect(fnc); break; + case FNE_INTERNAL_RECONNECT: + fpm_reconnect(fnc); + break; + default: if (IS_ZEBRA_DEBUG_FPM) zlog_debug("%s: unhandled event %d", __func__, event); From e1afb97fddcda0b19e684759fc239b1b0ae8517a Mon Sep 17 00:00:00 2001 From: Rafael Zalamena Date: Fri, 17 Jul 2020 14:37:55 -0300 Subject: [PATCH 2/4] zebra,fpm: fix input handling Two important fixes: * `stream_read_try` does a dirty trick and converts the `-1` return to `-2` when errno is `EAGAIN`, `EWOULDBLOCK` or `EINTR`. * Don't enable reads until the connection is complete. Signed-off-by: Rafael Zalamena --- zebra/dplane_fpm_nl.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index 47d359149f..e5a0ebfd46 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -475,6 +475,13 @@ static int fpm_read(struct thread *t) /* Let's ignore the input at the moment. */ rv = stream_read_try(fnc->ibuf, fnc->socket, STREAM_WRITEABLE(fnc->ibuf)); + /* We've got an interruption. */ + if (rv == -2) { + /* Schedule next read. */ + thread_add_read(fnc->fthread->master, fpm_read, fnc, + fnc->socket, &fnc->t_read); + return 0; + } if (rv == 0) { atomic_fetch_add_explicit(&fnc->counters.connection_closes, 1, memory_order_relaxed); @@ -486,10 +493,6 @@ static int fpm_read(struct thread *t) return 0; } if (rv == -1) { - if (errno == EAGAIN || errno == EWOULDBLOCK - || errno == EINTR) - return 0; - atomic_fetch_add_explicit(&fnc->counters.connection_errors, 1, memory_order_relaxed); zlog_warn("%s: connection failure: %s", __func__, @@ -541,6 +544,10 @@ static int fpm_write(struct thread *t) fnc->connecting = false; + /* Permit receiving messages now. */ + thread_add_read(fnc->fthread->master, fpm_read, fnc, + fnc->socket, &fnc->t_read); + /* * Walk the route tables to send old information before starting * to send updated information. @@ -672,8 +679,9 @@ static int fpm_connect(struct thread *t) fnc->connecting = (errno == EINPROGRESS); fnc->socket = sock; - thread_add_read(fnc->fthread->master, fpm_read, fnc, sock, - &fnc->t_read); + if (!fnc->connecting) + thread_add_read(fnc->fthread->master, fpm_read, fnc, sock, + &fnc->t_read); thread_add_write(fnc->fthread->master, fpm_write, fnc, sock, &fnc->t_write); From 55eb9d4d7d28b83f131828110c73be5cc2e9f962 Mon Sep 17 00:00:00 2001 From: Rafael Zalamena Date: Fri, 17 Jul 2020 11:37:38 -0300 Subject: [PATCH 3/4] zebra,fpm: fix race on completion detection Zebra runs on a different thread than FPM, so we need to synchronize them by using events. While here, implement completion detection for all kinds of walk. Signed-off-by: Rafael Zalamena --- zebra/dplane_fpm_nl.c | 54 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index e5a0ebfd46..60e8eb5ad6 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -72,6 +72,7 @@ struct fpm_nl_ctx { int socket; bool disabled; bool connecting; + bool nhg_complete; bool rib_complete; bool rmac_complete; bool use_nhg; @@ -151,12 +152,23 @@ enum fpm_nl_events { FNE_TOGGLE_NHG, /* Reconnect request by our own code to avoid races. */ FNE_INTERNAL_RECONNECT, + + /* Next hop groups walk finished. */ + FNE_NHG_FINISHED, + /* RIB walk finished. */ + FNE_RIB_FINISHED, + /* RMAC walk finished. */ + FNE_RMAC_FINISHED, }; #define FPM_RECONNECT(fnc) \ thread_add_event((fnc)->fthread->master, fpm_process_event, (fnc), \ FNE_INTERNAL_RECONNECT, &(fnc)->t_event) +#define WALK_FINISH(fnc, ev) \ + thread_add_event((fnc)->fthread->master, fpm_process_event, (fnc), \ + (ev), NULL) + /* * Prototypes. */ @@ -923,10 +935,11 @@ static int fpm_nhg_send(struct thread *t) dplane_ctx_fini(&fna.ctx); /* We are done sending next hops, lets install the routes now. */ - if (fna.complete) + if (fna.complete) { + WALK_FINISH(fnc, FNE_NHG_FINISHED); thread_add_timer(zrouter.master, fpm_rib_send, fnc, 0, &fnc->t_ribwalk); - else /* Otherwise reschedule next hop group again. */ + } else /* Otherwise reschedule next hop group again. */ thread_add_timer(zrouter.master, fpm_nhg_send, fnc, 0, &fnc->t_nhgwalk); @@ -982,7 +995,7 @@ static int fpm_rib_send(struct thread *t) dplane_ctx_fini(&ctx); /* All RIB routes sent! */ - fnc->rib_complete = true; + WALK_FINISH(fnc, FNE_RIB_FINISHED); return 0; } @@ -994,6 +1007,7 @@ struct fpm_rmac_arg { struct zebra_dplane_ctx *ctx; struct fpm_nl_ctx *fnc; zebra_l3vni_t *zl3vni; + bool complete; }; static void fpm_enqueue_rmac_table(struct hash_bucket *backet, void *arg) @@ -1007,7 +1021,7 @@ static void fpm_enqueue_rmac_table(struct hash_bucket *backet, void *arg) bool sticky; /* Entry already sent. */ - if (CHECK_FLAG(zrmac->flags, ZEBRA_MAC_FPM_SENT)) + if (CHECK_FLAG(zrmac->flags, ZEBRA_MAC_FPM_SENT) || !fra->complete) return; sticky = !!CHECK_FLAG(zrmac->flags, @@ -1023,6 +1037,7 @@ static void fpm_enqueue_rmac_table(struct hash_bucket *backet, void *arg) if (fpm_nl_enqueue(fra->fnc, fra->ctx) == -1) { thread_add_timer(zrouter.master, fpm_rmac_send, fra->fnc, 1, &fra->fnc->t_rmacwalk); + fra->complete = false; } } @@ -1041,9 +1056,14 @@ static int fpm_rmac_send(struct thread *t) fra.fnc = THREAD_ARG(t); fra.ctx = dplane_ctx_alloc(); + fra.complete = true; hash_iterate(zrouter.l3vni_table, fpm_enqueue_l3vni_table, &fra); dplane_ctx_fini(&fra.ctx); + /* RMAC walk completed. */ + if (fra.complete) + WALK_FINISH(fra.fnc, FNE_RMAC_FINISHED); + return 0; } @@ -1060,6 +1080,9 @@ static void fpm_nhg_reset_cb(struct hash_bucket *bucket, void *arg) static int fpm_nhg_reset(struct thread *t) { + struct fpm_nl_ctx *fnc = THREAD_ARG(t); + + fnc->nhg_complete = false; hash_iterate(zrouter.nhgs_id, fpm_nhg_reset_cb, NULL); return 0; } @@ -1111,6 +1134,9 @@ static void fpm_unset_l3vni_table(struct hash_bucket *backet, void *arg) static int fpm_rmac_reset(struct thread *t) { + struct fpm_nl_ctx *fnc = THREAD_ARG(t); + + fnc->rmac_complete = false; hash_iterate(zrouter.l3vni_table, fpm_unset_l3vni_table, NULL); return 0; @@ -1197,6 +1223,26 @@ static int fpm_process_event(struct thread *t) fpm_reconnect(fnc); break; + case FNE_NHG_FINISHED: + if (IS_ZEBRA_DEBUG_FPM) + zlog_debug("%s: next hop groups walk finished", + __func__); + + fnc->nhg_complete = true; + break; + case FNE_RIB_FINISHED: + if (IS_ZEBRA_DEBUG_FPM) + zlog_debug("%s: RIB walk finished", __func__); + + fnc->rib_complete = true; + break; + case FNE_RMAC_FINISHED: + if (IS_ZEBRA_DEBUG_FPM) + zlog_debug("%s: RMAC walk finished", __func__); + + fnc->rmac_complete = true; + break; + default: if (IS_ZEBRA_DEBUG_FPM) zlog_debug("%s: unhandled event %d", __func__, event); From e41e0f8135cb9a4e0878d55c2f75cbc4e0499c90 Mon Sep 17 00:00:00 2001 From: Rafael Zalamena Date: Fri, 17 Jul 2020 16:15:04 -0300 Subject: [PATCH 4/4] zebra,fpm: serialize zebra table walks We were not getting any benefits from attempting to walk all tables at the same time and it made debugging harder, so lets execute one table walk per time. Signed-off-by: Rafael Zalamena --- zebra/dplane_fpm_nl.c | 53 +++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index 60e8eb5ad6..c81d451693 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -559,28 +559,6 @@ static int fpm_write(struct thread *t) /* Permit receiving messages now. */ thread_add_read(fnc->fthread->master, fpm_read, fnc, fnc->socket, &fnc->t_read); - - /* - * Walk the route tables to send old information before starting - * to send updated information. - * - * NOTE 1: - * RIB table walk is called after the next group table walk - * ends. - * - * NOTE 2: - * Don't attempt to go through next hop group table if we were - * explictly told to not use it. - */ - if (fnc->use_nhg) - thread_add_timer(zrouter.master, fpm_nhg_send, fnc, 0, - &fnc->t_nhgwalk); - else - thread_add_timer(zrouter.master, fpm_rib_send, fnc, 0, - &fnc->t_ribwalk); - - thread_add_timer(zrouter.master, fpm_rmac_send, fnc, 0, - &fnc->t_rmacwalk); } frr_mutex_lock_autounlock(&fnc->obuf_mutex); @@ -698,12 +676,12 @@ static int fpm_connect(struct thread *t) &fnc->t_write); /* Mark all routes as unsent. */ - thread_add_timer(zrouter.master, fpm_nhg_reset, fnc, 0, - &fnc->t_nhgreset); - thread_add_timer(zrouter.master, fpm_rib_reset, fnc, 0, - &fnc->t_ribreset); - thread_add_timer(zrouter.master, fpm_rmac_reset, fnc, 0, - &fnc->t_rmacreset); + if (fnc->use_nhg) + thread_add_timer(zrouter.master, fpm_nhg_reset, fnc, 0, + &fnc->t_nhgreset); + else + thread_add_timer(zrouter.master, fpm_rib_reset, fnc, 0, + &fnc->t_ribreset); return 0; } @@ -937,8 +915,8 @@ static int fpm_nhg_send(struct thread *t) /* We are done sending next hops, lets install the routes now. */ if (fna.complete) { WALK_FINISH(fnc, FNE_NHG_FINISHED); - thread_add_timer(zrouter.master, fpm_rib_send, fnc, 0, - &fnc->t_ribwalk); + thread_add_timer(zrouter.master, fpm_rib_reset, fnc, 0, + &fnc->t_ribreset); } else /* Otherwise reschedule next hop group again. */ thread_add_timer(zrouter.master, fpm_nhg_send, fnc, 0, &fnc->t_nhgwalk); @@ -997,6 +975,10 @@ static int fpm_rib_send(struct thread *t) /* All RIB routes sent! */ WALK_FINISH(fnc, FNE_RIB_FINISHED); + /* Schedule next event: RMAC reset. */ + thread_add_event(zrouter.master, fpm_rmac_reset, fnc, 0, + &fnc->t_rmacreset); + return 0; } @@ -1084,6 +1066,10 @@ static int fpm_nhg_reset(struct thread *t) fnc->nhg_complete = false; hash_iterate(zrouter.nhgs_id, fpm_nhg_reset_cb, NULL); + + /* Schedule next step: send next hop groups. */ + thread_add_event(zrouter.master, fpm_nhg_send, fnc, 0, &fnc->t_nhgwalk); + return 0; } @@ -1112,6 +1098,9 @@ static int fpm_rib_reset(struct thread *t) } } + /* Schedule next step: send RIB routes. */ + thread_add_event(zrouter.master, fpm_rib_send, fnc, 0, &fnc->t_ribwalk); + return 0; } @@ -1139,6 +1128,10 @@ static int fpm_rmac_reset(struct thread *t) fnc->rmac_complete = false; hash_iterate(zrouter.l3vni_table, fpm_unset_l3vni_table, NULL); + /* Schedule next event: send RMAC entries. */ + thread_add_event(zrouter.master, fpm_rmac_send, fnc, 0, + &fnc->t_rmacwalk); + return 0; }