From 2f9db11bd8d97512386339926ecc91bfc601cd53 Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Fri, 22 May 2020 16:12:38 -0400 Subject: [PATCH] bgpd: avoid spamming bgp_accept() on bad vrf sock bgp_accept() gets called over and over again when a VRF device is deleted out from under a bgp listener socket that is bound to it. Prevent this by noting the error and cancelling ourselves, allowing the vrf status code to clean up the mess when it receives word about the change from Zebra. Signed-off-by: Quentin Young --- bgpd/bgp_network.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/bgpd/bgp_network.c b/bgpd/bgp_network.c index f35528477f..00cc1f67a1 100644 --- a/bgpd/bgp_network.c +++ b/bgpd/bgp_network.c @@ -360,10 +360,13 @@ static int bgp_accept(struct thread *thread) sockunion_init(&su); + bgp = bgp_lookup_by_name(listener->name); + /* Register accept thread. */ accept_sock = THREAD_FD(thread); if (accept_sock < 0) { - flog_err_sys(EC_LIB_SOCKET, "accept_sock is negative value %d", + flog_err_sys(EC_LIB_SOCKET, + "[Error] BGP accept socket fd is negative: %d", accept_sock); return -1; } @@ -374,10 +377,37 @@ static int bgp_accept(struct thread *thread) /* Accept client connection. */ bgp_sock = sockunion_accept(accept_sock, &su); + int save_errno = errno; if (bgp_sock < 0) { - flog_err_sys(EC_LIB_SOCKET, - "[Error] BGP socket accept failed (%s)", - safe_strerror(errno)); + if (save_errno == EINVAL) { + struct vrf *vrf = + bgp ? vrf_lookup_by_id(bgp->vrf_id) : NULL; + + /* + * It appears that sometimes, when VRFs are deleted on + * the system, it takes a little while for us to get + * notified about that. In the meantime we endlessly + * loop on accept(), because the socket, having been + * bound to a now-deleted VRF device, is in some weird + * state which causes accept() to fail. + * + * To avoid this, if we see accept() fail with EINVAL, + * we cancel ourselves and trust that when the VRF + * deletion notification comes in the event handler for + * that will take care of cleaning us up. + */ + flog_err_sys( + EC_LIB_SOCKET, + "[Error] accept() failed with error \"%s\" on BGP listener socket %d for BGP instance in VRF \"%s\"; refreshing socket", + safe_strerror(save_errno), accept_sock, + VRF_LOGNAME(vrf)); + THREAD_OFF(listener->thread); + } else { + flog_err_sys( + EC_LIB_SOCKET, + "[Error] BGP socket accept failed (%s); retrying", + safe_strerror(save_errno)); + } return -1; } set_nonblocking(bgp_sock); @@ -888,7 +918,7 @@ void bgp_close_vrf_socket(struct bgp *bgp) for (ALL_LIST_ELEMENTS(bm->listen_sockets, node, next, listener)) { if (listener->bgp == bgp) { - thread_cancel(listener->thread); + THREAD_OFF(listener->thread); close(listener->fd); listnode_delete(bm->listen_sockets, listener); XFREE(MTYPE_BGP_LISTENER, listener->name);