From 59711f1063d763649f53219ef31d7325c007cdeb Mon Sep 17 00:00:00 2001
From: Satheesh Kumar K
Date: Tue, 23 Jul 2019 22:00:08 -0700
Subject: [PATCH 01/13] pimd: Zebra Route Updates are missing at pim

PIM registers for nexthop tracking with Zebra when an RP is configured
statically via "ip pim rp"; Zebra then advertises the route information
for that prefix. But while processing this information, PIM currently
drops any path whose nexthop interface is not PIM-enabled.

If none of the paths are PIM-enabled, PIM has no valid RPF interface
and stays stuck in this state until the next update for the route,
which can happen only if there is a routing change at Zebra for this
prefix. Until that time PIM will not have any valid outgoing interface.
This issue was mainly seen during node bootup scenarios.

Fix Proposed
=============
Store the paths in the PIM PNC data structure even though they are not
PIM-enabled. This is safe because PIM checks for a multicast-enabled
interface anyway while selecting the interface.

Tests Performed
===============
1. Verified the failing test case.
2. Disabled PIM on the selected outgoing interface; PIM chooses another
   path when the neighbor on this interface goes down.
3. Re-configured PIM on the interface disabled above; PIM stays with
   the old nexthop since it is still valid.

Signed-off-by: Satheesh Kumar K
---
 pimd/pim_nht.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pimd/pim_nht.c b/pimd/pim_nht.c
index 65ea858cb6..39dc8ad2fa 100644
--- a/pimd/pim_nht.c
+++ b/pimd/pim_nht.c
@@ -842,6 +842,14 @@ int pim_parse_nexthop_update(ZAPI_CALLBACK_ARGS)
 		}
 
 		if (!ifp->info) {
+			/*
+			 * Though multicast is not enabled on this
+			 * interface, store it in the database; otherwise
+			 * we may miss this update. This will not cause
+			 * any issue, because while choosing the path we
+			 * are omitting the interfaces which are not
+			 * multicast enabled
+			 */
 			if (PIM_DEBUG_PIM_NHT) {
 				char buf[NEXTHOP_STRLEN];
 
@@ -853,8 +861,6 @@
 					nexthop2str(nexthop, buf,
 						    sizeof(buf)));
 			}
 
-			nexthop_free(nexthop);
-			continue;
 		}
 
 		if (nhlist_tail) {

From 5b4d431d2367fe5db8725efca5494f957b7a8b76 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar K
Date: Tue, 23 Jul 2019 22:03:06 -0700
Subject: [PATCH 02/13] pimd: PIM Core seen during NH processing

A PIM core dump was seen during EVPN PIM testing because of a NULL
interface pointer.

Signed-off-by: Satheesh Kumar K
---
 pimd/pim_neighbor.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pimd/pim_neighbor.c b/pimd/pim_neighbor.c
index a63b09fc1f..722ecb2a72 100644
--- a/pimd/pim_neighbor.c
+++ b/pimd/pim_neighbor.c
@@ -424,10 +424,11 @@ struct pim_neighbor *pim_neighbor_find_by_secondary(struct interface *ifp,
 	struct pim_neighbor *neigh;
 	struct prefix *p;
 
-	pim_ifp = ifp->info;
-	if (!pim_ifp)
+	if (!ifp || !ifp->info)
 		return NULL;
 
+	pim_ifp = ifp->info;
+
 	for (ALL_LIST_ELEMENTS_RO(pim_ifp->pim_neighbor_list, node, neigh)) {
 		for (ALL_LIST_ELEMENTS_RO(neigh->prefix_list, pnode, p)) {
 			if (prefix_same(p, src))

From 5e4f10b1e00ad63ccec5bb91b0f0b18141265231 Mon Sep 17 00:00:00 2001
From: David Lamparter
Date: Fri, 21 Jun 2019 07:45:50 +0200
Subject: [PATCH 03/13] lib: make SA_SIGINFO use unconditional

This has been in POSIX for ages & there's no OS left that doesn't
support this.
Signed-off-by: David Lamparter --- lib/log.c | 4 ---- lib/sigevent.c | 42 +++--------------------------------------- 2 files changed, 3 insertions(+), 43 deletions(-) diff --git a/lib/log.c b/lib/log.c index 48ee0f6adb..399d1af0c4 100644 --- a/lib/log.c +++ b/lib/log.c @@ -561,9 +561,7 @@ static void crash_write(struct fbuf *fb, char *msgstart) void zlog_signal(int signo, const char *action, void *siginfo_v, void *program_counter) { -#ifdef SA_SIGINFO siginfo_t *siginfo = siginfo_v; -#endif time_t now; char buf[sizeof("DEFAULT: Received signal S at T (si_addr 0xP, PC 0xP); aborting...") + 100]; @@ -577,7 +575,6 @@ void zlog_signal(int signo, const char *action, void *siginfo_v, msgstart = fb.pos; bprintfrr(&fb, "Received signal %d at %lld", signo, (long long)now); -#ifdef SA_SIGINFO if (program_counter) bprintfrr(&fb, " (si_addr 0x%tx, PC 0x%tx)", (ptrdiff_t)siginfo->si_addr, @@ -585,7 +582,6 @@ void zlog_signal(int signo, const char *action, void *siginfo_v, else bprintfrr(&fb, " (si_addr 0x%tx)", (ptrdiff_t)siginfo->si_addr); -#endif /* SA_SIGINFO */ bprintfrr(&fb, "; %s\n", action); crash_write(&fb, msgstart); diff --git a/lib/sigevent.c b/lib/sigevent.c index d02b074223..fcd85d0d43 100644 --- a/lib/sigevent.c +++ b/lib/sigevent.c @@ -24,7 +24,6 @@ #include #include -#ifdef SA_SIGINFO #ifdef HAVE_UCONTEXT_H #ifdef GNU_LINUX /* get REG_EIP from ucontext.h */ @@ -34,7 +33,6 @@ #endif /* GNU_LINUX */ #include #endif /* HAVE_UCONTEXT_H */ -#endif /* SA_SIGINFO */ /* master signals descriptor struct */ @@ -158,8 +156,6 @@ static int signal_set(int signo) return 0; } -#ifdef SA_SIGINFO - /* XXX This function should be enhanced to support more platforms (it currently works only on Linux/x86). */ static void *program_counter(void *context) @@ -199,41 +195,19 @@ static void *program_counter(void *context) return NULL; } -#endif /* SA_SIGINFO */ - static void __attribute__((noreturn)) -exit_handler(int signo -#ifdef SA_SIGINFO - , - siginfo_t *siginfo, void *context -#endif - ) +exit_handler(int signo, siginfo_t *siginfo, void *context) { -#ifndef SA_SIGINFO - void *siginfo = NULL; - void *pc = NULL; -#else void *pc = program_counter(context); -#endif zlog_signal(signo, "exiting...", siginfo, pc); _exit(128 + signo); } static void __attribute__((noreturn)) -core_handler(int signo -#ifdef SA_SIGINFO - , - siginfo_t *siginfo, void *context -#endif - ) +core_handler(int signo, siginfo_t *siginfo, void *context) { -#ifndef SA_SIGINFO - void *siginfo = NULL; - void *pc = NULL; -#else void *pc = program_counter(context); -#endif /* make sure we don't hang in here. default for SIGALRM is terminate. * - if we're in backtrace for more than a second, abort. */ @@ -290,12 +264,7 @@ static void trap_default_signals(void) static const struct { const int *sigs; unsigned int nsigs; - void (*handler)(int signo -#ifdef SA_SIGINFO - , - siginfo_t *info, void *context -#endif - ); + void (*handler)(int signo, siginfo_t *info, void *context); } sigmap[] = { {core_signals, array_size(core_signals), core_handler}, {exit_signals, array_size(exit_signals), exit_handler}, @@ -316,15 +285,10 @@ static void trap_default_signals(void) act.sa_handler = SIG_IGN; act.sa_flags = 0; } else { -#ifdef SA_SIGINFO /* Request extra arguments to signal * handler. 
*/ act.sa_sigaction = sigmap[i].handler; act.sa_flags = SA_SIGINFO; -#else - act.sa_handler = sigmap[i].handler; - act.sa_flags = 0; -#endif #ifdef SA_RESETHAND /* don't try to print backtraces * recursively */ From 6046b690b53a5e1c14ccf261304f5aa1f53546ae Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Wed, 19 Jun 2019 12:52:38 +0200 Subject: [PATCH 04/13] lib/seqlock: avoid syscalls in no-waiter cases When we have no contention on the seqlock, we shouldn't incur the cost of syscalls. Signed-off-by: David Lamparter --- lib/seqlock.c | 56 +++++++++++++++++++++++++++------------ lib/seqlock.h | 10 +++++-- tests/lib/test_atomlist.c | 10 +++---- tests/lib/test_seqlock.c | 34 ++++++++++++------------ 4 files changed, 69 insertions(+), 41 deletions(-) diff --git a/lib/seqlock.c b/lib/seqlock.c index 223d14952c..32dff5173d 100644 --- a/lib/seqlock.c +++ b/lib/seqlock.c @@ -103,16 +103,23 @@ void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val) seqlock_assert_valid(val); wait_prep(sqlo); - while (1) { - cur = atomic_load_explicit(&sqlo->pos, memory_order_acquire); - if (!(cur & 1)) - break; - cal = cur - val - 1; + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + + while (cur & SEQLOCK_HELD) { + cal = SEQLOCK_VAL(cur) - val - 1; assert(cal < 0x40000000 || cal > 0xc0000000); if (cal < 0x80000000) break; - wait_once(sqlo, cur); + if ((cur & SEQLOCK_WAITERS) + || atomic_compare_exchange_weak_explicit( + &sqlo->pos, &cur, cur | SEQLOCK_WAITERS, + memory_order_relaxed, memory_order_relaxed)) { + wait_once(sqlo, cur | SEQLOCK_WAITERS); + cur = atomic_load_explicit(&sqlo->pos, + memory_order_relaxed); + } + /* else: we failed to swap in cur because it just changed */ } wait_done(sqlo); } @@ -123,26 +130,32 @@ bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val) seqlock_assert_valid(val); - cur = atomic_load_explicit(&sqlo->pos, memory_order_acquire); - if (!(cur & 1)) + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + if (!(cur & SEQLOCK_HELD)) return 1; - cur -= val; + cur = SEQLOCK_VAL(cur) - val - 1; assert(cur < 0x40000000 || cur > 0xc0000000); return cur < 0x80000000; } void seqlock_acquire_val(struct seqlock *sqlo, seqlock_val_t val) { + seqlock_val_t prev; + seqlock_assert_valid(val); - atomic_store_explicit(&sqlo->pos, val, memory_order_release); - wait_poke(sqlo); + prev = atomic_exchange_explicit(&sqlo->pos, val, memory_order_relaxed); + if (prev & SEQLOCK_WAITERS) + wait_poke(sqlo); } void seqlock_release(struct seqlock *sqlo) { - atomic_store_explicit(&sqlo->pos, 0, memory_order_release); - wait_poke(sqlo); + seqlock_val_t prev; + + prev = atomic_exchange_explicit(&sqlo->pos, 0, memory_order_relaxed); + if (prev & SEQLOCK_WAITERS) + wait_poke(sqlo); } void seqlock_init(struct seqlock *sqlo) @@ -154,14 +167,23 @@ void seqlock_init(struct seqlock *sqlo) seqlock_val_t seqlock_cur(struct seqlock *sqlo) { - return atomic_load_explicit(&sqlo->pos, memory_order_acquire); + return SEQLOCK_VAL(atomic_load_explicit(&sqlo->pos, + memory_order_relaxed)); } seqlock_val_t seqlock_bump(struct seqlock *sqlo) { - seqlock_val_t val; + seqlock_val_t val, cur; - val = atomic_fetch_add_explicit(&sqlo->pos, 2, memory_order_release); - wait_poke(sqlo); + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + seqlock_assert_valid(cur); + + do { + val = SEQLOCK_VAL(cur) + SEQLOCK_INCR; + } while (!atomic_compare_exchange_weak_explicit(&sqlo->pos, &cur, val, + memory_order_relaxed, memory_order_relaxed)); + + if (cur & SEQLOCK_WAITERS) + wait_poke(sqlo); return 
val; } diff --git a/lib/seqlock.h b/lib/seqlock.h index eef05a4307..0f5dc47cd7 100644 --- a/lib/seqlock.h +++ b/lib/seqlock.h @@ -54,12 +54,18 @@ */ /* use sequentially increasing "ticket numbers". lowest bit will always - * be 1 to have a 'cleared' indication (i.e., counts 1,3,5,7,etc. ) + * be 1 to have a 'cleared' indication (i.e., counts 1,5,9,13,etc. ) + * 2nd lowest bit is used to indicate we have waiters. */ typedef _Atomic uint32_t seqlock_ctr_t; typedef uint32_t seqlock_val_t; -#define seqlock_assert_valid(val) assert(val & 1) +#define seqlock_assert_valid(val) assert((val) & SEQLOCK_HELD) +#define SEQLOCK_HELD (1U << 0) +#define SEQLOCK_WAITERS (1U << 1) +#define SEQLOCK_VAL(n) ((n) & ~SEQLOCK_WAITERS) +#define SEQLOCK_STARTVAL 1U +#define SEQLOCK_INCR 4U struct seqlock { /* always used */ diff --git a/tests/lib/test_atomlist.c b/tests/lib/test_atomlist.c index 249fff8edb..238ee9539e 100644 --- a/tests/lib/test_atomlist.c +++ b/tests/lib/test_atomlist.c @@ -253,7 +253,7 @@ static void *thr1func(void *arg) struct testrun *tr; for (tr = runs; tr; tr = tr->next) { - sv = seqlock_bump(&p->sqlo); + sv = seqlock_bump(&p->sqlo) - SEQLOCK_INCR; seqlock_wait(&sqlo, sv); tr->func(offset); @@ -288,14 +288,14 @@ static void run_tr(struct testrun *tr) size_t c = 0, s = 0, n = 0; struct item *item, *prev, dummy; - printf("[%02u] %35s %s\n", seqlock_cur(&sqlo) >> 1, "", desc); + printf("[%02u] %35s %s\n", seqlock_cur(&sqlo) >> 2, "", desc); fflush(stdout); if (tr->prefill != NOCLEAR) clear_list(tr->prefill); monotime(&tv); - sv = seqlock_bump(&sqlo); + sv = seqlock_bump(&sqlo) - SEQLOCK_INCR; for (size_t i = 0; i < NTHREADS; i++) { seqlock_wait(&thr[i].sqlo, seqlock_cur(&sqlo)); s += thr[i].counter; @@ -325,7 +325,7 @@ static void run_tr(struct testrun *tr) assert(c == alist_count(&ahead)); } printf("\033[1A[%02u] %9"PRId64"us c=%5zu s=%5zu n=%5zu %s\n", - sv >> 1, delta, c, s, n, desc); + sv >> 2, delta, c, s, n, desc); } #ifdef BASIC_TESTS @@ -381,7 +381,7 @@ int main(int argc, char **argv) basic_tests(); seqlock_init(&sqlo); - seqlock_acquire_val(&sqlo, 1); + seqlock_acquire_val(&sqlo, SEQLOCK_STARTVAL); for (i = 0; i < NTHREADS; i++) { seqlock_init(&thr[i].sqlo); diff --git a/tests/lib/test_seqlock.c b/tests/lib/test_seqlock.c index 9cc6f80702..639c2bdc2b 100644 --- a/tests/lib/test_seqlock.c +++ b/tests/lib/test_seqlock.c @@ -66,20 +66,20 @@ static void *thr1func(void *arg) seqlock_wait(&sqlo, 1); writestr("thr1 @1\n"); - seqlock_wait(&sqlo, 3); - writestr("thr1 @3\n"); - seqlock_wait(&sqlo, 5); writestr("thr1 @5\n"); - seqlock_wait(&sqlo, 7); - writestr("thr1 @7\n"); - seqlock_wait(&sqlo, 9); writestr("thr1 @9\n"); - seqlock_wait(&sqlo, 11); - writestr("thr1 @11\n"); + seqlock_wait(&sqlo, 13); + writestr("thr1 @13\n"); + + seqlock_wait(&sqlo, 17); + writestr("thr1 @17\n"); + + seqlock_wait(&sqlo, 21); + writestr("thr1 @21\n"); return NULL; } @@ -95,11 +95,11 @@ int main(int argc, char **argv) assert(seqlock_cur(&sqlo) == 1); assert(seqlock_bump(&sqlo) == 1); - assert(seqlock_cur(&sqlo) == 3); - assert(seqlock_bump(&sqlo) == 3); + assert(seqlock_cur(&sqlo) == 5); assert(seqlock_bump(&sqlo) == 5); - assert(seqlock_bump(&sqlo) == 7); - assert(seqlock_cur(&sqlo) == 9); + assert(seqlock_bump(&sqlo) == 9); + assert(seqlock_bump(&sqlo) == 13); + assert(seqlock_cur(&sqlo) == 17); assert(seqlock_held(&sqlo)); seqlock_release(&sqlo); @@ -108,16 +108,16 @@ int main(int argc, char **argv) pthread_create(&thr1, NULL, thr1func, NULL); sleep(1); - writestr("main @3\n"); - seqlock_acquire_val(&sqlo, 3); 
+ writestr("main @5\n"); + seqlock_acquire_val(&sqlo, 5); sleep(2); - writestr("main @5\n"); + writestr("main @9\n"); seqlock_bump(&sqlo); sleep(1); - writestr("main @9\n"); - seqlock_acquire_val(&sqlo, 9); + writestr("main @17\n"); + seqlock_acquire_val(&sqlo, 17); sleep(1); writestr("main @release\n"); From 2a5e62359f737b233a8b06731ce60f915a754d72 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Wed, 19 Jun 2019 12:52:38 +0200 Subject: [PATCH 05/13] lib/seqlock: add timed-wait operation seqlock_timedwait() puts an (absolute, CLOCK_MONOTONIC) deadline on how long we wait. The RCU code uses this for its watchdog implementation. Signed-off-by: David Lamparter --- lib/seqlock.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++-- lib/seqlock.h | 2 + 2 files changed, 107 insertions(+), 3 deletions(-) diff --git a/lib/seqlock.c b/lib/seqlock.c index 32dff5173d..6ec9bbcf5f 100644 --- a/lib/seqlock.c +++ b/lib/seqlock.c @@ -25,6 +25,7 @@ #include "config.h" #endif +#include #include #include #include @@ -40,24 +41,31 @@ #include #include -static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout, - void *addr2, int val3) +static long sys_futex(void *addr1, int op, int val1, + const struct timespec *timeout, void *addr2, int val3) { return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3); } #define wait_once(sqlo, val) \ sys_futex((int *)&sqlo->pos, FUTEX_WAIT, (int)val, NULL, NULL, 0) +#define wait_time(sqlo, val, time, reltime) \ + sys_futex((int *)&sqlo->pos, FUTEX_WAIT_BITSET, (int)val, time, \ + NULL, ~0U) #define wait_poke(sqlo) \ sys_futex((int *)&sqlo->pos, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) #elif defined(HAVE_SYNC_OPENBSD_FUTEX) -/* OpenBSD variant of the above. untested, not upstream in OpenBSD. */ +/* OpenBSD variant of the above. */ #include #include +#define TIME_RELATIVE 1 + #define wait_once(sqlo, val) \ futex((int *)&sqlo->pos, FUTEX_WAIT, (int)val, NULL, NULL, 0) +#define wait_time(sqlo, val, time, reltime) \ + futex((int *)&sqlo->pos, FUTEX_WAIT, (int)val, reltime, NULL, 0) #define wait_poke(sqlo) \ futex((int *)&sqlo->pos, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) @@ -67,6 +75,17 @@ static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout, #define wait_once(sqlo, val) \ _umtx_op((void *)&sqlo->pos, UMTX_OP_WAIT_UINT, val, NULL, NULL) +static int wait_time(struct seqlock *sqlo, uint32_t val, + const struct timespec *abstime, + const struct timespec *reltime) +{ + struct _umtx_time t; + t._flags = UMTX_ABSTIME; + t._clockid = CLOCK_MONOTONIC; + memcpy(&t._timeout, abstime, sizeof(t._timeout)); + return _umtx_op((void *)&sqlo->pos, UMTX_OP_WAIT_UINT, val, + (void *)(uintptr_t) sizeof(t), &t); +} #define wait_poke(sqlo) \ _umtx_op((void *)&sqlo->pos, UMTX_OP_WAKE, INT_MAX, NULL, NULL) @@ -74,12 +93,17 @@ static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout, /* generic version. used on *BSD, Solaris and OSX. 
 */
+#define TIME_ABS_REALTIME 1
+
 #define wait_init(sqlo) do { \
 		pthread_mutex_init(&sqlo->lock, NULL); \
 		pthread_cond_init(&sqlo->wake, NULL); \
 	} while (0)
 #define wait_prep(sqlo) pthread_mutex_lock(&sqlo->lock)
 #define wait_once(sqlo, val) pthread_cond_wait(&sqlo->wake, &sqlo->lock)
+#define wait_time(sqlo, val, time, reltime) \
+	pthread_cond_timedwait(&sqlo->wake, \
+			       &sqlo->lock, time)
 #define wait_done(sqlo) pthread_mutex_unlock(&sqlo->lock)
 #define wait_poke(sqlo) do { \
 		pthread_mutex_lock(&sqlo->lock); \
@@ -124,6 +148,84 @@ void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val)
 	wait_done(sqlo);
 }
 
+bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val,
+		       const struct timespec *abs_monotime_limit)
+{
+#if TIME_ABS_REALTIME
+#define time_arg1 &abs_rt
+#define time_arg2 NULL
+#define time_prep
+	struct timespec curmono, abs_rt;
+
+	clock_gettime(CLOCK_MONOTONIC, &curmono);
+	clock_gettime(CLOCK_REALTIME, &abs_rt);
+
+	abs_rt.tv_nsec += abs_monotime_limit->tv_nsec - curmono.tv_nsec;
+	if (abs_rt.tv_nsec < 0) {
+		abs_rt.tv_sec--;
+		abs_rt.tv_nsec += 1000000000;
+	} else if (abs_rt.tv_nsec >= 1000000000) {
+		abs_rt.tv_sec++;
+		abs_rt.tv_nsec -= 1000000000;
+	}
+	abs_rt.tv_sec += abs_monotime_limit->tv_sec - curmono.tv_sec;
+
+#elif TIME_RELATIVE
+	struct timespec reltime;
+
+#define time_arg1 abs_monotime_limit
+#define time_arg2 &reltime
+#define time_prep \
+	clock_gettime(CLOCK_MONOTONIC, &reltime); \
+	reltime.tv_sec = abs_monotime_limit->tv_sec - reltime.tv_sec; \
+	reltime.tv_nsec = abs_monotime_limit->tv_nsec - reltime.tv_nsec; \
+	if (reltime.tv_nsec < 0) { \
+		reltime.tv_sec--; \
+		reltime.tv_nsec += 1000000000; \
+	}
+#else
+#define time_arg1 abs_monotime_limit
+#define time_arg2 NULL
+#define time_prep
+#endif
+
+	bool ret = true;
+	seqlock_val_t cur, cal;
+
+	seqlock_assert_valid(val);
+
+	wait_prep(sqlo);
+	cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed);
+
+	while (cur & SEQLOCK_HELD) {
+		cal = SEQLOCK_VAL(cur) - val - 1;
+		assert(cal < 0x40000000 || cal > 0xc0000000);
+		if (cal < 0x80000000)
+			break;
+
+		if ((cur & SEQLOCK_WAITERS)
+		    || atomic_compare_exchange_weak_explicit(
+				&sqlo->pos, &cur, cur | SEQLOCK_WAITERS,
+				memory_order_relaxed, memory_order_relaxed)) {
+			int rv;
+
+			time_prep
+
+			rv = wait_time(sqlo, cur | SEQLOCK_WAITERS, time_arg1,
+				       time_arg2);
+			if (rv) {
+				ret = false;
+				break;
+			}
+			cur = atomic_load_explicit(&sqlo->pos,
						   memory_order_relaxed);
+		}
+	}
+	wait_done(sqlo);
+
+	return ret;
+}
+
 bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val)
 {
 	seqlock_val_t cur;
diff --git a/lib/seqlock.h b/lib/seqlock.h
index 0f5dc47cd7..3cce9ccf49 100644
--- a/lib/seqlock.h
+++ b/lib/seqlock.h
@@ -82,6 +82,8 @@ extern void seqlock_init(struct seqlock *sqlo);
 /* while (sqlo <= val) - wait until seqlock->pos > val, or seqlock unheld */
 extern void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val);
+extern bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val,
+			      const struct timespec *abs_monotime_limit);
 extern bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val);
 
 static inline bool seqlock_held(struct seqlock *sqlo)

From 30ef834ab3b50c09e103a82742b67ae4b0bac9f5 Mon Sep 17 00:00:00 2001
From: David Lamparter
Date: Tue, 25 Jun 2019 20:33:02 +0200
Subject: [PATCH 06/13] lib/seqlock: add a few more comments

Signed-off-by: David Lamparter
---
 lib/seqlock.c | 30 +++++++++++++++++++++++++-----
 lib/seqlock.h | 28 ++++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/lib/seqlock.c
b/lib/seqlock.c index 6ec9bbcf5f..c05ec19db4 100644 --- a/lib/seqlock.c +++ b/lib/seqlock.c @@ -36,8 +36,14 @@ #include "seqlock.h" +/**************************************** + * OS specific synchronization wrappers * + ****************************************/ + +/* + * Linux: sys_futex() + */ #ifdef HAVE_SYNC_LINUX_FUTEX -/* Linux-specific - sys_futex() */ #include #include @@ -55,8 +61,10 @@ static long sys_futex(void *addr1, int op, int val1, #define wait_poke(sqlo) \ sys_futex((int *)&sqlo->pos, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) +/* + * OpenBSD: sys_futex(), almost the same as on Linux + */ #elif defined(HAVE_SYNC_OPENBSD_FUTEX) -/* OpenBSD variant of the above. */ #include #include @@ -69,8 +77,10 @@ static long sys_futex(void *addr1, int op, int val1, #define wait_poke(sqlo) \ futex((int *)&sqlo->pos, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) +/* + * FreeBSD: _umtx_op() + */ #elif defined(HAVE_SYNC_UMTX_OP) -/* FreeBSD-specific: umtx_op() */ #include #define wait_once(sqlo, val) \ @@ -89,9 +99,10 @@ static int wait_time(struct seqlock *sqlo, uint32_t val, #define wait_poke(sqlo) \ _umtx_op((void *)&sqlo->pos, UMTX_OP_WAKE, INT_MAX, NULL, NULL) -#else -/* generic version. used on *BSD, Solaris and OSX. +/* + * generic version. used on NetBSD, Solaris and OSX. really shitty. */ +#else #define TIME_ABS_REALTIME 1 @@ -151,6 +162,9 @@ void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val) bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val, const struct timespec *abs_monotime_limit) { +/* + * ABS_REALTIME - used on NetBSD, Solaris and OSX + */ #if TIME_ABS_REALTIME #define time_arg1 &abs_rt #define time_arg2 NULL @@ -170,6 +184,9 @@ bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val, } abs_rt.tv_sec += abs_monotime_limit->tv_sec - curmono.tv_sec; +/* + * RELATIVE - used on OpenBSD (might get a patch to get absolute monotime) + */ #elif TIME_RELATIVE struct timespec reltime; @@ -183,6 +200,9 @@ bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val, reltime.tv_sec--; \ reltime.tv_nsec += 1000000000; \ } +/* + * FreeBSD & Linux: absolute time re. CLOCK_MONOTONIC + */ #else #define time_arg1 abs_monotime_limit #define time_arg2 NULL diff --git a/lib/seqlock.h b/lib/seqlock.h index 3cce9ccf49..b551e3ffc4 100644 --- a/lib/seqlock.h +++ b/lib/seqlock.h @@ -61,12 +61,22 @@ typedef _Atomic uint32_t seqlock_ctr_t; typedef uint32_t seqlock_val_t; #define seqlock_assert_valid(val) assert((val) & SEQLOCK_HELD) +/* NB: SEQLOCK_WAITERS is only allowed if SEQLOCK_HELD is also set; can't + * have waiters on an unheld seqlock + */ #define SEQLOCK_HELD (1U << 0) #define SEQLOCK_WAITERS (1U << 1) #define SEQLOCK_VAL(n) ((n) & ~SEQLOCK_WAITERS) #define SEQLOCK_STARTVAL 1U #define SEQLOCK_INCR 4U +/* TODO: originally, this was using "atomic_fetch_add", which is the reason + * bit 0 is used to indicate held state. With SEQLOCK_WAITERS added, there's + * no fetch_add anymore (cmpxchg loop instead), so we don't need to use bit 0 + * for this anymore & can just special-case the value 0 for it and skip it in + * counting. 
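+ *
+ * e.g. with the current encoding, positions count 1, 5, 9, 13, ...; a raw
+ * value of 7 is 5 | SEQLOCK_WAITERS, i.e. SEQLOCK_VAL(7) == 5 with both
+ * HELD and WAITERS set. The scheme sketched above would instead count
+ * 4, 8, 12, ... and special-case the value 0 as "unheld".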
+ */ + struct seqlock { /* always used */ seqlock_ctr_t pos; @@ -80,10 +90,16 @@ struct seqlock { extern void seqlock_init(struct seqlock *sqlo); -/* while (sqlo <= val) - wait until seqlock->pos > val, or seqlock unheld */ +/* basically: "while (sqlo <= val) wait();" + * returns when sqlo > val || !seqlock_held(sqlo) + */ extern void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val); + +/* same, but time-limited (limit is an absolute CLOCK_MONOTONIC value) */ extern bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val, const struct timespec *abs_monotime_limit); + +/* one-shot test, returns true if seqlock_wait would return immediately */ extern bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val); static inline bool seqlock_held(struct seqlock *sqlo) @@ -93,12 +109,20 @@ static inline bool seqlock_held(struct seqlock *sqlo) /* sqlo - get seqlock position -- for the "counter" seqlock */ extern seqlock_val_t seqlock_cur(struct seqlock *sqlo); -/* sqlo++ - note: like x++, returns previous value, before bumping */ + +/* ++sqlo (but atomic & wakes waiters) - returns value that we bumped to. + * + * guarantees: + * - each seqlock_bump call bumps the position by exactly one SEQLOCK_INCR. + * There are no skipped/missed or multiple increments. + * - each return value is only returned from one seqlock_bump() call + */ extern seqlock_val_t seqlock_bump(struct seqlock *sqlo); /* sqlo = val - can be used on held seqlock. */ extern void seqlock_acquire_val(struct seqlock *sqlo, seqlock_val_t val); + /* sqlo = ref - standard pattern: acquire relative to other seqlock */ static inline void seqlock_acquire(struct seqlock *sqlo, struct seqlock *ref) { From 3e41733f1bbe9ccd7d08441f5962b3dc6db2c644 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Mon, 3 Apr 2017 00:51:20 +0200 Subject: [PATCH 07/13] lib: RCU Please refer to doc/developer/rcu.rst for documentation. Signed-off-by: David Lamparter --- doc/developer/library.rst | 1 + doc/developer/rcu.rst | 269 +++++++++++++++++++ doc/developer/subdir.am | 1 + lib/frr_pthread.c | 15 +- lib/frr_pthread.h | 3 + lib/frrcu.c | 527 ++++++++++++++++++++++++++++++++++++++ lib/frrcu.h | 172 +++++++++++++ lib/libfrr.c | 2 + lib/subdir.am | 2 + lib/thread.c | 6 + 10 files changed, 996 insertions(+), 2 deletions(-) create mode 100644 doc/developer/rcu.rst create mode 100644 lib/frrcu.c create mode 100644 lib/frrcu.h diff --git a/doc/developer/library.rst b/doc/developer/library.rst index 4ba0c0ebc6..7cd493ccc4 100644 --- a/doc/developer/library.rst +++ b/doc/developer/library.rst @@ -8,6 +8,7 @@ Library Facilities (libfrr) :maxdepth: 2 memtypes + rcu lists logging hooks diff --git a/doc/developer/rcu.rst b/doc/developer/rcu.rst new file mode 100644 index 0000000000..c2ddf93f53 --- /dev/null +++ b/doc/developer/rcu.rst @@ -0,0 +1,269 @@ +.. highlight:: c + +RCU +=== + +Introduction +------------ + +RCU (Read-Copy-Update) is, fundamentally, a paradigm of multithreaded +operation (and not a set of APIs.) The core ideas are: + +* longer, complicated updates to structures are made only on private, + "invisible" copies. Other threads, when they access the structure, see an + older (but consistent) copy. + +* once done, the updated copy is swapped in in a single operation so that + other threads see either the old or the new data but no inconsistent state + between. + +* the old instance is only released after making sure that it is impossible + any other thread might still be reading it. 
+ +For more information, please search for general or Linux kernel RCU +documentation; there is no way this doc can be comprehensive in explaining the +interactions: + +* https://en.wikipedia.org/wiki/Read-copy-update +* https://www.kernel.org/doc/html/latest/kernel-hacking/locking.html#avoiding-locks-read-copy-update +* https://lwn.net/Articles/262464/ +* http://www.rdrop.com/users/paulmck/RCU/rclock_OLS.2001.05.01c.pdf +* http://lse.sourceforge.net/locking/rcupdate.html + +RCU, the TL;DR +^^^^^^^^^^^^^^ + +#. data structures are always consistent for reading. That's the "R" part. +#. reading never blocks / takes a lock. +#. rcu_read_lock is not a lock in the traditional sense. Think of it as a + "reservation"; it notes what the *oldest* possible thing the thread might + be seeing is, and which thus can't be deleted yet. +#. you create some object, finish it up, and then publish it. +#. publishing is an ``atomic_*`` call with ``memory_order_release``, which + tells the compiler to make sure prior memory writes have completed before + doing the atomic op. +#. ``ATOMLIST_*`` ``add`` operations do the ``memory_order_release`` for you. +#. you can't touch the object after it is published, except with atomic ops. +#. because you can't touch it, if you want to change it you make a new copy, + work on that, and then publish the new copy. That's the "CU" part. +#. deleting the object is also an atomic op. +#. other threads that started working before you published / deleted an object + might not see the new object / still see the deleted object. +#. because other threads may still see deleted objects, the ``free()`` needs + to be delayed. That's what :c:func:`rcu_free()` is for. + + +When (not) to use RCU +^^^^^^^^^^^^^^^^^^^^^ + +RCU is designed for read-heavy workloads where objects are updated relatively +rarely, but frequently accessed. Do *not* indiscriminately replace locking by +RCU patterns. + +The "copy" part of RCU implies that, while updating, several copies of a given +object exist in parallel. Even after the updated copy is swapped in, the old +object remains queued for freeing until all other threads are guaranteed to +not be accessing it anymore, due to passing a sequence point. In addition to +the increased memory usage, there may be some bursted (due to batching) malloc +contention when the RCU cleanup thread does its thing and frees memory. + +Other useful patterns +^^^^^^^^^^^^^^^^^^^^^ + +In addition to the full "copy object, apply changes, atomically update" +approach, there are 2 "reduced" usage cases that can be done: + +* atomically updating single pieces of a particular object, e.g. some flags + or configuration piece + +* straight up read-only / immutable objects + +Both of these cases can be considered RCU "subsets". For example, when +maintaining an atomic list of items, but these items only have a single +integer value that needs to be updated, that value can be atomically updated +without copying the entire object. However, the object still needs to be +free'd through :c:func:`rcu_free()` since reading/updating and deleting might +be happening concurrently. The same applies for immutable objects; deletion +might still race with reading so they need to be free'd through RCU. + +FRR API +------- + +Before diving into detail on the provided functions, it is important to note +that the FRR RCU API covers the **cleanup part of RCU, not the read-copy-update +paradigm itself**. 
These parts are handled by standard C11 atomic operations,
+and by extension through the atomic data structures (ATOMLIST, ATOMSORT & co.)
+
+The ``rcu_*`` functions only make sense in conjunction with these RCU access
+patterns. If you're calling the RCU API but not using these, something is
+wrong. The other way around is not necessarily true; it is possible to use
+atomic ops & datastructures with other types of locking, e.g. rwlocks.
+
+.. c:function:: void rcu_read_lock()
+.. c:function:: void rcu_read_unlock()
+
+   These functions acquire / release the RCU read-side lock. All access to
+   RCU-guarded data must be inside a block guarded by these. Any number of
+   threads may hold the RCU read-side lock at a given point in time, including
+   both no threads at all and all threads.
+
+   The functions implement a depth counter, i.e. can be nested. The nested
+   calls are cheap, since they only increment/decrement the counter.
+   Therefore, any place that uses RCU data and doesn't have a guarantee that
+   the caller holds RCU (e.g. ``lib/`` code) should just have its own
+   rcu_read_lock/rcu_read_unlock pair.
+
+   At the "root" level (i.e. un-nested), these calls can incur the cost of one
+   syscall (to ``futex()``). That puts them at about the same cost as a
+   mutex lock/unlock.
+
+   The ``thread_master`` code currently always holds RCU everywhere, except
+   while doing the actual ``poll()`` syscall. This is both an optimization as
+   well as an "easement" into getting RCU going. The current implementation
+   contract is that any ``struct thread *`` callback is called with an RCU
+   holding depth of 1, and that this is owned by the thread so it may (should)
+   drop and reacquire it when doing some longer-running work.
+
+   .. warning::
+
+      The RCU read-side lock must be held **continuously** for the entire time
+      any piece of RCU data is used. This includes any access to RCU data
+      after the initial ``atomic_load``. If the RCU read-side lock is
+      released, any RCU-protected pointers as well as the data they refer to
+      become invalid, as another thread may have called :c:func:`rcu_free` on
+      them.
+
+.. c:type:: struct rcu_head
+.. c:type:: struct rcu_head_close
+.. c:type:: struct rcu_action
+
+   The ``rcu_head`` structures are small (16-byte) bits that contain the
+   queueing machinery for the RCU sweeper/cleanup mechanisms.
+
+   Any piece of data that is cleaned up by RCU needs to have a matching
+   ``rcu_head`` embedded in it. If there is more than one cleanup operation
+   to be done (e.g. closing a file descriptor), more than one ``rcu_head`` may
+   be embedded.
+
+   .. warning::
+
+      It is not possible to reuse a ``rcu_head``. It is owned by the RCU code
+      as soon as ``rcu_*`` is called on it.
+
+   The ``_close`` variant carries an extra ``int fd`` field to store the fd to
+   be closed.
+
+   To minimize the amount of memory used for ``rcu_head``, details about the
+   RCU operation to be performed are moved into the ``rcu_action`` structure.
+   It contains e.g. the MTYPE for :c:func:`rcu_free` calls. The pointer to be
+   freed is stored as an offset relative to the ``rcu_head``, which means it
+   must be embedded as a struct field so the offset is constant.
+
+   The ``rcu_action`` structure is an implementation detail. Using
+   ``rcu_free`` or ``rcu_close`` will set it up correctly without further
+   code needed.
+
+   The ``rcu_head`` may be put in a union with other data if the other data
+   is only used during "life" of the data, since the ``rcu_head`` is used only
+   for the "death" of data.
But note that other threads may still be reading + a piece of data while a thread is working to free it. + +.. c:function:: void rcu_free(struct memtype *mtype, struct X *ptr, field) + + Free a block of memory after RCU has ensured no other thread can be + accessing it anymore. The pointer remains valid for any other thread that + has called :c:func:`rcu_read_lock` before the ``rcu_free`` call. + + .. warning:: + + In some other RCU implementations, the pointer remains valid to the + *calling* thread if it is holding the RCU read-side lock. This is not + the case in FRR, particularly when running single-threaded. Enforcing + this rule also allows static analysis to find use-after-free issues. + + ``mtype`` is the libfrr ``MTYPE_FOO`` allocation type to pass to + :c:func:`XFREE`. + + ``field`` must be the name of a ``struct rcu_head`` member field in ``ptr``. + The offset of this field (which must be constant) is used to reduce the + memory size of ``struct rcu_head``. + + .. note:: + + ``rcu_free`` (and ``rcu_close``) calls are more efficient if they are + put close to each other. When freeing several RCU'd resources, try to + move the calls next to each other (even if the data structures do not + directly point to each other.) + + Having the calls bundled reduces the cost of adding the ``rcu_head`` to + the RCU queue; the RCU queue is an atomic data structure whose usage + will require the CPU to acquire an exclusive hold on relevant cache + lines. + +.. c:function:: void rcu_close(struct rcu_head_close *head, int fd) + + Close a file descriptor after ensuring no other thread might be using it + anymore. Same as :c:func:`rcu_free`, except it calls ``close`` instead of + ``free``. + +Internals +^^^^^^^^^ + +.. c:type:: struct rcu_thread + + Per-thread state maintained by the RCU code, set up by the following + functions. A pointer to a thread's own ``rcu_thread`` is saved in + thread-local storage. + +.. c:function:: struct rcu_thread *rcu_thread_prepare(void) +.. c:function:: void rcu_thread_unprepare(struct rcu_thread *rcu_thread) +.. c:function:: void rcu_thread_start(struct rcu_thread *rcu_thread) + + Since the RCU code needs to have a list of all active threads, these + functions are used by the ``frr_pthread`` code to set up threads. Teardown + is automatic. It should not be necessary to call these functions. + + Any thread that accesses RCU-protected data needs to be registered with + these functions. Threads that do not access RCU-protected data may call + these functions but do not need to. + + Note that passing a pointer to RCU-protected data to some library which + accesses that pointer makes the library "access RCU-protected data". In + that case, either all of the library's threads must be registered for RCU, + or the code must instead pass a (non-RCU) copy of the data to the library. + +.. c:function:: void rcu_shutdown(void) + + Stop the RCU sweeper thread and make sure all cleanup has finished. + + This function is called on daemon exit by the libfrr code to ensure pending + RCU operations are completed. This is mostly to get a clean exit without + memory leaks from queued RCU operations. It should not be necessary to + call this function as libfrr handles this. + +FRR specifics and implementation details +---------------------------------------- + +The FRR RCU infrastructure has the following characteristics: + +* it is Epoch-based with a 32-bit wrapping counter. 
(This is somewhat + different from other Epoch-based approaches which may be designed to only + use 3 counter values, but works out to a simple implementation.) + +* instead of tracking CPUs as the Linux kernel does, threads are tracked. This + has exactly zero semantic impact, RCU just cares about "threads of + execution", which the kernel can optimize to CPUs but we can't. But it + really boils down to the same thing. + +* there are no ``rcu_dereference`` and ``rcu_assign_pointer`` - use + ``atomic_load`` and ``atomic_store`` instead. (These didn't exist when the + Linux RCU code was created.) + +* there is no ``synchronize_rcu``; this is a design choice but may be revisited + at a later point. ``synchronize_rcu`` blocks a thread until it is guaranteed + that no other threads might still be accessing data structures that they may + have access to at the beginning of the function call. This is a blocking + design and probably not appropriate for FRR. Instead, ``rcu_call`` can be + used to have the RCU sweeper thread make a callback after the same constraint + is fulfilled in an asynchronous way. Most needs should be covered by + ``rcu_free`` and ``rcu_close``. diff --git a/doc/developer/subdir.am b/doc/developer/subdir.am index 996f12d47f..1fc593e566 100644 --- a/doc/developer/subdir.am +++ b/doc/developer/subdir.am @@ -42,6 +42,7 @@ dev_RSTFILES = \ doc/developer/packaging-debian.rst \ doc/developer/packaging-redhat.rst \ doc/developer/packaging.rst \ + doc/developer/rcu.rst \ doc/developer/testing.rst \ doc/developer/topotests-snippets.rst \ doc/developer/topotests.rst \ diff --git a/lib/frr_pthread.c b/lib/frr_pthread.c index e588571c01..bdb6c2a397 100644 --- a/lib/frr_pthread.c +++ b/lib/frr_pthread.c @@ -133,18 +133,29 @@ int frr_pthread_set_name(struct frr_pthread *fpt) return ret; } +static void *frr_pthread_inner(void *arg) +{ + struct frr_pthread *fpt = arg; + + rcu_thread_start(fpt->rcu_thread); + return fpt->attr.start(fpt); +} + int frr_pthread_run(struct frr_pthread *fpt, const pthread_attr_t *attr) { int ret; - ret = pthread_create(&fpt->thread, attr, fpt->attr.start, fpt); + fpt->rcu_thread = rcu_thread_prepare(); + ret = pthread_create(&fpt->thread, attr, frr_pthread_inner, fpt); /* * Per pthread_create(3), the contents of fpt->thread are undefined if * pthread_create() did not succeed. Reset this value to zero. */ - if (ret < 0) + if (ret < 0) { + rcu_thread_unprepare(fpt->rcu_thread); memset(&fpt->thread, 0x00, sizeof(fpt->thread)); + } return ret; } diff --git a/lib/frr_pthread.h b/lib/frr_pthread.h index 3afe7ba966..6096a50370 100644 --- a/lib/frr_pthread.h +++ b/lib/frr_pthread.h @@ -23,6 +23,7 @@ #include #include "frratomic.h" #include "memory.h" +#include "frrcu.h" #include "thread.h" #ifdef __cplusplus @@ -50,6 +51,8 @@ struct frr_pthread { /* pthread id */ pthread_t thread; + struct rcu_thread *rcu_thread; + /* thread master for this pthread's thread.c event loop */ struct thread_master *master; diff --git a/lib/frrcu.c b/lib/frrcu.c new file mode 100644 index 0000000000..7e6475b648 --- /dev/null +++ b/lib/frrcu.c @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2017-19 David Lamparter, for NetDEF, Inc. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* implementation notes: this is an epoch-based RCU implementation. rcu_seq + * (global variable) counts the current epoch. Threads hold a specific epoch + * in rcu_read_lock(). This is the oldest epoch a thread might be accessing + * data from. + * + * The rcu_seq global is only pushed forward on rcu_read_lock() and + * rcu_read_unlock() calls. This makes things a tad more efficient since + * those are the only places it matters: + * - on rcu_read_lock, we don't want to hold an old epoch pointlessly + * - on rcu_read_unlock, we want to make sure we're not stuck on an old epoch + * when heading into a long idle period where no thread holds RCU + * + * rcu_thread structures themselves are RCU-free'd. + * + * rcu_head structures are the most iffy; normally for an ATOMLIST we would + * need to make sure we use rcu_free or pthread_rwlock to deallocate old items + * to prevent ABA or use-after-free problems. However, our ATOMLIST code + * guarantees that if the list remains non-empty in all cases, we only need + * the "last" pointer to do an "add_tail()", i.e. we can't run into ABA/UAF + * issues - but we do need to keep at least 1 item on the list. + * + * (Search the atomlist code for all uses of "last") + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#ifdef HAVE_PTHREAD_NP_H +#include +#endif +#include +#include +#include + +#include "frrcu.h" +#include "seqlock.h" +#include "atomlist.h" + +DEFINE_MTYPE_STATIC(LIB, RCU_THREAD, "RCU thread") +DEFINE_MTYPE_STATIC(LIB, RCU_NEXT, "RCU sequence barrier") + +DECLARE_ATOMLIST(rcu_heads, struct rcu_head, head) + +PREDECL_ATOMLIST(rcu_threads) +struct rcu_thread { + struct rcu_threads_item head; + + struct rcu_head rcu_head; + + struct seqlock rcu; + + /* only accessed by thread itself, not atomic */ + unsigned depth; +}; +DECLARE_ATOMLIST(rcu_threads, struct rcu_thread, head) + +static const struct rcu_action rcua_next = { .type = RCUA_NEXT }; +static const struct rcu_action rcua_end = { .type = RCUA_END }; +static const struct rcu_action rcua_close = { .type = RCUA_CLOSE }; + +struct rcu_next { + struct rcu_head head_free; + struct rcu_head head_next; +}; + +#define rcu_free_internal(mtype, ptr, field) \ + do { \ + typeof(ptr) _ptr = (ptr); \ + struct rcu_head *_rcu_head = &_ptr->field; \ + static const struct rcu_action _rcu_action = { \ + .type = RCUA_FREE, \ + .u.free = { \ + .mt = mtype, \ + .offset = offsetof(typeof(*_ptr), field), \ + }, \ + }; \ + _rcu_head->action = &_rcu_action; \ + rcu_heads_add_tail(&rcu_heads, _rcu_head); \ + } while (0) + +/* primary global RCU position */ +static struct seqlock rcu_seq; +/* this is set to rcu_seq whenever something is added on the RCU queue. + * rcu_read_lock() and rcu_read_unlock() will then bump rcu_seq up one step. + */ +static _Atomic seqlock_val_t rcu_dirty; + +static struct rcu_threads_head rcu_threads; +static struct rcu_heads_head rcu_heads; + +/* main thread & RCU sweeper have pre-setup rcu_thread structures. 
The + * reasons are different: + * + * - rcu_thread_main is there because the main thread isn't started like + * other threads, it's implicitly created when the program is started. So + * rcu_thread_main matches up implicitly. + * + * - rcu_thread_rcu isn't actually put on the rcu_threads list (makes no + * sense really), it only exists so we can call RCU-using functions from + * the RCU thread without special handling in rcu_read_lock/unlock. + */ +static struct rcu_thread rcu_thread_main; +static struct rcu_thread rcu_thread_rcu; + +static pthread_t rcu_pthread; +static pthread_key_t rcu_thread_key; +static bool rcu_active; + +static void rcu_start(void); +static void rcu_bump(void); + +/* + * preinitialization for main thread + */ +static void rcu_thread_end(void *rcu_thread); + +static void rcu_preinit(void) __attribute__((constructor)); +static void rcu_preinit(void) +{ + struct rcu_thread *rt; + + rt = &rcu_thread_main; + rt->depth = 1; + seqlock_init(&rt->rcu); + seqlock_acquire_val(&rt->rcu, SEQLOCK_STARTVAL); + + pthread_key_create(&rcu_thread_key, rcu_thread_end); + pthread_setspecific(rcu_thread_key, rt); + + rcu_threads_add_tail(&rcu_threads, rt); + + /* RCU sweeper's rcu_thread is a dummy, NOT added to rcu_threads */ + rt = &rcu_thread_rcu; + rt->depth = 1; + + seqlock_init(&rcu_seq); + seqlock_acquire_val(&rcu_seq, SEQLOCK_STARTVAL); +} + +static struct rcu_thread *rcu_self(void) +{ + return (struct rcu_thread *)pthread_getspecific(rcu_thread_key); +} + +/* + * thread management (for the non-main thread) + */ +struct rcu_thread *rcu_thread_prepare(void) +{ + struct rcu_thread *rt, *cur; + + rcu_assert_read_locked(); + + if (!rcu_active) + rcu_start(); + + cur = rcu_self(); + assert(cur->depth); + + /* new thread always starts with rcu_read_lock held at depth 1, and + * holding the same epoch as the parent (this makes it possible to + * use RCU for things passed into the thread through its arg) + */ + rt = XCALLOC(MTYPE_RCU_THREAD, sizeof(*rt)); + rt->depth = 1; + + seqlock_init(&rt->rcu); + seqlock_acquire(&rt->rcu, &cur->rcu); + + rcu_threads_add_tail(&rcu_threads, rt); + + return rt; +} + +void rcu_thread_start(struct rcu_thread *rt) +{ + pthread_setspecific(rcu_thread_key, rt); +} + +void rcu_thread_unprepare(struct rcu_thread *rt) +{ + if (rt == &rcu_thread_rcu) + return; + + rt->depth = 1; + seqlock_acquire(&rt->rcu, &rcu_seq); + + rcu_bump(); + if (rt != &rcu_thread_main) + /* this free() happens after seqlock_release() below */ + rcu_free_internal(MTYPE_RCU_THREAD, rt, rcu_head); + + rcu_threads_del(&rcu_threads, rt); + seqlock_release(&rt->rcu); +} + +static void rcu_thread_end(void *rtvoid) +{ + struct rcu_thread *rt = rtvoid; + rcu_thread_unprepare(rt); +} + +/* + * main RCU control aspects + */ + +static void rcu_bump(void) +{ + struct rcu_next *rn; + + rn = XMALLOC(MTYPE_RCU_NEXT, sizeof(*rn)); + + /* note: each RCUA_NEXT item corresponds to exactly one seqno bump. + * This means we don't need to communicate which seqno is which + * RCUA_NEXT, since we really don't care. + */ + + /* + * Important race condition: while rcu_heads_add_tail is executing, + * there is an intermediate point where the rcu_heads "last" pointer + * already points to rn->head_next, but rn->head_next isn't added to + * the list yet. That means any other "add_tail" calls append to this + * item, which isn't fully on the list yet. Freeze this thread at + * that point and look at another thread doing a rcu_bump. It adds + * these two items and then does a seqlock_bump. 
But the rcu_heads + * list is still "interrupted" and there's no RCUA_NEXT on the list + * yet (from either the frozen thread or the second thread). So + * rcu_main() might actually hit the end of the list at the + * "interrupt". + * + * This situation is prevented by requiring that rcu_read_lock is held + * for any calls to rcu_bump, since if we're holding the current RCU + * epoch, that means rcu_main can't be chewing on rcu_heads and hit + * that interruption point. Only by the time the thread has continued + * to rcu_read_unlock() - and therefore completed the add_tail - the + * RCU sweeper gobbles up the epoch and can be sure to find at least + * the RCUA_NEXT and RCUA_FREE items on rcu_heads. + */ + rn->head_next.action = &rcua_next; + rcu_heads_add_tail(&rcu_heads, &rn->head_next); + + /* free rn that we allocated above. + * + * This is INTENTIONALLY not built into the RCUA_NEXT action. This + * ensures that after the action above is popped off the queue, there + * is still at least 1 item on the RCU queue. This means we never + * delete the last item, which is extremely important since it keeps + * the atomlist ->last pointer alive and well. + * + * If we were to "run dry" on the RCU queue, add_tail may run into the + * "last item is being deleted - start over" case, and then we may end + * up accessing old RCU queue items that are already free'd. + */ + rcu_free_internal(MTYPE_RCU_NEXT, rn, head_free); + + /* Only allow the RCU sweeper to run after these 2 items are queued. + * + * If another thread enqueues some RCU action in the intermediate + * window here, nothing bad happens - the queued action is associated + * with a larger seq# than strictly necessary. Thus, it might get + * executed a bit later, but that's not a problem. + * + * If another thread acquires the read lock in this window, it holds + * the previous epoch, but its RCU queue actions will be in the next + * epoch. This isn't a problem either, just a tad inefficient. + */ + seqlock_bump(&rcu_seq); +} + +static void rcu_bump_maybe(void) +{ + seqlock_val_t dirty; + + dirty = atomic_load_explicit(&rcu_dirty, memory_order_relaxed); + /* no problem if we race here and multiple threads bump rcu_seq; + * bumping too much causes no issues while not bumping enough will + * result in delayed cleanup + */ + if (dirty == seqlock_cur(&rcu_seq)) + rcu_bump(); +} + +void rcu_read_lock(void) +{ + struct rcu_thread *rt = rcu_self(); + + assert(rt); + if (rt->depth++ > 0) + return; + + seqlock_acquire(&rt->rcu, &rcu_seq); + /* need to hold RCU for bump ... */ + rcu_bump_maybe(); + /* ... 
but no point in holding the old epoch if we just bumped */ + seqlock_acquire(&rt->rcu, &rcu_seq); +} + +void rcu_read_unlock(void) +{ + struct rcu_thread *rt = rcu_self(); + + assert(rt && rt->depth); + if (--rt->depth > 0) + return; + rcu_bump_maybe(); + seqlock_release(&rt->rcu); +} + +void rcu_assert_read_locked(void) +{ + struct rcu_thread *rt = rcu_self(); + assert(rt && rt->depth && seqlock_held(&rt->rcu)); +} + +void rcu_assert_read_unlocked(void) +{ + struct rcu_thread *rt = rcu_self(); + assert(rt && !rt->depth && !seqlock_held(&rt->rcu)); +} + +/* + * RCU resource-release thread + */ + +static void *rcu_main(void *arg); + +static void rcu_start(void) +{ + /* ensure we never handle signals on the RCU thread by blocking + * everything here (new thread inherits signal mask) + */ + sigset_t oldsigs, blocksigs; + + sigfillset(&blocksigs); + pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs); + + rcu_active = true; + + assert(!pthread_create(&rcu_pthread, NULL, rcu_main, NULL)); + + pthread_sigmask(SIG_SETMASK, &oldsigs, NULL); + +#ifdef HAVE_PTHREAD_SETNAME_NP +# ifdef GNU_LINUX + pthread_setname_np(rcu_pthread, "RCU sweeper"); +# elif defined(__NetBSD__) + pthread_setname_np(rcu_pthread, "RCU sweeper", NULL); +# endif +#elif defined(HAVE_PTHREAD_SET_NAME_NP) + pthread_set_name_np(rcu_pthread, "RCU sweeper"); +#endif +} + +static void rcu_do(struct rcu_head *rh) +{ + struct rcu_head_close *rhc; + void *p; + + switch (rh->action->type) { + case RCUA_FREE: + p = (char *)rh - rh->action->u.free.offset; + if (rh->action->u.free.mt) + qfree(rh->action->u.free.mt, p); + else + free(p); + break; + case RCUA_CLOSE: + rhc = container_of(rh, struct rcu_head_close, + rcu_head); + close(rhc->fd); + break; + case RCUA_CALL: + p = (char *)rh - rh->action->u.call.offset; + rh->action->u.call.fptr(p); + break; + + case RCUA_INVALID: + case RCUA_NEXT: + case RCUA_END: + default: + assert(0); + } +} + +static void rcu_watchdog(struct rcu_thread *rt) +{ +#if 0 + /* future work: print a backtrace for the thread that's holding up + * RCU. The only (good) way of doing that is to send a signal to the + * other thread, save away the backtrace in the signal handler, and + * block here until the signal is done processing. + * + * Just haven't implemented that yet. 
+ */ + fprintf(stderr, "RCU watchdog %p\n", rt); +#endif +} + +static void *rcu_main(void *arg) +{ + struct rcu_thread *rt; + struct rcu_head *rh = NULL; + bool end = false; + struct timespec maxwait; + + seqlock_val_t rcuval = SEQLOCK_STARTVAL; + + pthread_setspecific(rcu_thread_key, &rcu_thread_rcu); + + while (!end) { + seqlock_wait(&rcu_seq, rcuval); + + /* RCU watchdog timeout, TODO: configurable value */ + clock_gettime(CLOCK_MONOTONIC, &maxwait); + maxwait.tv_nsec += 100 * 1000 * 1000; + if (maxwait.tv_nsec >= 1000000000) { + maxwait.tv_sec++; + maxwait.tv_nsec -= 1000000000; + } + + frr_each (rcu_threads, &rcu_threads, rt) + if (!seqlock_timedwait(&rt->rcu, rcuval, &maxwait)) { + rcu_watchdog(rt); + seqlock_wait(&rt->rcu, rcuval); + } + + while ((rh = rcu_heads_pop(&rcu_heads))) { + if (rh->action->type == RCUA_NEXT) + break; + else if (rh->action->type == RCUA_END) + end = true; + else + rcu_do(rh); + } + + rcuval += SEQLOCK_INCR; + } + + /* rcu_shutdown can only be called singlethreaded, and it does a + * pthread_join, so it should be impossible that anything ended up + * on the queue after RCUA_END + */ +#if 1 + assert(!rcu_heads_first(&rcu_heads)); +#else + while ((rh = rcu_heads_pop(&rcu_heads))) + if (rh->action->type >= RCUA_FREE) + rcu_do(rh); +#endif + return NULL; +} + +void rcu_shutdown(void) +{ + static struct rcu_head rcu_head_end; + struct rcu_thread *rt = rcu_self(); + void *retval; + + if (!rcu_active) + return; + + rcu_assert_read_locked(); + assert(rcu_threads_count(&rcu_threads) == 1); + + rcu_enqueue(&rcu_head_end, &rcua_end); + + rt->depth = 0; + seqlock_release(&rt->rcu); + seqlock_release(&rcu_seq); + rcu_active = false; + + /* clearing rcu_active is before pthread_join in case we hang in + * pthread_join & get a SIGTERM or something - in that case, just + * ignore the maybe-still-running RCU thread + */ + if (pthread_join(rcu_pthread, &retval) == 0) { + seqlock_acquire_val(&rcu_seq, SEQLOCK_STARTVAL); + seqlock_acquire_val(&rt->rcu, SEQLOCK_STARTVAL); + rt->depth = 1; + } +} + +/* + * RCU'd free functions + */ + +void rcu_enqueue(struct rcu_head *rh, const struct rcu_action *action) +{ + /* refer to rcu_bump() for why we need to hold RCU when adding items + * to rcu_heads + */ + rcu_assert_read_locked(); + + rh->action = action; + + if (!rcu_active) { + rcu_do(rh); + return; + } + rcu_heads_add_tail(&rcu_heads, rh); + atomic_store_explicit(&rcu_dirty, seqlock_cur(&rcu_seq), + memory_order_relaxed); +} + +void rcu_close(struct rcu_head_close *rhc, int fd) +{ + rhc->fd = fd; + rcu_enqueue(&rhc->rcu_head, &rcua_close); +} diff --git a/lib/frrcu.h b/lib/frrcu.h new file mode 100644 index 0000000000..8f789303cc --- /dev/null +++ b/lib/frrcu.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017-19 David Lamparter, for NetDEF, Inc. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef _FRRCU_H +#define _FRRCU_H + +#include "memory.h" +#include "atomlist.h" +#include "seqlock.h" + +/* quick RCU primer: + * There's a global sequence counter. Whenever a thread does a + * rcu_read_lock(), it is marked as holding the current sequence counter. + * When something is cleaned with RCU, the global sequence counter is + * increased and the item is queued for cleanup - *after* all threads are + * at a more recent sequence counter (or no sequence counter / unheld). + * + * So, by delaying resource cleanup, RCU ensures that things don't go away + * while another thread may hold a (stale) reference. + * + * Note that even if a thread is in rcu_read_lock(), it is invalid for that + * thread to access bits after rcu_free() & co on them. This is a design + * choice to allow no-op'ing out the entire RCU mechanism if we're running + * singlethreaded. (Also allows some optimization on the counter bumping.) + * + * differences from Linux Kernel RCU: + * - there's no rcu_synchronize(), if you really need to defer something + * use rcu_call() (and double check it's really necessary) + * - rcu_dereference() and rcu_assign_pointer() don't exist, use atomic_* + * instead (ATOM* list structures do the right thing) + */ + +/* opaque */ +struct rcu_thread; + +/* called before new thread creation, sets up rcu thread info for new thread + * before it actually exits. This ensures possible RCU references are held + * for thread startup. + * + * return value must be passed into the new thread's call to rcu_thread_start() + */ +extern struct rcu_thread *rcu_thread_prepare(void); + +/* cleanup in case pthread_create() fails */ +extern void rcu_thread_unprepare(struct rcu_thread *rcu_thread); + +/* called early in the new thread, with the return value from the above. + * NB: new thread is initially in RCU-held state! (at depth 1) + * + * TBD: maybe inherit RCU state from rcu_thread_prepare()? + */ +extern void rcu_thread_start(struct rcu_thread *rcu_thread); + +/* thread exit is handled through pthread_key_create's destructor function */ + +/* global RCU shutdown - must be called with only 1 active thread left. waits + * until remaining RCU actions are done & RCU thread has exited. + * + * This is mostly here to get a clean exit without memleaks. + */ +extern void rcu_shutdown(void); + +/* enter / exit RCU-held state. counter-based, so can be called nested. */ +extern void rcu_read_lock(void); +extern void rcu_read_unlock(void); + +/* for debugging / safety checks */ +extern void rcu_assert_read_locked(void); +extern void rcu_assert_read_unlocked(void); + +enum rcu_action_type { + RCUA_INVALID = 0, + /* used internally by the RCU code, shouldn't ever show up outside */ + RCUA_NEXT, + RCUA_END, + /* normal RCU actions, for outside use */ + RCUA_FREE, + RCUA_CLOSE, + RCUA_CALL, +}; + +/* since rcu_head is intended to be embedded into structs which may exist + * with lots of copies, rcu_head is shrunk down to its absolute minimum - + * the atomlist pointer + a pointer to this action struct. 
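+ * (on LP64 platforms that is two pointers, i.e. the 16 bytes quoted in
+ * rcu.rst.)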
+ */
+struct rcu_action {
+	enum rcu_action_type type;
+
+	union {
+		struct {
+			struct memtype *mt;
+			ptrdiff_t offset;
+		} free;
+
+		struct {
+			void (*fptr)(void *arg);
+			ptrdiff_t offset;
+		} call;
+	} u;
+};
+
+/* RCU cleanup function queue item */
+PREDECL_ATOMLIST(rcu_heads)
+struct rcu_head {
+	struct rcu_heads_item head;
+	const struct rcu_action *action;
+};
+
+/* special RCU head for delayed fd-close */
+struct rcu_head_close {
+	struct rcu_head rcu_head;
+	int fd;
+};
+
+/* enqueue RCU action - use the macros below to get the rcu_action set up */
+extern void rcu_enqueue(struct rcu_head *head, const struct rcu_action *action);
+
+/* RCU free() and file close() operations.
+ *
+ * freed memory / closed fds become _immediately_ unavailable to the calling
+ * thread, but will remain available for other threads until they have passed
+ * into RCU-released state.
+ */
+
+/* may be called with NULL mt to do non-MTYPE free() */
+#define rcu_free(mtype, ptr, field)                                           \
+	do {                                                                  \
+		typeof(ptr) _ptr = (ptr);                                     \
+		struct rcu_head *_rcu_head = &_ptr->field;                    \
+		static const struct rcu_action _rcu_action = {                \
+			.type = RCUA_FREE,                                    \
+			.u.free = {                                           \
+				.mt = mtype,                                  \
+				.offset = offsetof(typeof(*_ptr), field),     \
+			},                                                    \
+		};                                                            \
+		rcu_enqueue(_rcu_head, &_rcu_action);                         \
+	} while (0)
+
+/* use this sparingly, it runs on (and blocks) the RCU thread */
+#define rcu_call(func, ptr, field)                                            \
+	do {                                                                  \
+		typeof(ptr) _ptr = (ptr);                                     \
+		void (*_fptype)(typeof(ptr));                                 \
+		struct rcu_head *_rcu_head = &_ptr->field;                    \
+		static const struct rcu_action _rcu_action = {                \
+			.type = RCUA_CALL,                                    \
+			.u.call = {                                           \
+				.fptr = (void *)func,                         \
+				.offset = offsetof(typeof(*_ptr), field),     \
+			},                                                    \
+		};                                                            \
+		(void)(_fptype = func);                                       \
+		rcu_enqueue(_rcu_head, &_rcu_action);                         \
+	} while (0)
+
+extern void rcu_close(struct rcu_head_close *head, int fd);
+
+#endif /* _FRRCU_H */
diff --git a/lib/libfrr.c b/lib/libfrr.c
index 0fc321d6e0..35c6092140 100644
--- a/lib/libfrr.c
+++ b/lib/libfrr.c
@@ -41,6 +41,7 @@
 #include "northbound_cli.h"
 #include "northbound_db.h"
 #include "debug.h"
+#include "frrcu.h"
 
 DEFINE_HOOK(frr_late_init, (struct thread_master * tm), (tm))
 DEFINE_KOOH(frr_early_fini, (), ())
@@ -1081,6 +1082,7 @@ void frr_fini(void)
 	master = NULL;
 	closezlog();
 	/* frrmod_init -> nothing needed / hooks */
+	rcu_shutdown();
 
 	if (!debug_memstats_at_exit)
 		return;
diff --git a/lib/subdir.am b/lib/subdir.am
index aa89622028..becc80c3f3 100644
--- a/lib/subdir.am
+++ b/lib/subdir.am
@@ -21,6 +21,7 @@ lib_libfrr_la_SOURCES = \
 	lib/distribute.c \
 	lib/ferr.c \
 	lib/filter.c \
+	lib/frrcu.c \
 	lib/frrlua.c \
 	lib/frr_pthread.c \
 	lib/frrstr.c \
@@ -163,6 +164,7 @@ pkginclude_HEADERS += \
 	lib/frrlua.h \
 	lib/frr_pthread.h \
 	lib/frratomic.h \
+	lib/frrcu.h \
 	lib/frrstr.h \
 	lib/getopt.h \
 	lib/graph.h \
diff --git a/lib/thread.c b/lib/thread.c
index fc2de09df0..0436f31c3d 100644
--- a/lib/thread.c
+++ b/lib/thread.c
@@ -25,6 +25,7 @@
 
 #include "thread.h"
 #include "memory.h"
+#include "frrcu.h"
 #include "log.h"
 #include "hash.h"
 #include "pqueue.h"
@@ -753,6 +754,9 @@ static int fd_poll(struct thread_master *m, struct pollfd *pfds, nfds_t pfdsize,
 		    < 0) // effect a poll (return immediately)
 			timeout = 0;
 
+	rcu_read_unlock();
+	rcu_assert_read_unlocked();
+
 	/* add poll pipe poker */
 	assert(count + 1 < pfdsize);
 	pfds[count].fd = m->io_pipe[0];
@@ -766,6 +770,8 @@ static int fd_poll(struct thread_master *m, struct pollfd *pfds, nfds_t pfdsize,
 	while (read(m->io_pipe[0], &trash, sizeof(trash)) > 0)
 		;
 
+	rcu_read_lock();
+
 	return num;
 }
 

From 
ec15e1b5884d6b23a878aedcc6bcbda927022c96 Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Tue, 6 Aug 2019 21:18:42 +0000 Subject: [PATCH 08/13] bgpd: tx addpath info for labeled unicast Labeled unicast needs path IDs too! Signed-off-by: Quentin Young --- bgpd/bgp_attr.c | 3 ++- lib/stream.c | 14 ++++++++++++-- lib/stream.h | 3 ++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bgpd/bgp_attr.c b/bgpd/bgp_attr.c index c64d153f1b..e21c84355e 100644 --- a/bgpd/bgp_attr.c +++ b/bgpd/bgp_attr.c @@ -2956,7 +2956,8 @@ void bgp_packet_mpattr_prefix(struct stream *s, afi_t afi, safi_t safi, addpath_encode, addpath_tx_id); } else if (safi == SAFI_LABELED_UNICAST) { /* Prefix write with label. */ - stream_put_labeled_prefix(s, p, label); + stream_put_labeled_prefix(s, p, label, addpath_encode, + addpath_tx_id); } else if (safi == SAFI_FLOWSPEC) { if (PSIZE (p->prefixlen)+2 < FLOWSPEC_NLRI_SIZELIMIT) stream_putc(s, PSIZE (p->prefixlen)+2); diff --git a/lib/stream.c b/lib/stream.c index 6c187bd359..c67bc3c993 100644 --- a/lib/stream.c +++ b/lib/stream.c @@ -904,20 +904,30 @@ int stream_put_prefix(struct stream *s, struct prefix *p) /* Put NLRI with label */ int stream_put_labeled_prefix(struct stream *s, struct prefix *p, - mpls_label_t *label) + mpls_label_t *label, int addpath_encode, + uint32_t addpath_tx_id) { size_t psize; + size_t psize_with_addpath; uint8_t *label_pnt = (uint8_t *)label; STREAM_VERIFY_SANE(s); psize = PSIZE(p->prefixlen); + psize_with_addpath = psize + (addpath_encode ? 4 : 0); - if (STREAM_WRITEABLE(s) < (psize + 3)) { + if (STREAM_WRITEABLE(s) < (psize_with_addpath + 3)) { STREAM_BOUND_WARN(s, "put"); return 0; } + if (addpath_encode) { + s->data[s->endp++] = (uint8_t)(addpath_tx_id >> 24); + s->data[s->endp++] = (uint8_t)(addpath_tx_id >> 16); + s->data[s->endp++] = (uint8_t)(addpath_tx_id >> 8); + s->data[s->endp++] = (uint8_t)addpath_tx_id; + } + stream_putc(s, (p->prefixlen + 24)); stream_putc(s, label_pnt[0]); stream_putc(s, label_pnt[1]); diff --git a/lib/stream.h b/lib/stream.h index 5341bfa40b..a903ec0ea5 100644 --- a/lib/stream.h +++ b/lib/stream.h @@ -199,7 +199,8 @@ extern int stream_put_prefix_addpath(struct stream *, struct prefix *, uint32_t addpath_tx_id); extern int stream_put_prefix(struct stream *, struct prefix *); extern int stream_put_labeled_prefix(struct stream *, struct prefix *, - mpls_label_t *); + mpls_label_t *, int addpath_encode, + uint32_t addpath_tx_id); extern void stream_get(void *, struct stream *, size_t); extern bool stream_get2(void *data, struct stream *s, size_t size); extern void stream_get_from(void *, struct stream *, size_t, size_t); From 838cef6d7ec7298090ad48bc87a33de734b71671 Mon Sep 17 00:00:00 2001 From: Chirag Shah Date: Thu, 8 Aug 2019 10:43:49 -0700 Subject: [PATCH 09/13] zebra: fix advertise svi ip as macip route PR #3745 added EVPN feature to advertise individual SVI-IPs as MAC-IP routes. Fix a condition in zebra to send MAC and IP pair to bgpd when the feature is enabled. 
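
In outline, the corrected branch in zvni_gw_macip_add() behaves as in
this condensed sketch (not the literal code; see the hunk further
down):

    if (advertise_gw_macip_enabled(zvni)) {
            /* gateway MAC-IP: set ZEBRA_MAC_DEF_GW/ZEBRA_NEIGH_DEF_GW,
             * plus the Router flag (R-bit) for IPv6, then notify bgpd */
            zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr,
                                          n->flags, n->loc_seq);
    } else if (advertise_svi_macip_enabled(zvni)) {
            /* SVI MAC-IP: advertise the pair without the gateway flags */
            zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr,
                                          n->flags, n->loc_seq);
    }
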
Testing Done:

Originator VTEP:
TORC11:~# ip -br addr show VxU-1002
VxU-1002    UP    45.0.2.2/24 2001:fee1:0:2::2/64

show bgp l2vpn evpn vni 1004
VNI: 1004 (known to the kernel)
 Type: L2
 Tenant-Vrf: default
 RD: 27.0.0.11:3
 Advertise-svi-macip : Yes
 Import Route Target:
  10:1004
 Export Route Target:
  10:1004

Remote vtep evpn route output for 45.0.4.2:

BGP routing table entry for 27.0.0.11:3:[2]:[0]:[48]:[00:02:00:00:00:2f]:[32]:[45.0.4.2]
Paths: (2 available, best #1)
  Advertised to non peer-group peers:
  MSP1(uplink-1) MSP2(uplink-2)
  Route [2]:[0]:[48]:[00:02:00:00:00:2f]:[32]:[45.0.4.2] VNI 1004
  64435 65546
    36.0.0.11 from MSP1(uplink-1) (27.0.0.9)
      Origin IGP, valid, external, bestpath-from-AS 64435, best (First path received)
      Extended Community: RT:10:1004 ET:8
      Last update: Thu Aug  8 18:09:13 2019

Signed-off-by: Chirag Shah
---
 zebra/zebra_vxlan.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/zebra/zebra_vxlan.c b/zebra/zebra_vxlan.c
index 2e8c81bddd..1450072aa9 100644
--- a/zebra/zebra_vxlan.c
+++ b/zebra/zebra_vxlan.c
@@ -2787,27 +2787,40 @@ static int zvni_gw_macip_add(struct interface *ifp, zebra_vni_t *zvni,
 
 	/* Set "local" forwarding info. */
 	SET_FLAG(n->flags, ZEBRA_NEIGH_LOCAL);
-	SET_FLAG(n->flags, ZEBRA_NEIGH_DEF_GW);
 	ZEBRA_NEIGH_SET_ACTIVE(n);
-	/* Set Router flag (R-bit) */
-	if (ip->ipa_type == IPADDR_V6)
-		SET_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG);
 	memcpy(&n->emac, macaddr, ETH_ALEN);
 	n->ifindex = ifp->ifindex;
 
 	/* Only advertise in BGP if the knob is enabled */
-	if (!advertise_gw_macip_enabled(zvni))
-		return 0;
+	if (advertise_gw_macip_enabled(zvni)) {
 
-	if (IS_ZEBRA_DEBUG_VXLAN)
-		zlog_debug(
+		SET_FLAG(mac->flags, ZEBRA_MAC_DEF_GW);
+		SET_FLAG(n->flags, ZEBRA_NEIGH_DEF_GW);
+		/* Set Router flag (R-bit) */
+		if (ip->ipa_type == IPADDR_V6)
+			SET_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG);
+
+		if (IS_ZEBRA_DEBUG_VXLAN)
+			zlog_debug(
 			"SVI %s(%u) L2-VNI %u, sending GW MAC %s IP %s add to BGP with flags 0x%x",
 			ifp->name, ifp->ifindex, zvni->vni,
 			prefix_mac2str(macaddr, buf, sizeof(buf)),
 			ipaddr2str(ip, buf2, sizeof(buf2)), n->flags);
 
-	zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr,
-				      n->flags, n->loc_seq);
+		zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr,
+				n->flags, n->loc_seq);
+	} else if (advertise_svi_macip_enabled(zvni)) {
+
+		if (IS_ZEBRA_DEBUG_VXLAN)
+			zlog_debug(
+			"SVI %s(%u) L2-VNI %u, sending SVI MAC %s IP %s add to BGP with flags 0x%x",
+			ifp->name, ifp->ifindex, zvni->vni,
+			prefix_mac2str(macaddr, buf, sizeof(buf)),
+			ipaddr2str(ip, buf2, sizeof(buf2)), n->flags);
+
+		zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr,
+				n->flags, n->loc_seq);
+	}
 
 	return 0;
 }
@@ -9015,7 +9028,7 @@ void zebra_vxlan_advertise_svi_macip(ZAPI_HANDLER_ARGS)
 	if (IS_ZEBRA_DEBUG_VXLAN)
 		zlog_debug("EVPN SVI-MACIP Adv %s, currently %s",
 			   advertise ? "enabled" : "disabled",
-			   advertise_gw_macip_enabled(NULL)
+			   advertise_svi_macip_enabled(NULL)
 				   ? "enabled"
 				   : "disabled");

From cc83907475a21327e89da60532f0490619a9f07e Mon Sep 17 00:00:00 2001
From: Ameya Dharkar
Date: Fri, 9 Aug 2019 15:21:27 -0700
Subject: [PATCH 10/13] zebra: Incorrect L3VNI in FPM rtmsg for EVPN RT-5
 prefix

The L3VNI used to be fetched from the vrf_id carried in the rtm_table
field of the netlink rtmsg. However, the rtm_table field is now
programmed with the table_id instead of the vrf_id, so an L3VNI fetched
via rtm_table is incorrect. Instead, use nexthop->vrf_id to fetch the
L3VNI.
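
Condensed from the hunk below, the change to the lookup key is:

    /* before: rtm_table now carries the kernel table_id, the wrong
     * key for an L3VNI lookup */
    zl3vni = zl3vni_from_vrf(ri->rtm_table);

    /* after: key on the VRF the nexthop belongs to */
    zl3vni = zl3vni_from_vrf(nexthop->vrf_id);
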
Signed-off-by: Ameya Dharkar
---
 zebra/zebra_fpm_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zebra/zebra_fpm_netlink.c b/zebra/zebra_fpm_netlink.c
index 822def318a..f347d3955c 100644
--- a/zebra/zebra_fpm_netlink.c
+++ b/zebra/zebra_fpm_netlink.c
@@ -238,7 +238,7 @@ static int netlink_route_info_add_nh(netlink_route_info_t *ri,
 
 	if (re && CHECK_FLAG(re->flags, ZEBRA_FLAG_EVPN_ROUTE)) {
 		nhi.encap_info.encap_type = FPM_NH_ENCAP_VXLAN;
-		zl3vni = zl3vni_from_vrf(ri->rtm_table);
+		zl3vni = zl3vni_from_vrf(nexthop->vrf_id);
 		if (zl3vni && is_l3vni_oper_up(zl3vni)) {
 
 			/* Add VNI to VxLAN encap info */

From 6aee38481a4531f8997831a24474f0a75eceb02b Mon Sep 17 00:00:00 2001
From: Naveen Thanikachalam
Date: Sun, 11 Aug 2019 00:29:32 -0700
Subject: [PATCH 11/13] bgpd: Standard large-communities CLI does not return
 an error when it's configured with a regex.

The CLI that configures a standard-format large-communities attribute
accepts regular expressions as well. For example, the configuration
below is accepted:
"bgp large-community-list standard TEST permit 1:1 100:*"

The code that parses the large-communities does identify the
configuration as invalid; however, the returned error isn't processed.
The code has to be modified to handle the error.

Signed-off-by: NaveenThanikachalam <nthanikachal@vmware.com>
---
 bgpd/bgp_lcommunity.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bgpd/bgp_lcommunity.c b/bgpd/bgp_lcommunity.c
index 2b09a2954e..aeb290719a 100644
--- a/bgpd/bgp_lcommunity.c
+++ b/bgpd/bgp_lcommunity.c
@@ -439,7 +439,8 @@ struct lcommunity *lcommunity_str2com(const char *str)
 	enum lcommunity_token token = lcommunity_token_unknown;
 	struct lcommunity_val lval;
 
-	while ((str = lcommunity_gettoken(str, &lval, &token))) {
+	do {
+		str = lcommunity_gettoken(str, &lval, &token);
 		switch (token) {
 		case lcommunity_token_val:
 			if (lcom == NULL)
@@ -452,7 +453,8 @@ struct lcommunity *lcommunity_str2com(const char *str)
 			lcommunity_free(&lcom);
 			return NULL;
 		}
-	}
+	} while (str);
+
 	return lcom;
 }

From dfb6fd1dd119a5bd660012e940e8328534547e76 Mon Sep 17 00:00:00 2001
From: Naveen Thanikachalam
Date: Sun, 11 Aug 2019 03:56:12 -0700
Subject: [PATCH 12/13] bgpd: Assertion failed during shutdown.

A race condition causes the failure.

The function "make_info()" sets a path info's peer to the bgp
instance's "peer_self", which is created when BGP is first configured
and deleted only when BGP is brought down completely. A race condition
can cause the bgp instance's "peer_self" to be removed before the
routes are pulled off the aggregate address.

If the bgp instance's "peer_self" is NULL, or if BGP is being deleted,
the aggregate route must not be reinstalled.

Signed-off-by: NaveenThanikachalam <nthanikachal@vmware.com>
---
 bgpd/bgp_route.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index abad1db5a2..a372568373 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -5332,6 +5332,13 @@ static void bgp_purge_af_static_redist_routes(struct bgp *bgp, afi_t afi,
 	struct bgp_node *rn;
 	struct bgp_path_info *pi;
 
+	/* Do not install the aggregate route if BGP is in the
+	 * process of termination.
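+	 *
+	 * bgp->peer_self is created when BGP is first configured and
+	 * freed only on full teardown; during shutdown it can already
+	 * be NULL while routes are still being pulled off an aggregate
+	 * address, hence the explicit NULL check below as well.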
+	 */
+	if (bgp_flag_check(bgp, BGP_FLAG_DELETE_IN_PROGRESS) ||
+	    (bgp->peer_self == NULL))
+		return;
+
 	table = bgp->rib[afi][safi];
 	for (rn = bgp_table_top(table); rn; rn = bgp_route_next(rn)) {
 		for (pi = bgp_node_get_bgp_path_info(rn); pi; pi = pi->next) {

From 47c8fa1f875ed1c8e2907c9ffe9c25ab69436ca1 Mon Sep 17 00:00:00 2001
From: Naveen Thanikachalam
Date: Sun, 11 Aug 2019 04:24:15 -0700
Subject: [PATCH 13/13] bgpd: Optimizing route-map's processing of
 dependencies.

Say, for example, 256 prefix-list entries are pasted into VTYSH. This
results in BGP processing the events for several minutes.

BGP starts a timer for 5 seconds when the first dependency
configuration is received. On timer expiry, BGP processes the
dependent route-maps. After this processing, BGP reads the
configurations received in the next 5 seconds and then re-processes
the route-maps from the beginning. This cyclic re-processing consumes
time and CPU cycles.

Instead of starting a timer only when the first configuration is
received, the existing timer is now reset every time a configuration
is received. This means all the configurations are read before the
timer expires, which eliminates the cyclic re-processing.

Signed-off-by: NaveenThanikachalam <nthanikachal@vmware.com>
---
 bgpd/bgp_routemap.c | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/bgpd/bgp_routemap.c b/bgpd/bgp_routemap.c
index 5ffc416dc5..7f1a9b71c1 100644
--- a/bgpd/bgp_routemap.c
+++ b/bgpd/bgp_routemap.c
@@ -3406,31 +3406,34 @@ int bgp_route_map_update_timer(struct thread *thread)
 
 static void bgp_route_map_mark_update(const char *rmap_name)
 {
-	if (bm->t_rmap_update == NULL) {
-		struct listnode *node, *nnode;
-		struct bgp *bgp;
+	struct listnode *node, *nnode;
+	struct bgp *bgp;
 
-		/* rmap_update_timer of 0 means don't do route updates */
-		if (bm->rmap_update_timer) {
-			bm->t_rmap_update = NULL;
-			thread_add_timer(bm->master, bgp_route_map_update_timer,
-					 NULL, bm->rmap_update_timer,
-					 &bm->t_rmap_update);
+	/* If a new update is received before the current timer times out,
+	 * turn it off and start a new timer.
+	 */
+	if (bm->t_rmap_update != NULL)
+		THREAD_OFF(bm->t_rmap_update);
 
-			/* Signal the groups that a route-map update event has
-			 * started */
-			for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp))
-				update_group_policy_update(bgp,
-							   BGP_POLICY_ROUTE_MAP,
-							   rmap_name, 1, 1);
-		} else {
-			for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp))
-				bgp_route_map_process_update(bgp, rmap_name, 0);
+	/* rmap_update_timer of 0 means don't do route updates */
+	if (bm->rmap_update_timer) {
+		thread_add_timer(bm->master, bgp_route_map_update_timer,
+				 NULL, bm->rmap_update_timer,
+				 &bm->t_rmap_update);
+
+		/* Signal the groups that a route-map update event has
+		 * started */
+		for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp))
+			update_group_policy_update(bgp,
+						   BGP_POLICY_ROUTE_MAP,
+						   rmap_name, 1, 1);
+	} else {
+		for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp))
+			bgp_route_map_process_update(bgp, rmap_name, 0);
 #if ENABLE_BGP_VNC
-			zlog_debug("%s: calling vnc_routemap_update", __func__);
-			vnc_routemap_update(bgp, __func__);
+		zlog_debug("%s: calling vnc_routemap_update", __func__);
+		vnc_routemap_update(bgp, __func__);
 #endif
-		}
 	}
 }
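
For illustration, with the 5-second rmap_update_timer described above
and a paste that takes roughly 12 seconds to arrive:

    old: timer armed once at t=0; route-maps processed at t=5, then
         re-read and re-processed from the beginning at t=10, t=15, ...
    new: every received line restarts the timer; route-maps are
         processed exactly once, about 5 seconds after the last line.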