From d8c16a951998b354ef3be124cc97eafef20c5ff0 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Thu, 30 Aug 2018 13:48:05 -0400
Subject: [PATCH 01/10] zebra: introduce dedicated dataplane pthread

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/main.c         |  6 +++---
 zebra/zebra_dplane.c | 29 +++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/zebra/main.c b/zebra/main.c
index 5628b5e022..20a2185b1b 100644
--- a/zebra/main.c
+++ b/zebra/main.c
@@ -390,6 +390,9 @@ int main(int argc, char **argv)
 	vty_config_lockless();
 	zebrad.master = frr_init();
 
+	/* Initialize pthread library */
+	frr_pthread_init();
+
 	/* Zebra related initialize. */
 	zebra_router_init();
 	zserv_init();
@@ -445,9 +448,6 @@ int main(int argc, char **argv)
 	/* Needed for BSD routing socket. */
 	pid = getpid();
 
-	/* Intialize pthread library */
-	frr_pthread_init();
-
 	/* Start Zebra API server */
 	zserv_start(zserv_path);
 
diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 3e61418b64..98db3ca7e5 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -176,6 +176,9 @@ static struct zebra_dplane_globals {
 	_Atomic uint32_t dg_routes_queued_max;
 	_Atomic uint32_t dg_route_errors;
 
+	/* Dataplane pthread */
+	struct frr_pthread *dg_pthread;
+
 	/* Event-delivery context 'master' for the dplane */
 	struct thread_master *dg_master;
 
@@ -1004,13 +1007,23 @@ static void zebra_dplane_init_internal(struct zebra_t *zebra)
 	zdplane_info.dg_max_queued_updates = DPLANE_DEFAULT_MAX_QUEUED;
 
 	/* TODO -- register default kernel 'provider' during init */
+	zdplane_info.dg_run = true;
+
+	/* Start dataplane pthread */
 
 	zdplane_info.dg_run = true;
 
-	/* TODO -- start dataplane pthread. We're using the zebra
-	 * core/main thread temporarily
-	 */
-	zdplane_info.dg_master = zebra->master;
+	struct frr_pthread_attr pattr = {
+		.start = frr_pthread_attr_default.start,
+		.stop = frr_pthread_attr_default.stop
+	};
+
+	zdplane_info.dg_pthread = frr_pthread_new(&pattr, "Zebra dplane thread",
+						  "Zebra dplane");
+
+	zdplane_info.dg_master = zdplane_info.dg_pthread->master;
+
+	frr_pthread_run(zdplane_info.dg_pthread, NULL);
 }
 
 /* Indicates zebra shutdown/exit is in progress. Some operations may be
@@ -1122,8 +1135,12 @@ void zebra_dplane_shutdown(void)
 
 	THREAD_OFF(zdplane_info.dg_t_update);
 
-	/* TODO */
-	/* frr_pthread_stop(...) */
+	frr_pthread_stop(zdplane_info.dg_pthread, NULL);
+
+	/* Destroy pthread */
+	frr_pthread_destroy(zdplane_info.dg_pthread);
+	zdplane_info.dg_pthread = NULL;
+	zdplane_info.dg_master = NULL;
 
 	/* Notify provider(s) of final shutdown */
 

From 675440ad5a969daa6831b974a1e6a04079ac715a Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Wed, 14 Nov 2018 10:06:05 -0500
Subject: [PATCH 02/10] zebra: fix get_old_instance api

One of the dplane context accessors was returning the wrong value;
correct it.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/zebra_dplane.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 98db3ca7e5..6cd52f7bb8 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -444,7 +444,7 @@ uint16_t dplane_ctx_get_old_instance(const struct zebra_dplane_ctx *ctx)
 {
 	DPLANE_CTX_VALID(ctx);
 
-	return ctx->zd_instance;
+	return ctx->zd_old_instance;
 }
 
 uint32_t dplane_ctx_get_metric(const struct zebra_dplane_ctx *ctx)

From c831033fff8775579998463e372d7653e053e6d8 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Fri, 31 Aug 2018 13:46:50 -0400
Subject: [PATCH 03/10] zebra: dataplane provider enhancements

Limit the number of updates processed from the incoming queue;
add more stats. Fill out apis for dataplane providers; convert
route update processing to provider model; move dataplane
status enum

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/zebra_dplane.c | 732 ++++++++++++++++++++++++++++++++++++-------
 zebra/zebra_dplane.h | 105 +++++--
 zebra/zebra_rib.c    |   5 +-
 3 files changed, 700 insertions(+), 142 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 6cd52f7bb8..c859bdaf6e 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -42,6 +42,9 @@ DEFINE_MTYPE(ZEBRA, DP_PROV, "Zebra DPlane Provider")
 const uint32_t DPLANE_DEFAULT_MAX_QUEUED = 200;
 
 
+/* Default value for new work per cycle */
+const uint32_t DPLANE_DEFAULT_NEW_WORK = 100;
+
 /* Validation check macro for context blocks */
 /* #define DPLANE_DEBUG 1 */
 
@@ -69,6 +72,12 @@ struct zebra_dplane_ctx {
 	/* Status on return */
 	enum zebra_dplane_result zd_status;
 
+	/* Dplane provider id */
+	uint32_t zd_provider;
+
+	/* Flags - used by providers, e.g. */
+	int zd_flags;
+
 	/* TODO -- internal/sub-operation status? */
 	enum zebra_dplane_result zd_remote_status;
 	enum zebra_dplane_result zd_kernel_status;
@@ -118,6 +127,12 @@ struct zebra_dplane_ctx {
 	TAILQ_ENTRY(zebra_dplane_ctx) zd_q_entries;
 };
 
+/* Flag that can be set by a pre-kernel provider as a signal that an update
+ * should bypass the kernel.
+ */
+#define DPLANE_CTX_FLAG_NO_KERNEL 0x01
+
+
 /*
  * Registration block for one dataplane provider.
  */
@@ -131,16 +146,35 @@ struct zebra_dplane_provider {
 	/* Id value */
 	uint32_t dp_id;
 
+	/* Mutex */
+	pthread_mutex_t dp_mutex;
+
+	/* Plugin-provided extra data */
+	void *dp_data;
+
+	/* Flags */
+	int dp_flags;
+
 	dplane_provider_process_fp dp_fp;
 
 	dplane_provider_fini_fp dp_fini;
 
 	_Atomic uint32_t dp_in_counter;
+	_Atomic uint32_t dp_in_max;
+	_Atomic uint32_t dp_out_counter;
+	_Atomic uint32_t dp_out_max;
 	_Atomic uint32_t dp_error_counter;
 
-	/* Embedded list linkage */
-	TAILQ_ENTRY(zebra_dplane_provider) dp_q_providers;
+	/* Queue of contexts inbound to the provider */
+	struct dplane_ctx_q dp_ctx_in_q;
 
+	/* Queue of completed contexts outbound from the provider back
+	 * towards the dataplane module.
+	 */
+	struct dplane_ctx_q dp_ctx_out_q;
+
+	/* Embedded list linkage for provider objects */
+	TAILQ_ENTRY(zebra_dplane_provider) dp_prov_link;
 };
 
 /*
@@ -171,10 +205,16 @@ static struct zebra_dplane_globals {
 	/* Limit number of pending, unprocessed updates */
 	_Atomic uint32_t dg_max_queued_updates;
 
+	/* Limit number of new updates dequeued at once, to pace an
+	 * incoming burst.
+	 */
+	uint32_t dg_updates_per_cycle;
+
 	_Atomic uint32_t dg_routes_in;
 	_Atomic uint32_t dg_routes_queued;
 	_Atomic uint32_t dg_routes_queued_max;
 	_Atomic uint32_t dg_route_errors;
+	_Atomic uint32_t dg_update_yields;
 
 	/* Dataplane pthread */
 	struct frr_pthread *dg_pthread;
@@ -191,14 +231,20 @@ static struct zebra_dplane_globals {
 } zdplane_info;
 
 /*
- * Lock and unlock for interactions with the zebra 'core'
+ * Lock and unlock for interactions with the zebra 'core' pthread
  */
 #define DPLANE_LOCK() pthread_mutex_lock(&zdplane_info.dg_mutex)
-
 #define DPLANE_UNLOCK() pthread_mutex_unlock(&zdplane_info.dg_mutex)
 
+
+/*
+ * Lock and unlock for individual providers
+ */
+#define DPLANE_PROV_LOCK(p)   pthread_mutex_lock(&((p)->dp_mutex))
+#define DPLANE_PROV_UNLOCK(p) pthread_mutex_unlock(&((p)->dp_mutex))
+
 /* Prototypes */
-static int dplane_route_process(struct thread *event);
+static int dplane_thread_loop(struct thread *event);
 
 /*
  * Public APIs
@@ -285,6 +331,38 @@ enum zebra_dplane_result dplane_ctx_get_status(
 	return ctx->zd_status;
 }
 
+void dplane_ctx_set_status(struct zebra_dplane_ctx *ctx,
+			   enum zebra_dplane_result status)
+{
+	DPLANE_CTX_VALID(ctx);
+
+	ctx->zd_status = status;
+}
+
+/* Retrieve last/current provider id */
+uint32_t dplane_ctx_get_provider(const struct zebra_dplane_ctx *ctx)
+{
+	DPLANE_CTX_VALID(ctx);
+	return ctx->zd_provider;
+}
+
+/* Providers run before the kernel can control whether a kernel
+ * update should be done.
+ */
+void dplane_ctx_set_skip_kernel(struct zebra_dplane_ctx *ctx)
+{
+	DPLANE_CTX_VALID(ctx);
+
+	SET_FLAG(ctx->zd_flags, DPLANE_CTX_FLAG_NO_KERNEL);
+}
+
+bool dplane_ctx_is_skip_kernel(const struct zebra_dplane_ctx *ctx)
+{
+	DPLANE_CTX_VALID(ctx);
+
+	return CHECK_FLAG(ctx->zd_flags, DPLANE_CTX_FLAG_NO_KERNEL);
+}
+
 enum dplane_op_e dplane_ctx_get_op(const struct zebra_dplane_ctx *ctx)
 {
 	DPLANE_CTX_VALID(ctx);
@@ -517,6 +595,7 @@ const struct zebra_dplane_info *dplane_ctx_get_ns(
  * End of dplane context accessors
  */
 
+
 /*
  * Retrieve the limit on the number of pending, unprocessed updates.
  */
@@ -678,34 +757,11 @@ static int dplane_route_enqueue(struct zebra_dplane_ctx *ctx)
 	}
 
 	/* Ensure that an event for the dataplane thread is active */
-	thread_add_event(zdplane_info.dg_master, dplane_route_process, NULL, 0,
-			 &zdplane_info.dg_t_update);
-
-	ret = AOK;
+	ret = dplane_provider_work_ready();
 
 	return ret;
 }
 
-/*
- * Attempt to dequeue a route-update block
- */
-static struct zebra_dplane_ctx *dplane_route_dequeue(void)
-{
-	struct zebra_dplane_ctx *ctx = NULL;
-
-	DPLANE_LOCK();
-	{
-		ctx = TAILQ_FIRST(&zdplane_info.dg_route_ctx_q);
-		if (ctx) {
-			TAILQ_REMOVE(&zdplane_info.dg_route_ctx_q,
-				     ctx, zd_q_entries);
-		}
-	}
-	DPLANE_UNLOCK();
-
-	return ctx;
-}
-
 /*
  * Utility that prepares a route update and enqueues it for processing
  */
@@ -828,68 +884,15 @@ done:
 	return ret;
 }
 
-/*
- * Event handler function for routing updates
- */
-static int dplane_route_process(struct thread *event)
-{
-	enum zebra_dplane_result res;
-	struct zebra_dplane_ctx *ctx;
-
-	while (1) {
-		/* Check for shutdown */
-		if (!zdplane_info.dg_run)
-			break;
-
-		/* TODO -- limit number of updates per cycle? */
-		ctx = dplane_route_dequeue();
-		if (ctx == NULL)
-			break;
-
-		/* Update counter */
-		atomic_fetch_sub_explicit(&zdplane_info.dg_routes_queued, 1,
-					  memory_order_relaxed);
-
-		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) {
-			char dest_str[PREFIX_STRLEN];
-
-			prefix2str(dplane_ctx_get_dest(ctx),
-				   dest_str, sizeof(dest_str));
-
-			zlog_debug("%u:%s Dplane route update ctx %p op %s",
-				   dplane_ctx_get_vrf(ctx), dest_str,
-				   ctx, dplane_op2str(dplane_ctx_get_op(ctx)));
-		}
-
-		/* TODO -- support series of providers */
-
-		/* Initially, just doing kernel-facing update here */
-		res = kernel_route_update(ctx);
-
-		if (res != ZEBRA_DPLANE_REQUEST_SUCCESS)
-			atomic_fetch_add_explicit(&zdplane_info.dg_route_errors,
-						  1, memory_order_relaxed);
-
-		ctx->zd_status = res;
-
-		/* Enqueue result to zebra main context */
-		zdplane_info.dg_results_cb(ctx);
-
-		ctx = NULL;
-	}
-
-	return 0;
-}
-
 /*
  * Handler for 'show dplane'
  */
 int dplane_show_helper(struct vty *vty, bool detailed)
 {
-	uint64_t queued, limit, queue_max, errs, incoming;
+	uint64_t queued, queue_max, limit, errs, incoming, yields;
 
 	/* Using atomics because counters are being changed in different
-	 * contexts.
+	 * pthread contexts.
 	 */
 	incoming = atomic_load_explicit(&zdplane_info.dg_routes_in,
 					memory_order_relaxed);
@@ -901,12 +904,16 @@ int dplane_show_helper(struct vty *vty, bool detailed)
 					 memory_order_relaxed);
 	errs = atomic_load_explicit(&zdplane_info.dg_route_errors,
 				    memory_order_relaxed);
+	yields = atomic_load_explicit(&zdplane_info.dg_update_yields,
+				      memory_order_relaxed);
 
-	vty_out(vty, "Route updates:            %"PRIu64"\n", incoming);
+	vty_out(vty, "Zebra dataplane:\nRoute updates:            %"PRIu64"\n",
+		incoming);
 	vty_out(vty, "Route update errors:      %"PRIu64"\n", errs);
 	vty_out(vty, "Route update queue limit: %"PRIu64"\n", limit);
 	vty_out(vty, "Route update queue depth: %"PRIu64"\n", queued);
 	vty_out(vty, "Route update queue max:   %"PRIu64"\n", queue_max);
+	vty_out(vty, "Route update yields:      %"PRIu64"\n", yields);
 
 	return CMD_SUCCESS;
 }
@@ -916,8 +923,35 @@ int dplane_show_helper(struct vty *vty, bool detailed)
  */
 int dplane_show_provs_helper(struct vty *vty, bool detailed)
 {
-	vty_out(vty, "Zebra dataplane providers:%s\n",
-		(detailed ? " (detailed)" : ""));
+	struct zebra_dplane_provider *prov;
+	uint64_t in, in_max, out, out_max;
+
+	vty_out(vty, "Zebra dataplane providers:\n");
+
+	DPLANE_LOCK();
+	prov = TAILQ_FIRST(&zdplane_info.dg_providers_q);
+	DPLANE_UNLOCK();
+
+	/* Show counters, useful info from each registered provider */
+	while (prov) {
+
+		in = atomic_load_explicit(&prov->dp_in_counter,
+					  memory_order_relaxed);
+		in_max = atomic_load_explicit(&prov->dp_in_max,
+					      memory_order_relaxed);
+		out = atomic_load_explicit(&prov->dp_out_counter,
+					   memory_order_relaxed);
+		out_max = atomic_load_explicit(&prov->dp_out_max,
+					       memory_order_relaxed);
+
+		vty_out(vty, "%s (%u): in: %"PRIu64", max: %"PRIu64", "
+			"out: %"PRIu64", max: %"PRIu64"\n",
+			prov->dp_name, prov->dp_id, in, in_max, out, out_max);
+
+		DPLANE_LOCK();
+		prov = TAILQ_NEXT(prov, dp_prov_link);
+		DPLANE_UNLOCK();
+	}
 
 	return CMD_SUCCESS;
 }
@@ -926,9 +960,11 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed)
  * Provider registration
  */
 int dplane_provider_register(const char *name,
-			     enum dplane_provider_prio_e prio,
+			     enum dplane_provider_prio prio,
+			     int flags,
 			     dplane_provider_process_fp fp,
-			     dplane_provider_fini_fp fini_fp)
+			     dplane_provider_fini_fp fini_fp,
+			     void *data)
 {
 	int ret = 0;
 	struct zebra_dplane_provider *p, *last;
@@ -952,37 +988,160 @@ int dplane_provider_register(const char *name,
 		goto done;
 	}
 
-	strncpy(p->dp_name, name, DPLANE_PROVIDER_NAMELEN);
-	p->dp_name[DPLANE_PROVIDER_NAMELEN] = '\0'; /* Belt-and-suspenders */
+	pthread_mutex_init(&(p->dp_mutex), NULL);
+	TAILQ_INIT(&(p->dp_ctx_in_q));
+	TAILQ_INIT(&(p->dp_ctx_out_q));
 
 	p->dp_priority = prio;
 	p->dp_fp = fp;
 	p->dp_fini = fini_fp;
+	p->dp_data = data;
 
-	/* Lock the lock - the dplane pthread may be running */
+	/* Lock - the dplane pthread may be running */
 	DPLANE_LOCK();
 
 	p->dp_id = ++zdplane_info.dg_provider_id;
 
+	if (name)
+		strlcpy(p->dp_name, name, DPLANE_PROVIDER_NAMELEN);
+	else
+		snprintf(p->dp_name, DPLANE_PROVIDER_NAMELEN,
+			 "provider-%u", p->dp_id);
+
 	/* Insert into list ordered by priority */
-	TAILQ_FOREACH(last, &zdplane_info.dg_providers_q, dp_q_providers) {
+	TAILQ_FOREACH(last, &zdplane_info.dg_providers_q, dp_prov_link) {
 		if (last->dp_priority > p->dp_priority)
 			break;
 	}
 
 	if (last)
-		TAILQ_INSERT_BEFORE(last, p, dp_q_providers);
+		TAILQ_INSERT_BEFORE(last, p, dp_prov_link);
 	else
 		TAILQ_INSERT_TAIL(&zdplane_info.dg_providers_q, p,
-				  dp_q_providers);
+				  dp_prov_link);
 
 	/* And unlock */
 	DPLANE_UNLOCK();
 
+	if (IS_ZEBRA_DEBUG_DPLANE)
+		zlog_debug("dplane: registered new provider '%s' (%u), prio %d",
+			   p->dp_name, p->dp_id, p->dp_priority);
+
 done:
 	return ret;
 }
 
+/* Accessors for provider attributes */
+const char *dplane_provider_get_name(const struct zebra_dplane_provider *prov)
+{
+	return prov->dp_name;
+}
+
+uint32_t dplane_provider_get_id(const struct zebra_dplane_provider *prov)
+{
+	return prov->dp_id;
+}
+
+void *dplane_provider_get_data(const struct zebra_dplane_provider *prov)
+{
+	return prov->dp_data;
+}
+
+int dplane_provider_get_work_limit(const struct zebra_dplane_provider *prov)
+{
+	return zdplane_info.dg_updates_per_cycle;
+}
+
+/*
+ * Dequeue and maintain associated counter
+ */
+struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
+	struct zebra_dplane_provider *prov)
+{
+	struct zebra_dplane_ctx *ctx = NULL;
+
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_LOCK(prov);
+
+	ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
+	if (ctx) {
+		TAILQ_REMOVE(&(prov->dp_ctx_in_q), ctx, zd_q_entries);
+	}
+
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_UNLOCK(prov);
+
+	return ctx;
+}
+
+/*
+ * Dequeue work to a list, return count
+ */
+int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
+				    struct dplane_ctx_q *listp)
+{
+	int limit, ret;
+	struct zebra_dplane_ctx *ctx;
+
+	limit = zdplane_info.dg_updates_per_cycle;
+
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_LOCK(prov);
+
+	for (ret = 0; ret < limit; ret++) {
+		ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
+		if (ctx) {
+			TAILQ_REMOVE(&(prov->dp_ctx_in_q), ctx, zd_q_entries);
+
+			TAILQ_INSERT_TAIL(listp, ctx, zd_q_entries);
+		} else {
+			break;
+		}
+	}
+
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_UNLOCK(prov);
+
+	return ret;
+}
+
+/*
+ * Enqueue and maintain associated counter
+ */
+void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov,
+				     struct zebra_dplane_ctx *ctx)
+{
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_LOCK(prov);
+
+	TAILQ_INSERT_TAIL(&(prov->dp_ctx_out_q), ctx,
+			  zd_q_entries);
+
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_UNLOCK(prov);
+
+	atomic_fetch_add_explicit(&(prov->dp_out_counter), 1,
+				  memory_order_relaxed);
+}
+
+bool dplane_provider_is_threaded(const struct zebra_dplane_provider *prov)
+{
+	return (prov->dp_flags & DPLANE_PROV_FLAG_THREADED);
+}
+
+/*
+ * Provider api to signal that work/events are available
+ * for the dataplane pthread.
+ */
+int dplane_provider_work_ready(void)
+{
+	thread_add_event(zdplane_info.dg_master,
+			 dplane_thread_loop, NULL, 0,
+			 &zdplane_info.dg_t_update);
+
+	return AOK;
+}
+
 /*
  * Zebra registers a results callback with the dataplane system
  */
@@ -993,37 +1152,154 @@ int dplane_results_register(dplane_results_fp fp)
 }
 
 /*
- * Initialize the dataplane module during startup, internal/private version
+ * Kernel dataplane provider
  */
-static void zebra_dplane_init_internal(struct zebra_t *zebra)
+
+/*
+ * Kernel provider callback
+ */
+static int kernel_dplane_process_func(struct zebra_dplane_provider *prov)
 {
-	memset(&zdplane_info, 0, sizeof(zdplane_info));
+	enum zebra_dplane_result res;
+	struct zebra_dplane_ctx *ctx;
+	int counter, limit;
 
-	pthread_mutex_init(&zdplane_info.dg_mutex, NULL);
+	limit = dplane_provider_get_work_limit(prov);
 
-	TAILQ_INIT(&zdplane_info.dg_route_ctx_q);
-	TAILQ_INIT(&zdplane_info.dg_providers_q);
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+		zlog_debug("dplane provider '%s': processing",
+			   dplane_provider_get_name(prov));
 
-	zdplane_info.dg_max_queued_updates = DPLANE_DEFAULT_MAX_QUEUED;
+	for (counter = 0; counter < limit; counter++) {
 
-	/* TODO -- register default kernel 'provider' during init */
-	zdplane_info.dg_run = true;
+		ctx = dplane_provider_dequeue_in_ctx(prov);
+		if (ctx == NULL)
+			break;
 
-	/* Start dataplane pthread */
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) {
+			char dest_str[PREFIX_STRLEN];
 
-	zdplane_info.dg_run = true;
+			prefix2str(dplane_ctx_get_dest(ctx),
+				   dest_str, sizeof(dest_str));
 
-	struct frr_pthread_attr pattr = {
-		.start = frr_pthread_attr_default.start,
-		.stop = frr_pthread_attr_default.stop
-	};
+			zlog_debug("%u:%s Dplane route update ctx %p op %s",
+				   dplane_ctx_get_vrf(ctx), dest_str,
+				   ctx, dplane_op2str(dplane_ctx_get_op(ctx)));
+		}
 
-	zdplane_info.dg_pthread = frr_pthread_new(&pattr, "Zebra dplane thread",
-						  "Zebra dplane");
+		/* Call into the synchronous kernel-facing code here */
+		res = kernel_route_update(ctx);
 
-	zdplane_info.dg_master = zdplane_info.dg_pthread->master;
+		if (res != ZEBRA_DPLANE_REQUEST_SUCCESS)
+			atomic_fetch_add_explicit(
+				&zdplane_info.dg_route_errors, 1,
+				memory_order_relaxed);
 
-	frr_pthread_run(zdplane_info.dg_pthread, NULL);
+		dplane_ctx_set_status(ctx, res);
+
+		dplane_provider_enqueue_out_ctx(prov, ctx);
+	}
+
+	/* Ensure that we'll run the work loop again if there's still
+	 * more work to do.
+	 */
+	if (counter >= limit) {
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("dplane provider '%s' reached max updates %d",
+				   dplane_provider_get_name(prov), counter);
+
+		atomic_fetch_add_explicit(&zdplane_info.dg_update_yields,
+					  1, memory_order_relaxed);
+
+		dplane_provider_work_ready();
+	}
+
+	return 0;
+}
+
+/*
+ * Test dataplane provider plugin
+ */
+
+/*
+ * Test provider process callback
+ */
+static int test_dplane_process_func(struct zebra_dplane_provider *prov)
+{
+	struct zebra_dplane_ctx *ctx;
+	int counter, limit;
+
+	/* Just moving from 'in' queue to 'out' queue */
+
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+		zlog_debug("dplane provider '%s': processing",
+			   dplane_provider_get_name(prov));
+
+	limit = dplane_provider_get_work_limit(prov);
+
+	for (counter = 0; counter < limit; counter++) {
+
+		ctx = dplane_provider_dequeue_in_ctx(prov);
+		if (ctx == NULL)
+			break;
+
+		dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
+
+		dplane_provider_enqueue_out_ctx(prov, ctx);
+	}
+
+	/* Ensure that we'll run the work loop again if there's still
+	 * more work to do.
+	 */
+	if (counter >= limit)
+		dplane_provider_work_ready();
+
+	return 0;
+}
+
+/*
+ * Test provider shutdown/fini callback
+ */
+static int test_dplane_shutdown_func(struct zebra_dplane_provider *prov,
+				     bool early)
+{
+	if (IS_ZEBRA_DEBUG_DPLANE)
+		zlog_debug("dplane provider '%s': %sshutdown",
+			   dplane_provider_get_name(prov),
+			   early ? "early " : "");
+
+	return 0;
+}
+
+/*
+ * Register default kernel provider
+ */
+static void dplane_provider_init(void)
+{
+	int ret;
+
+	ret = dplane_provider_register("Kernel",
+				       DPLANE_PRIO_KERNEL,
+				       DPLANE_PROV_FLAGS_DEFAULT,
+				       kernel_dplane_process_func,
+				       NULL,
+				       NULL);
+
+	if (ret != AOK)
+		zlog_err("Unable to register kernel dplane provider: %d",
+			 ret);
+
+	/* TODO -- make the test provider optional... */
+	ret = dplane_provider_register("Test",
+				       DPLANE_PRIO_PRE_KERNEL,
+				       DPLANE_PROV_FLAGS_DEFAULT,
+				       test_dplane_process_func,
+				       test_dplane_shutdown_func,
+				       NULL /* data */);
+
+	if (ret != AOK)
+		zlog_err("Unable to register test dplane provider: %d",
+			 ret);
 }
 
 /* Indicates zebra shutdown/exit is in progress. Some operations may be
@@ -1059,7 +1335,9 @@ static bool dplane_work_pending(void)
 {
 	struct zebra_dplane_ctx *ctx;
 
-	/* TODO -- just checking incoming/pending work for now */
+	/* TODO -- just checking incoming/pending work for now, must check
+	 * providers
+	 */
 	DPLANE_LOCK();
 	{
 		ctx = TAILQ_FIRST(&zdplane_info.dg_route_ctx_q);
@@ -1120,6 +1398,184 @@ void zebra_dplane_finish(void)
 			 &zdplane_info.dg_t_shutdown_check);
 }
 
+/*
+ * Main dataplane pthread event loop. The thread takes new incoming work
+ * and offers it to the first provider. It then iterates through the
+ * providers, taking complete work from each one and offering it
+ * to the next in order. At each step, a limited number of updates are
+ * processed during a cycle in order to provide some fairness.
+ */
+static int dplane_thread_loop(struct thread *event)
+{
+	struct dplane_ctx_q work_list;
+	struct dplane_ctx_q error_list;
+	struct zebra_dplane_provider *prov;
+	struct zebra_dplane_ctx *ctx, *tctx;
+	int limit, counter, error_counter;
+	uint64_t lval;
+
+	/* Capture work limit per cycle */
+	limit = zdplane_info.dg_updates_per_cycle;
+
+	TAILQ_INIT(&work_list);
+
+	/* Check for zebra shutdown */
+	if (!zdplane_info.dg_run)
+		goto done;
+
+	/* Dequeue some incoming work from zebra (if any) onto the temporary
+	 * working list.
+	 */
+	DPLANE_LOCK();
+
+	/* Locate initial registered provider */
+	prov = TAILQ_FIRST(&zdplane_info.dg_providers_q);
+
+	for (counter = 0; counter < limit; counter++) {
+		ctx = TAILQ_FIRST(&zdplane_info.dg_route_ctx_q);
+		if (ctx) {
+			TAILQ_REMOVE(&zdplane_info.dg_route_ctx_q, ctx,
+				     zd_q_entries);
+
+			atomic_fetch_sub_explicit(
+				&zdplane_info.dg_routes_queued, 1,
+				memory_order_relaxed);
+
+			ctx->zd_provider = prov->dp_id;
+
+			TAILQ_INSERT_TAIL(&work_list, ctx, zd_q_entries);
+		} else {
+			break;
+		}
+	}
+
+	DPLANE_UNLOCK();
+
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+		zlog_debug("dplane: incoming new work counter: %d", counter);
+
+	/* Iterate through the registered providers, offering new incoming
+	 * work. If the provider has outgoing work in its queue, take that
+	 * work for the next provider
+	 */
+	while (prov) {
+
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("dplane enqueues %d new work to provider '%s'",
+				   counter, dplane_provider_get_name(prov));
+
+		/* Capture current provider id in each context; check for
+		 * error status.
+		 */
+		TAILQ_FOREACH_SAFE(ctx, &work_list, zd_q_entries, tctx) {
+			if (dplane_ctx_get_status(ctx) ==
+			    ZEBRA_DPLANE_REQUEST_SUCCESS) {
+				ctx->zd_provider = prov->dp_id;
+			} else {
+				/*
+				 * TODO -- improve error-handling: recirc
+				 * errors backwards so that providers can
+				 * 'undo' their work (if they want to)
+				 */
+
+				/* Move to error list; will be returned
+				 * zebra main.
+				 */
+				TAILQ_REMOVE(&work_list, ctx, zd_q_entries);
+				TAILQ_INSERT_TAIL(&error_list,
+						  ctx, zd_q_entries);
+				error_counter++;
+			}
+		}
+
+		/* Enqueue new work to the provider */
+		if (dplane_provider_is_threaded(prov))
+			DPLANE_PROV_LOCK(prov);
+
+		if (TAILQ_FIRST(&work_list))
+			TAILQ_CONCAT(&(prov->dp_ctx_in_q), &work_list,
+				     zd_q_entries);
+
+		lval = atomic_add_fetch_explicit(&prov->dp_in_counter, counter,
+						 memory_order_relaxed);
+		if (lval > prov->dp_in_max)
+			atomic_store_explicit(&prov->dp_in_max, lval,
+					      memory_order_relaxed);
+
+		if (dplane_provider_is_threaded(prov))
+			DPLANE_PROV_UNLOCK(prov);
+
+		/* Reset the temp list */
+		TAILQ_INIT(&work_list);
+		counter = 0;
+
+		/* Call into the provider code */
+		(*prov->dp_fp)(prov);
+
+		/* Check for zebra shutdown */
+		if (!zdplane_info.dg_run)
+			break;
+
+		/* Dequeue completed work from the provider */
+		if (dplane_provider_is_threaded(prov))
+			DPLANE_PROV_LOCK(prov);
+
+		while (counter < limit) {
+			ctx = TAILQ_FIRST(&(prov->dp_ctx_out_q));
+			if (ctx) {
+				TAILQ_REMOVE(&(prov->dp_ctx_out_q), ctx,
+					     zd_q_entries);
+
+				TAILQ_INSERT_TAIL(&work_list,
+						  ctx, zd_q_entries);
+				counter++;
+			} else
+				break;
+		}
+
+		if (dplane_provider_is_threaded(prov))
+			DPLANE_PROV_UNLOCK(prov);
+
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("dplane dequeues %d completed work from provider %s",
+				   counter, dplane_provider_get_name(prov));
+
+		/* Locate next provider */
+		DPLANE_LOCK();
+		prov = TAILQ_NEXT(prov, dp_prov_link);
+		DPLANE_UNLOCK();
+
+		if (prov) {
+			TAILQ_FOREACH(ctx, &work_list, zd_q_entries)
+				ctx->zd_provider = prov->dp_id;
+		}
+	}
+
+	/* After all providers have been serviced, enqueue any completed
+	 * work back to zebra so it can process the results.
+	 */
+
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+		zlog_debug("dplane has %d completed work for zebra main",
+			   counter);
+
+	/*
+	 * TODO -- I'd rather hand lists through the api to zebra main,
+	 * to reduce the number of lock/unlock cycles
+	 */
+	for (ctx = TAILQ_FIRST(&work_list); ctx; ) {
+		TAILQ_REMOVE(&work_list, ctx, zd_q_entries);
+
+		/* Call through to zebra main */
+		(*zdplane_info.dg_results_cb)(ctx);
+
+		ctx = TAILQ_FIRST(&work_list);
+	}
+
+done:
+	return 0;
+}
+
 /*
  * Final phase of shutdown, after all work enqueued to dplane has been
  * processed. This is called from the zebra main pthread context.
@@ -1142,11 +1598,51 @@ void zebra_dplane_shutdown(void)
 	zdplane_info.dg_pthread = NULL;
 	zdplane_info.dg_master = NULL;
 
-	/* Notify provider(s) of final shutdown */
+	/* TODO -- Notify provider(s) of final shutdown */
 
-	/* Clean-up provider objects */
+	/* TODO -- Clean-up provider objects */
 
-	/* Clean queue(s) */
+	/* TODO -- Clean queue(s), free memory */
+}
+
+/*
+ * Initialize the dataplane module during startup, internal/private version
+ */
+static void zebra_dplane_init_internal(struct zebra_t *zebra)
+{
+	memset(&zdplane_info, 0, sizeof(zdplane_info));
+
+	pthread_mutex_init(&zdplane_info.dg_mutex, NULL);
+
+	TAILQ_INIT(&zdplane_info.dg_route_ctx_q);
+	TAILQ_INIT(&zdplane_info.dg_providers_q);
+
+	zdplane_info.dg_updates_per_cycle = DPLANE_DEFAULT_NEW_WORK;
+
+	zdplane_info.dg_max_queued_updates = DPLANE_DEFAULT_MAX_QUEUED;
+
+	/* Register default kernel 'provider' during init */
+	dplane_provider_init();
+
+	/* Start dataplane pthread */
+
+	zdplane_info.dg_run = true;
+
+	struct frr_pthread_attr pattr = {
+		.start = frr_pthread_attr_default.start,
+		.stop = frr_pthread_attr_default.stop
+	};
+
+	zdplane_info.dg_pthread = frr_pthread_new(&pattr, "Zebra dplane thread",
+						  "Zebra dplane");
+
+	zdplane_info.dg_master = zdplane_info.dg_pthread->master;
+
+	/* Enqueue an initial event for the dataplane pthread */
+	thread_add_event(zdplane_info.dg_master, dplane_thread_loop, NULL, 0,
+			 &zdplane_info.dg_t_update);
+
+	frr_pthread_run(zdplane_info.dg_pthread, NULL);
 }
 
 /*
diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h
index 999e0f39e4..844fd59600 100644
--- a/zebra/zebra_dplane.h
+++ b/zebra/zebra_dplane.h
@@ -29,7 +29,6 @@
 #include "zebra/rib.h"
 #include "zebra/zserv.h"
 
-
 /* Key netlink info from zebra ns */
 struct zebra_dplane_info {
 	ns_id_t ns_id;
@@ -127,6 +126,12 @@ void dplane_ctx_fini(struct zebra_dplane_ctx **pctx);
 void dplane_ctx_enqueue_tail(struct dplane_ctx_q *q,
 			     const struct zebra_dplane_ctx *ctx);
 
+/* Append a list of context blocks to another list - again, just keeping
+ * the context struct opaque.
+ */
+void dplane_ctx_list_append(struct dplane_ctx_q *to_list,
+			    struct dplane_ctx_q *from_list);
+
 /* Dequeue a context block from the head of caller's tailq */
 void dplane_ctx_dequeue(struct dplane_ctx_q *q, struct zebra_dplane_ctx **ctxp);
 
@@ -135,6 +140,8 @@ void dplane_ctx_dequeue(struct dplane_ctx_q *q, struct zebra_dplane_ctx **ctxp);
  */
 enum zebra_dplane_result dplane_ctx_get_status(
 	const struct zebra_dplane_ctx *ctx);
+void dplane_ctx_set_status(struct zebra_dplane_ctx *ctx,
+			   enum zebra_dplane_result status);
 const char *dplane_res2str(enum zebra_dplane_result res);
 
 enum dplane_op_e dplane_ctx_get_op(const struct zebra_dplane_ctx *ctx);
@@ -142,6 +149,15 @@ const char *dplane_op2str(enum dplane_op_e op);
 
 const struct prefix *dplane_ctx_get_dest(const struct zebra_dplane_ctx *ctx);
 
+/* Retrieve last/current provider id */
+uint32_t dplane_ctx_get_provider(const struct zebra_dplane_ctx *ctx);
+
+/* Providers running before the kernel can control whether a kernel
+ * update should be done.
+ */
+void dplane_ctx_set_skip_kernel(struct zebra_dplane_ctx *ctx);
+bool dplane_ctx_is_skip_kernel(const struct zebra_dplane_ctx *ctx);
+
 /* Source prefix is a little special - use convention to return NULL
  * to mean "no src prefix"
  */
@@ -212,9 +228,11 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed);
 
 
 /*
- * Dataplane providers: modules that consume dataplane events.
+ * Dataplane providers: modules that process or consume dataplane events.
  */
 
+struct zebra_dplane_provider;
+
 /* Support string name for a dataplane provider */
 #define DPLANE_PROVIDER_NAMELEN 64
 
@@ -223,7 +241,7 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed);
  * followed by the kernel, followed by some post-processing step (such as
  * the fpm output stream.)
  */
-enum dplane_provider_prio_e {
+enum dplane_provider_prio {
 	DPLANE_PRIO_NONE = 0,
 	DPLANE_PRIO_PREPROCESS,
 	DPLANE_PRIO_PRE_KERNEL,
@@ -232,28 +250,72 @@ enum dplane_provider_prio_e {
 	DPLANE_PRIO_LAST
 };
 
-/* Provider's entry-point to process a context block */
-typedef int (*dplane_provider_process_fp)(struct zebra_dplane_ctx *ctx);
-
-/* Provider's entry-point for shutdown and cleanup */
-typedef int (*dplane_provider_fini_fp)(void);
-
-/* Provider registration */
-int dplane_provider_register(const char *name,
-			     enum dplane_provider_prio_e prio,
-			     dplane_provider_process_fp fp,
-			     dplane_provider_fini_fp fini_fp);
-
-/*
- * Results are returned to zebra core via a callback
+/* Provider's entry-point for incoming work, called in the context of the
+ * dataplane pthread. The dataplane pthread enqueues any new work to the
+ * provider's 'inbound' queue, then calls the callback. The dataplane
+ * then checks the provider's outbound queue.
  */
-typedef int (*dplane_results_fp)(const struct zebra_dplane_ctx *ctx);
+typedef int (*dplane_provider_process_fp)(struct zebra_dplane_provider *prov);
+
+/* Provider's entry-point for shutdown and cleanup. Called with 'early'
+ * during shutdown, to indicate that the dataplane subsystem is allowing
+ * work to move through the providers and finish. When called without 'early',
+ * the provider should release all resources (if it has any allocated).
+ */
+typedef int (*dplane_provider_fini_fp)(struct zebra_dplane_provider *prov,
+				       bool early);
+
+/* Flags values used during provider registration. */
+#define DPLANE_PROV_FLAGS_DEFAULT  0x0
+
+/* Provider will be spawning its own worker thread */
+#define DPLANE_PROV_FLAG_THREADED  0x1
+
+
+/* Provider registration: ordering or priority value, callbacks, and optional
+ * opaque data value.
+ */
+int dplane_provider_register(const char *name,
+			     enum dplane_provider_prio prio,
+			     int flags,
+			     dplane_provider_process_fp fp,
+			     dplane_provider_fini_fp fini_fp,
+			     void *data);
+
+/* Accessors for provider attributes */
+const char *dplane_provider_get_name(const struct zebra_dplane_provider *prov);
+uint32_t dplane_provider_get_id(const struct zebra_dplane_provider *prov);
+void *dplane_provider_get_data(const struct zebra_dplane_provider *prov);
+bool dplane_provider_is_threaded(const struct zebra_dplane_provider *prov);
+
+/* Providers should limit number of updates per work cycle */
+int dplane_provider_get_work_limit(const struct zebra_dplane_provider *prov);
+
+/* Provider api to signal that work/events are available
+ * for the dataplane pthread.
+ */
+int dplane_provider_work_ready(void);
+
+/* Dequeue, maintain associated counter and locking */
+struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
+	struct zebra_dplane_provider *prov);
+
+/* Dequeue work to a list, maintain counter and locking, return count */
+int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
+				    struct dplane_ctx_q *listp);
+
+/* Enqueue, maintain associated counter and locking */
+void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov,
+				     struct zebra_dplane_ctx *ctx);
 
 /*
  * Zebra registers a results callback with the dataplane. The callback is
- * called in the dataplane thread context, so the expectation is that the
- * context is queued (or that processing is very limited).
+ * called in the dataplane pthread context, so the expectation is that the
+ * context is queued for the zebra main pthread or that processing
+ * is very limited.
  */
+typedef int (*dplane_results_fp)(struct zebra_dplane_ctx *ctx);
+
 int dplane_results_register(dplane_results_fp fp);
 
 /*
@@ -264,7 +326,8 @@ void zebra_dplane_init(void);
 
 /* Finalize/cleanup apis, one called early as shutdown is starting,
  * one called late at the end of zebra shutdown, and then one called
- * from the zebra main thread to stop the dplane thread free all resources.
+ * from the zebra main pthread to stop the dplane pthread and
+ * free all resources.
  *
  * Zebra expects to try to clean up all vrfs and all routes during
  * shutdown, so the dplane must be available until very late.
diff --git a/zebra/zebra_rib.c b/zebra/zebra_rib.c
index 8285392527..f161d5875b 100644
--- a/zebra/zebra_rib.c
+++ b/zebra/zebra_rib.c
@@ -1932,11 +1932,10 @@ static void rib_process_after(struct zebra_dplane_ctx *ctx)
 	op = dplane_ctx_get_op(ctx);
 	status = dplane_ctx_get_status(ctx);
 
-	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) {
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
 		zlog_debug("%u:%s Processing dplane ctx %p, op %s result %s",
 			   dplane_ctx_get_vrf(ctx), dest_str, ctx,
 			   dplane_op2str(op), dplane_res2str(status));
-	}
 
 	if (op == DPLANE_OP_ROUTE_DELETE) {
 		/*
@@ -3289,7 +3288,7 @@ static int rib_process_dplane_results(struct thread *thread)
  * the dataplane pthread. We enqueue the results here for processing by
  * the main thread later.
  */
-static int rib_dplane_results(const struct zebra_dplane_ctx *ctx)
+static int rib_dplane_results(struct zebra_dplane_ctx *ctx)
 {
 	/* Take lock controlling queue of results */
 	pthread_mutex_lock(&dplane_mutex);

From e5a60d8259d4b969826e622133dff5a6d8330903 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Fri, 7 Sep 2018 15:46:13 -0400
Subject: [PATCH 04/10] zebra: reorg dataplane pthread start

Move dataplane pthread start later in the zebra startup; make
the 'test' dplane provider conditional

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/main.c         |  3 +++
 zebra/zebra_dplane.c | 35 ++++++++++++++++++++++++++++-------
 zebra/zebra_dplane.h |  6 ++++++
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/zebra/main.c b/zebra/main.c
index 20a2185b1b..e5160c1a51 100644
--- a/zebra/main.c
+++ b/zebra/main.c
@@ -448,6 +448,9 @@ int main(int argc, char **argv)
 	/* Needed for BSD routing socket. */
 	pid = getpid();
 
+	/* Start dataplane system */
+	zebra_dplane_start();
+
 	/* Start Zebra API server */
 	zserv_start(zserv_path);
 
diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index c859bdaf6e..34de6a956e 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -38,10 +38,12 @@ DEFINE_MTYPE(ZEBRA, DP_PROV, "Zebra DPlane Provider")
 #  define AOK 0
 #endif
 
+/* Enable test dataplane provider */
+/*#define DPLANE_TEST_PROVIDER 1 */
+
 /* Default value for max queued incoming updates */
 const uint32_t DPLANE_DEFAULT_MAX_QUEUED = 200;
 
-
 /* Default value for new work per cycle */
 const uint32_t DPLANE_DEFAULT_NEW_WORK = 100;
 
@@ -1135,9 +1137,16 @@ bool dplane_provider_is_threaded(const struct zebra_dplane_provider *prov)
  */
 int dplane_provider_work_ready(void)
 {
-	thread_add_event(zdplane_info.dg_master,
-			 dplane_thread_loop, NULL, 0,
-			 &zdplane_info.dg_t_update);
+	/* Note that during zebra startup, we may be offered work before
+	 * the dataplane pthread (and thread-master) are ready. We want to
+	 * enqueue the work, but the event-scheduling machinery may not be
+	 * available.
+	 */
+	if (zdplane_info.dg_run) {
+		thread_add_event(zdplane_info.dg_master,
+				 dplane_thread_loop, NULL, 0,
+				 &zdplane_info.dg_t_update);
+	}
 
 	return AOK;
 }
@@ -1217,6 +1226,8 @@ static int kernel_dplane_process_func(struct zebra_dplane_provider *prov)
 	return 0;
 }
 
+#if DPLANE_TEST_PROVIDER
+
 /*
  * Test dataplane provider plugin
  */
@@ -1270,6 +1281,7 @@ static int test_dplane_shutdown_func(struct zebra_dplane_provider *prov,
 
 	return 0;
 }
+#endif	/* DPLANE_TEST_PROVIDER */
 
 /*
  * Register default kernel provider
@@ -1289,7 +1301,8 @@ static void dplane_provider_init(void)
 		zlog_err("Unable to register kernel dplane provider: %d",
 			 ret);
 
-	/* TODO -- make the test provider optional... */
+#if DPLANE_TEST_PROVIDER
+	/* Optional test provider ... */
 	ret = dplane_provider_register("Test",
 				       DPLANE_PRIO_PRE_KERNEL,
 				       DPLANE_PROV_FLAGS_DEFAULT,
@@ -1300,6 +1313,7 @@ static void dplane_provider_init(void)
 	if (ret != AOK)
 		zlog_err("Unable to register test dplane provider: %d",
 			 ret);
+#endif	/* DPLANE_TEST_PROVIDER */
 }
 
 /* Indicates zebra shutdown/exit is in progress. Some operations may be
@@ -1623,11 +1637,16 @@ static void zebra_dplane_init_internal(struct zebra_t *zebra)
 
 	/* Register default kernel 'provider' during init */
 	dplane_provider_init();
+}
 
+/*
+ * Start the dataplane pthread. This step needs to be run later than the
+ * 'init' step, in case zebra has fork-ed.
+ */
+void zebra_dplane_start(void)
+{
 	/* Start dataplane pthread */
 
-	zdplane_info.dg_run = true;
-
 	struct frr_pthread_attr pattr = {
 		.start = frr_pthread_attr_default.start,
 		.stop = frr_pthread_attr_default.stop
@@ -1638,6 +1657,8 @@ static void zebra_dplane_init_internal(struct zebra_t *zebra)
 
 	zdplane_info.dg_master = zdplane_info.dg_pthread->master;
 
+	zdplane_info.dg_run = true;
+
 	/* Enqueue an initial event for the dataplane pthread */
 	thread_add_event(zdplane_info.dg_master, dplane_thread_loop, NULL, 0,
 			 &zdplane_info.dg_t_update);
diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h
index 844fd59600..6f471e65a1 100644
--- a/zebra/zebra_dplane.h
+++ b/zebra/zebra_dplane.h
@@ -324,6 +324,12 @@ int dplane_results_register(dplane_results_fp fp);
  */
 void zebra_dplane_init(void);
 
+/*
+ * Start the dataplane pthread. This step needs to be run later than the
+ * 'init' step, in case zebra has fork-ed.
+ */
+void zebra_dplane_start(void);
+
 /* Finalize/cleanup apis, one called early as shutdown is starting,
  * one called late at the end of zebra shutdown, and then one called
  * from the zebra main pthread to stop the dplane pthread and

From 14b0bc8e88555caa394e0414154312f1e6036757 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Mon, 10 Sep 2018 15:17:19 -0400
Subject: [PATCH 05/10] zebra: add initial error handling to dplane loop

Capture error work during dataplane provider processing.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/zebra_dplane.c | 67 +++++++++++++++++++++++++++++++++-----------
 zebra/zebra_dplane.h |  2 +-
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 34de6a956e..a387241385 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -300,7 +300,7 @@ static void dplane_ctx_free(struct zebra_dplane_ctx **pctx)
  */
 void dplane_ctx_fini(struct zebra_dplane_ctx **pctx)
 {
-	/* TODO -- enqueue for next provider; for now, just free */
+	/* TODO -- maintain pool; for now, just free */
 	dplane_ctx_free(pctx);
 }
 
@@ -311,6 +311,18 @@ void dplane_ctx_enqueue_tail(struct dplane_ctx_q *q,
 	TAILQ_INSERT_TAIL(q, (struct zebra_dplane_ctx *)ctx, zd_q_entries);
 }
 
+/* Append a list of context blocks to another list */
+void dplane_ctx_list_append(struct dplane_ctx_q *to_list,
+			    struct dplane_ctx_q *from_list)
+{
+	if (TAILQ_FIRST(from_list)) {
+		TAILQ_CONCAT(to_list, from_list, zd_q_entries);
+
+		/* And clear 'from' list */
+		TAILQ_INIT(from_list);
+	}
+}
+
 /* Dequeue a context block from the head of a list */
 void dplane_ctx_dequeue(struct dplane_ctx_q *q, struct zebra_dplane_ctx **ctxp)
 {
@@ -649,6 +661,7 @@ static int dplane_ctx_route_init(struct zebra_dplane_ctx *ctx,
 		goto done;
 
 	ctx->zd_op = op;
+	ctx->zd_status = ZEBRA_DPLANE_REQUEST_SUCCESS;
 
 	ctx->zd_type = re->type;
 	ctx->zd_old_type = re->type;
@@ -702,7 +715,7 @@ static int dplane_ctx_route_init(struct zebra_dplane_ctx *ctx,
 
 	/* TODO -- maybe use array of nexthops to avoid allocs? */
 
-	/* Ensure that the dplane's nexthop flag is clear. */
+	/* Ensure that the dplane's nexthops flags are clear. */
 	for (ALL_NEXTHOPS(ctx->zd_ng, nexthop))
 		UNSET_FLAG(nexthop->flags, NEXTHOP_FLAG_FIB);
 
@@ -1418,6 +1431,10 @@ void zebra_dplane_finish(void)
  * providers, taking complete work from each one and offering it
  * to the next in order. At each step, a limited number of updates are
  * processed during a cycle in order to provide some fairness.
+ *
+ * This loop through the providers is only run once, so that the dataplane
+ * pthread can look for other pending work - such as i/o work on behalf of
+ * providers.
  */
 static int dplane_thread_loop(struct thread *event)
 {
@@ -1431,7 +1448,10 @@ static int dplane_thread_loop(struct thread *event)
 	/* Capture work limit per cycle */
 	limit = zdplane_info.dg_updates_per_cycle;
 
+	/* Init temporary lists used to move contexts among providers */
 	TAILQ_INIT(&work_list);
+	TAILQ_INIT(&error_list);
+	error_counter = 0;
 
 	/* Check for zebra shutdown */
 	if (!zdplane_info.dg_run)
@@ -1445,16 +1465,13 @@ static int dplane_thread_loop(struct thread *event)
 	/* Locate initial registered provider */
 	prov = TAILQ_FIRST(&zdplane_info.dg_providers_q);
 
+	/* Move new work from incoming list to temp list */
 	for (counter = 0; counter < limit; counter++) {
 		ctx = TAILQ_FIRST(&zdplane_info.dg_route_ctx_q);
 		if (ctx) {
 			TAILQ_REMOVE(&zdplane_info.dg_route_ctx_q, ctx,
 				     zd_q_entries);
 
-			atomic_fetch_sub_explicit(
-				&zdplane_info.dg_routes_queued, 1,
-				memory_order_relaxed);
-
 			ctx->zd_provider = prov->dp_id;
 
 			TAILQ_INSERT_TAIL(&work_list, ctx, zd_q_entries);
@@ -1465,6 +1482,9 @@ static int dplane_thread_loop(struct thread *event)
 
 	DPLANE_UNLOCK();
 
+	atomic_fetch_sub_explicit(&zdplane_info.dg_routes_queued, counter,
+				  memory_order_relaxed);
+
 	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
 		zlog_debug("dplane: incoming new work counter: %d", counter);
 
@@ -1474,6 +1494,10 @@ static int dplane_thread_loop(struct thread *event)
 	 */
 	while (prov) {
 
+		/* At each iteration, the temporary work list has 'counter'
+		 * items.
+		 */
+
 		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
 			zlog_debug("dplane enqueues %d new work to provider '%s'",
 				   counter, dplane_provider_get_name(prov));
@@ -1519,11 +1543,16 @@ static int dplane_thread_loop(struct thread *event)
 		if (dplane_provider_is_threaded(prov))
 			DPLANE_PROV_UNLOCK(prov);
 
-		/* Reset the temp list */
+		/* Reset the temp list (though the 'concat' may have done this
+		 * already), and the counter
+		 */
 		TAILQ_INIT(&work_list);
 		counter = 0;
 
-		/* Call into the provider code */
+		/* Call into the provider code. Note that this is
+		 * unconditional: we offer to do work even if we don't enqueue
+		 * any _new_ work.
+		 */
 		(*prov->dp_fp)(prov);
 
 		/* Check for zebra shutdown */
@@ -1558,25 +1587,29 @@ static int dplane_thread_loop(struct thread *event)
 		DPLANE_LOCK();
 		prov = TAILQ_NEXT(prov, dp_prov_link);
 		DPLANE_UNLOCK();
-
-		if (prov) {
-			TAILQ_FOREACH(ctx, &work_list, zd_q_entries)
-				ctx->zd_provider = prov->dp_id;
-		}
 	}
 
 	/* After all providers have been serviced, enqueue any completed
-	 * work back to zebra so it can process the results.
+	 * work and any errors back to zebra so it can process the results.
 	 */
-
 	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
-		zlog_debug("dplane has %d completed work for zebra main",
-			   counter);
+		zlog_debug("dplane has %d completed, %d errors, for zebra main",
+			   counter, error_counter);
 
 	/*
 	 * TODO -- I'd rather hand lists through the api to zebra main,
 	 * to reduce the number of lock/unlock cycles
 	 */
+	for (ctx = TAILQ_FIRST(&error_list); ctx; ) {
+		TAILQ_REMOVE(&error_list, ctx, zd_q_entries);
+
+		/* Call through to zebra main */
+		(*zdplane_info.dg_results_cb)(ctx);
+
+		ctx = TAILQ_FIRST(&error_list);
+	}
+
+
 	for (ctx = TAILQ_FIRST(&work_list); ctx; ) {
 		TAILQ_REMOVE(&work_list, ctx, zd_q_entries);
 
diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h
index 6f471e65a1..60dfb07fed 100644
--- a/zebra/zebra_dplane.h
+++ b/zebra/zebra_dplane.h
@@ -120,7 +120,7 @@ TAILQ_HEAD(dplane_ctx_q, zebra_dplane_ctx);
  */
 void dplane_ctx_fini(struct zebra_dplane_ctx **pctx);
 
-/* Enqueue a context block to caller's tailq. This just exists so that the
+/* Enqueue a context block to caller's tailq. This exists so that the
  * context struct can remain opaque.
  */
 void dplane_ctx_enqueue_tail(struct dplane_ctx_q *q,

From 68b375e05956cdd1ce935fd54939312b75e3a546 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Mon, 10 Sep 2018 15:36:30 -0400
Subject: [PATCH 06/10] zebra: revise dplane dequeue api

Change the dataplane context dequeue api used by zebra to make the
purpose a bit clearer.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/zebra_dplane.c | 4 ++--
 zebra/zebra_dplane.h | 2 +-
 zebra/zebra_rib.c    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index a387241385..24975697ea 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -324,14 +324,14 @@ void dplane_ctx_list_append(struct dplane_ctx_q *to_list,
 }
 
 /* Dequeue a context block from the head of a list */
-void dplane_ctx_dequeue(struct dplane_ctx_q *q, struct zebra_dplane_ctx **ctxp)
+struct zebra_dplane_ctx *dplane_ctx_dequeue(struct dplane_ctx_q *q)
 {
 	struct zebra_dplane_ctx *ctx = TAILQ_FIRST(q);
 
 	if (ctx)
 		TAILQ_REMOVE(q, ctx, zd_q_entries);
 
-	*ctxp = ctx;
+	return ctx;
 }
 
 /*
diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h
index 60dfb07fed..2e64540d6b 100644
--- a/zebra/zebra_dplane.h
+++ b/zebra/zebra_dplane.h
@@ -133,7 +133,7 @@ void dplane_ctx_list_append(struct dplane_ctx_q *to_list,
 			    struct dplane_ctx_q *from_list);
 
 /* Dequeue a context block from the head of caller's tailq */
-void dplane_ctx_dequeue(struct dplane_ctx_q *q, struct zebra_dplane_ctx **ctxp);
+struct zebra_dplane_ctx *dplane_ctx_dequeue(struct dplane_ctx_q *q);
 
 /*
  * Accessors for information from the context object
diff --git a/zebra/zebra_rib.c b/zebra/zebra_rib.c
index f161d5875b..f2d07310ee 100644
--- a/zebra/zebra_rib.c
+++ b/zebra/zebra_rib.c
@@ -3266,7 +3266,7 @@ static int rib_process_dplane_results(struct thread *thread)
 		pthread_mutex_lock(&dplane_mutex);
 		{
 			/* Dequeue context block */
-			dplane_ctx_dequeue(&rib_dplane_q, &ctx);
+			ctx = dplane_ctx_dequeue(&rib_dplane_q);
 		}
 		pthread_mutex_unlock(&dplane_mutex);
 

From c9d17fe85ea55f011381dc98ba927bd107fd9d19 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Tue, 11 Sep 2018 16:42:36 -0400
Subject: [PATCH 07/10] zebra: improve dataplane shutdown checks

Update the dataplane shutdown checks to include the providers.
Also revise the typedef for provider structs to make const
work.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/zebra_dplane.c | 71 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 24975697ea..5fd27dc830 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -162,8 +162,10 @@ struct zebra_dplane_provider {
 	dplane_provider_fini_fp dp_fini;
 
 	_Atomic uint32_t dp_in_counter;
+	_Atomic uint32_t dp_in_queued;
 	_Atomic uint32_t dp_in_max;
 	_Atomic uint32_t dp_out_counter;
+	_Atomic uint32_t dp_out_queued;
 	_Atomic uint32_t dp_out_max;
 	_Atomic uint32_t dp_error_counter;
 
@@ -959,8 +961,8 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed)
 		out_max = atomic_load_explicit(&prov->dp_out_max,
 					       memory_order_relaxed);
 
-		vty_out(vty, "%s (%u): in: %"PRIu64", max: %"PRIu64", "
-			"out: %"PRIu64", max: %"PRIu64"\n",
+		vty_out(vty, "%s (%u): in: %"PRIu64", q_max: %"PRIu64", "
+			"out: %"PRIu64", q_max: %"PRIu64"\n",
 			prov->dp_name, prov->dp_id, in, in_max, out, out_max);
 
 		DPLANE_LOCK();
@@ -1081,6 +1083,9 @@ struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
 	ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
 	if (ctx) {
 		TAILQ_REMOVE(&(prov->dp_ctx_in_q), ctx, zd_q_entries);
+
+		atomic_fetch_sub_explicit(&prov->dp_in_queued, 1,
+					  memory_order_relaxed);
 	}
 
 	if (dplane_provider_is_threaded(prov))
@@ -1114,6 +1119,10 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
 		}
 	}
 
+	if (ret > 0)
+		atomic_fetch_sub_explicit(&prov->dp_in_queued, ret,
+					  memory_order_relaxed);
+
 	if (dplane_provider_is_threaded(prov))
 		DPLANE_PROV_UNLOCK(prov);
 
@@ -1272,6 +1281,10 @@ static int test_dplane_process_func(struct zebra_dplane_provider *prov)
 		dplane_provider_enqueue_out_ctx(prov, ctx);
 	}
 
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+		zlog_debug("dplane provider '%s': processed %d",
+			   dplane_provider_get_name(prov), counter);
+
 	/* Ensure that we'll run the work loop again if there's still
 	 * more work to do.
 	 */
@@ -1342,7 +1355,7 @@ bool dplane_is_in_shutdown(void)
  * early during zebra shutdown, as a signal to stop new work and prepare
  * for updates generated by shutdown/cleanup activity, as zebra tries to
  * remove everything it's responsible for.
- * NB: This runs in the main zebra thread context.
+ * NB: This runs in the main zebra pthread context.
  */
 void zebra_dplane_pre_finish(void)
 {
@@ -1351,7 +1364,7 @@ void zebra_dplane_pre_finish(void)
 
 	zdplane_info.dg_is_shutdown = true;
 
-	/* Notify provider(s) of pending shutdown */
+	/* TODO -- Notify provider(s) of pending shutdown */
 }
 
 /*
@@ -1360,7 +1373,9 @@ void zebra_dplane_pre_finish(void)
  */
 static bool dplane_work_pending(void)
 {
+	bool ret = false;
 	struct zebra_dplane_ctx *ctx;
+	struct zebra_dplane_provider *prov;
 
 	/* TODO -- just checking incoming/pending work for now, must check
 	 * providers
@@ -1368,10 +1383,40 @@ static bool dplane_work_pending(void)
 	DPLANE_LOCK();
 	{
 		ctx = TAILQ_FIRST(&zdplane_info.dg_route_ctx_q);
+		prov = TAILQ_FIRST(&zdplane_info.dg_providers_q);
 	}
 	DPLANE_UNLOCK();
 
-	return (ctx != NULL);
+	if (ctx != NULL) {
+		ret = true;
+		goto done;
+	}
+
+	while (prov) {
+
+		if (dplane_provider_is_threaded(prov))
+			DPLANE_PROV_LOCK(prov);
+
+		ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
+		if (ctx == NULL)
+			ctx = TAILQ_FIRST(&(prov->dp_ctx_out_q));
+
+		if (dplane_provider_is_threaded(prov))
+			DPLANE_PROV_UNLOCK(prov);
+
+		if (ctx != NULL)
+			break;
+
+		DPLANE_LOCK();
+		prov = TAILQ_NEXT(prov, dp_prov_link);
+		DPLANE_UNLOCK();
+	}
+
+	if (ctx != NULL)
+		ret = true;
+
+done:
+	return ret;
 }
 
 /*
@@ -1443,7 +1488,7 @@ static int dplane_thread_loop(struct thread *event)
 	struct zebra_dplane_provider *prov;
 	struct zebra_dplane_ctx *ctx, *tctx;
 	int limit, counter, error_counter;
-	uint64_t lval;
+	uint64_t curr, high;
 
 	/* Capture work limit per cycle */
 	limit = zdplane_info.dg_updates_per_cycle;
@@ -1534,10 +1579,16 @@ static int dplane_thread_loop(struct thread *event)
 			TAILQ_CONCAT(&(prov->dp_ctx_in_q), &work_list,
 				     zd_q_entries);
 
-		lval = atomic_add_fetch_explicit(&prov->dp_in_counter, counter,
-						 memory_order_relaxed);
-		if (lval > prov->dp_in_max)
-			atomic_store_explicit(&prov->dp_in_max, lval,
+		atomic_fetch_add_explicit(&prov->dp_in_counter, counter,
+					  memory_order_relaxed);
+		atomic_fetch_add_explicit(&prov->dp_in_queued, counter,
+					  memory_order_relaxed);
+		curr = atomic_load_explicit(&prov->dp_in_queued,
+					    memory_order_relaxed);
+		high = atomic_load_explicit(&prov->dp_in_max,
+					    memory_order_relaxed);
+		if (curr > high)
+			atomic_store_explicit(&prov->dp_in_max, curr,
 					      memory_order_relaxed);
 
 		if (dplane_provider_is_threaded(prov))

From ad6aad4d0bc06f7711d05e1d05576ea25aac04c5 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Thu, 27 Sep 2018 15:10:54 -0400
Subject: [PATCH 08/10] zebra: dplane lock and thread_master apis

Improve, simplify dataplane provider locking apis. Add accessor
for dataplane pthread's thread_master, for use by providers who
need to use the thread/event apis.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/zebra_dplane.c | 58 +++++++++++++++++++++++++-------------------
 zebra/zebra_dplane.h | 11 ++++++++-
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 5fd27dc830..95df243829 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -254,6 +254,12 @@ static int dplane_thread_loop(struct thread *event);
  * Public APIs
  */
 
+/* Obtain thread_master for dataplane thread */
+struct thread_master *dplane_get_thread_master(void)
+{
+	return zdplane_info.dg_master;
+}
+
 /*
  * Allocate a dataplane update context
  */
@@ -1069,6 +1075,21 @@ int dplane_provider_get_work_limit(const struct zebra_dplane_provider *prov)
 	return zdplane_info.dg_updates_per_cycle;
 }
 
+/* Lock/unlock a provider's mutex - iff the provider was registered with
+ * the THREADED flag.
+ */
+void dplane_provider_lock(struct zebra_dplane_provider *prov)
+{
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_LOCK(prov);
+}
+
+void dplane_provider_unlock(struct zebra_dplane_provider *prov)
+{
+	if (dplane_provider_is_threaded(prov))
+		DPLANE_PROV_UNLOCK(prov);
+}
+
 /*
  * Dequeue and maintain associated counter
  */
@@ -1077,8 +1098,7 @@ struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
 {
 	struct zebra_dplane_ctx *ctx = NULL;
 
-	if (dplane_provider_is_threaded(prov))
-		DPLANE_PROV_LOCK(prov);
+	dplane_provider_lock(prov);
 
 	ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
 	if (ctx) {
@@ -1088,8 +1108,7 @@ struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
 					  memory_order_relaxed);
 	}
 
-	if (dplane_provider_is_threaded(prov))
-		DPLANE_PROV_UNLOCK(prov);
+	dplane_provider_unlock(prov);
 
 	return ctx;
 }
@@ -1105,8 +1124,7 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
 
 	limit = zdplane_info.dg_updates_per_cycle;
 
-	if (dplane_provider_is_threaded(prov))
-		DPLANE_PROV_LOCK(prov);
+	dplane_provider_lock(prov);
 
 	for (ret = 0; ret < limit; ret++) {
 		ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
@@ -1123,8 +1141,7 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
 		atomic_fetch_sub_explicit(&prov->dp_in_queued, ret,
 					  memory_order_relaxed);
 
-	if (dplane_provider_is_threaded(prov))
-		DPLANE_PROV_UNLOCK(prov);
+	dplane_provider_unlock(prov);
 
 	return ret;
 }
@@ -1135,14 +1152,12 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
 void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov,
 				     struct zebra_dplane_ctx *ctx)
 {
-	if (dplane_provider_is_threaded(prov))
-		DPLANE_PROV_LOCK(prov);
+	dplane_provider_lock(prov);
 
 	TAILQ_INSERT_TAIL(&(prov->dp_ctx_out_q), ctx,
 			  zd_q_entries);
 
-	if (dplane_provider_is_threaded(prov))
-		DPLANE_PROV_UNLOCK(prov);
+	dplane_provider_unlock(prov);
 
 	atomic_fetch_add_explicit(&(prov->dp_out_counter), 1,
 				  memory_order_relaxed);
@@ -1394,15 +1409,13 @@ static bool dplane_work_pending(void)
 
 	while (prov) {
 
-		if (dplane_provider_is_threaded(prov))
-			DPLANE_PROV_LOCK(prov);
+		dplane_provider_lock(prov);
 
 		ctx = TAILQ_FIRST(&(prov->dp_ctx_in_q));
 		if (ctx == NULL)
 			ctx = TAILQ_FIRST(&(prov->dp_ctx_out_q));
 
-		if (dplane_provider_is_threaded(prov))
-			DPLANE_PROV_UNLOCK(prov);
+		dplane_provider_unlock(prov);
 
 		if (ctx != NULL)
 			break;
@@ -1542,7 +1555,6 @@ static int dplane_thread_loop(struct thread *event)
 		/* At each iteration, the temporary work list has 'counter'
 		 * items.
 		 */
-
 		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
 			zlog_debug("dplane enqueues %d new work to provider '%s'",
 				   counter, dplane_provider_get_name(prov));
@@ -1572,8 +1584,7 @@ static int dplane_thread_loop(struct thread *event)
 		}
 
 		/* Enqueue new work to the provider */
-		if (dplane_provider_is_threaded(prov))
-			DPLANE_PROV_LOCK(prov);
+		dplane_provider_lock(prov);
 
 		if (TAILQ_FIRST(&work_list))
 			TAILQ_CONCAT(&(prov->dp_ctx_in_q), &work_list,
@@ -1591,8 +1602,7 @@ static int dplane_thread_loop(struct thread *event)
 			atomic_store_explicit(&prov->dp_in_max, curr,
 					      memory_order_relaxed);
 
-		if (dplane_provider_is_threaded(prov))
-			DPLANE_PROV_UNLOCK(prov);
+		dplane_provider_unlock(prov);
 
 		/* Reset the temp list (though the 'concat' may have done this
 		 * already), and the counter
@@ -1611,8 +1621,7 @@ static int dplane_thread_loop(struct thread *event)
 			break;
 
 		/* Dequeue completed work from the provider */
-		if (dplane_provider_is_threaded(prov))
-			DPLANE_PROV_LOCK(prov);
+		dplane_provider_lock(prov);
 
 		while (counter < limit) {
 			ctx = TAILQ_FIRST(&(prov->dp_ctx_out_q));
@@ -1627,8 +1636,7 @@ static int dplane_thread_loop(struct thread *event)
 				break;
 		}
 
-		if (dplane_provider_is_threaded(prov))
-			DPLANE_PROV_UNLOCK(prov);
+		dplane_provider_unlock(prov);
 
 		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
 			zlog_debug("dplane dequeues %d completed work from provider %s",
diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h
index 2e64540d6b..b6b2e64600 100644
--- a/zebra/zebra_dplane.h
+++ b/zebra/zebra_dplane.h
@@ -288,7 +288,16 @@ uint32_t dplane_provider_get_id(const struct zebra_dplane_provider *prov);
 void *dplane_provider_get_data(const struct zebra_dplane_provider *prov);
 bool dplane_provider_is_threaded(const struct zebra_dplane_provider *prov);
 
-/* Providers should limit number of updates per work cycle */
+/* Lock/unlock a provider's mutex - iff the provider was registered with
+ * the THREADED flag.
+ */
+void dplane_provider_lock(struct zebra_dplane_provider *prov);
+void dplane_provider_unlock(struct zebra_dplane_provider *prov);
+
+/* Obtain thread_master for dataplane thread */
+struct thread_master *dplane_get_thread_master(void);
+
+/* Providers should (generally) limit number of updates per work cycle */
 int dplane_provider_get_work_limit(const struct zebra_dplane_provider *prov);
 
 /* Provider api to signal that work/events are available

From 62b8bb7a1776ea98e5874a4a057617e335a7f77c Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Mon, 12 Nov 2018 15:57:03 -0500
Subject: [PATCH 09/10] zebra: separate netlink socket for dataplane

Use a separate netlink socket for the dataplane's updates, to
avoid races between the dataplane pthread and the zebra main
pthread. Revise zebra shutdown so that the dataplane netlink
socket is cleaned-up later, after all shutdown-time dataplane
work has been done.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/kernel_netlink.c | 65 +++++++++++++++++++++++++++++++++---------
 zebra/kernel_socket.c  |  2 +-
 zebra/main.c           |  5 +++-
 zebra/rt.h             |  2 +-
 zebra/zebra_dplane.c   | 27 ++++++++++++++++--
 zebra/zebra_ns.c       | 39 +++++++++++++++++++++----
 zebra/zebra_ns.h       |  8 ++++--
 7 files changed, 120 insertions(+), 28 deletions(-)

diff --git a/zebra/kernel_netlink.c b/zebra/kernel_netlink.c
index 0772c59b92..afc3985854 100644
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@@ -396,7 +396,7 @@ static int kernel_read(struct thread *thread)
 
 /*
  * Filter out messages from self that occur on listener socket,
- * caused by our actions on the command socket
+ * caused by our actions on the command socket(s)
  *
  * When we add new Netlink message types we probably
  * do not need to add them here as that we are filtering
@@ -407,7 +407,7 @@ static int kernel_read(struct thread *thread)
  * so that we only had to write one way to handle incoming
  * address add/delete changes.
  */
-static void netlink_install_filter(int sock, __u32 pid)
+static void netlink_install_filter(int sock, __u32 pid, __u32 dplane_pid)
 {
 	/*
 	 * BPF_JUMP instructions and where you jump to are based upon
@@ -418,7 +418,8 @@ static void netlink_install_filter(int sock, __u32 pid)
 	struct sock_filter filter[] = {
 		/*
 		 * Logic:
-		 *   if (nlmsg_pid == pid) {
+		 *   if (nlmsg_pid == pid ||
+		 *       nlmsg_pid == dplane_pid) {
 		 *       if (the incoming nlmsg_type ==
 		 *           RTM_NEWADDR | RTM_DELADDR)
 		 *           keep this message
@@ -435,26 +436,30 @@ static void netlink_install_filter(int sock, __u32 pid)
 		/*
 		 * 1: Compare to pid
 		 */
-		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 0, 4),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
 		/*
-		 * 2: Load the nlmsg_type into BPF register
+		 * 2: Compare to dplane pid
+		 */
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
+		/*
+		 * 3: Load the nlmsg_type into BPF register
 		 */
 		BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
 			 offsetof(struct nlmsghdr, nlmsg_type)),
 		/*
-		 * 3: Compare to RTM_NEWADDR
+		 * 4: Compare to RTM_NEWADDR
 		 */
 		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
 		/*
-		 * 4: Compare to RTM_DELADDR
+		 * 5: Compare to RTM_DELADDR
 		 */
 		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
 		/*
-		 * 5: This is the end state of we want to skip the
+		 * 6: This is the end state of we want to skip the
 		 *    message
 		 */
 		BPF_STMT(BPF_RET | BPF_K, 0),
-		/* 6: This is the end state of we want to keep
+		/* 7: This is the end state of we want to keep
 		 *     the message
 		 */
 		BPF_STMT(BPF_RET | BPF_K, 0xffff),
@@ -1102,6 +1107,15 @@ void kernel_init(struct zebra_ns *zns)
 		exit(-1);
 	}
 
+	snprintf(zns->netlink_dplane.name, sizeof(zns->netlink_dplane.name),
+		 "netlink-dp (NS %u)", zns->ns_id);
+	zns->netlink_dplane.sock = -1;
+	if (netlink_socket(&zns->netlink_dplane, 0, zns->ns_id) < 0) {
+		zlog_err("Failure to create %s socket",
+			 zns->netlink_dplane.name);
+		exit(-1);
+	}
+
 	/*
 	 * SOL_NETLINK is not available on all platforms yet
 	 * apparently.  It's in bits/socket.h which I am not
@@ -1110,14 +1124,22 @@ void kernel_init(struct zebra_ns *zns)
 #if defined SOL_NETLINK
 	/*
 	 * Let's tell the kernel that we want to receive extended
-	 * ACKS over our command socket
+	 * ACKS over our command socket(s)
 	 */
 	one = 1;
 	ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
 			 &one, sizeof(one));
 
 	if (ret < 0)
-		zlog_notice("Registration for extended ACK failed : %d %s",
+		zlog_notice("Registration for extended cmd ACK failed : %d %s",
+			    errno, safe_strerror(errno));
+
+	one = 1;
+	ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_EXT_ACK,
+			 &one, sizeof(one));
+
+	if (ret < 0)
+		zlog_notice("Registration for extended dp ACK failed : %d %s",
 			    errno, safe_strerror(errno));
 #endif
 
@@ -1130,12 +1152,18 @@ void kernel_init(struct zebra_ns *zns)
 		zlog_err("Can't set %s socket error: %s(%d)",
 			 zns->netlink_cmd.name, safe_strerror(errno), errno);
 
+	if (fcntl(zns->netlink_dplane.sock, F_SETFL, O_NONBLOCK) < 0)
+		zlog_err("Can't set %s socket error: %s(%d)",
+			 zns->netlink_dplane.name, safe_strerror(errno), errno);
+
 	/* Set receive buffer size if it's set from command line */
 	if (nl_rcvbufsize)
 		netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
 
 	netlink_install_filter(zns->netlink.sock,
-			       zns->netlink_cmd.snl.nl_pid);
+			       zns->netlink_cmd.snl.nl_pid,
+			       zns->netlink_dplane.snl.nl_pid);
+
 	zns->t_netlink = NULL;
 
 	thread_add_read(zebrad.master, kernel_read, zns,
@@ -1144,7 +1172,7 @@ void kernel_init(struct zebra_ns *zns)
 	rt_netlink_init();
 }
 
-void kernel_terminate(struct zebra_ns *zns)
+void kernel_terminate(struct zebra_ns *zns, bool complete)
 {
 	THREAD_READ_OFF(zns->t_netlink);
 
@@ -1157,6 +1185,15 @@ void kernel_terminate(struct zebra_ns *zns)
 		close(zns->netlink_cmd.sock);
 		zns->netlink_cmd.sock = -1;
 	}
-}
 
+	/* During zebra shutdown, we need to leave the dataplane socket
+	 * around until all work is done.
+	 */
+	if (complete) {
+		if (zns->netlink_dplane.sock >= 0) {
+			close(zns->netlink_dplane.sock);
+			zns->netlink_dplane.sock = -1;
+		}
+	}
+}
 #endif /* HAVE_NETLINK */
diff --git a/zebra/kernel_socket.c b/zebra/kernel_socket.c
index 7af3083fd2..ff739ae79b 100644
--- a/zebra/kernel_socket.c
+++ b/zebra/kernel_socket.c
@@ -1415,7 +1415,7 @@ void kernel_init(struct zebra_ns *zns)
 	routing_socket(zns);
 }
 
-void kernel_terminate(struct zebra_ns *zns)
+void kernel_terminate(struct zebra_ns *zns, bool complete)
 {
 	return;
 }
diff --git a/zebra/main.c b/zebra/main.c
index e5160c1a51..5b5ee8259a 100644
--- a/zebra/main.c
+++ b/zebra/main.c
@@ -172,7 +172,7 @@ static void sigint(void)
 		work_queue_free_and_null(&zebrad.lsp_process_q);
 	vrf_terminate();
 
-	ns_walk_func(zebra_ns_disabled);
+	ns_walk_func(zebra_ns_early_shutdown);
 	zebra_ns_notify_close();
 
 	access_list_reset();
@@ -196,6 +196,9 @@ int zebra_finalize(struct thread *dummy)
 {
 	zlog_info("Zebra final shutdown");
 
+	/* Final shutdown of ns resources */
+	ns_walk_func(zebra_ns_final_shutdown);
+
 	/* Stop dplane thread and finish any cleanup */
 	zebra_dplane_shutdown();
 
diff --git a/zebra/rt.h b/zebra/rt.h
index 70ac6f635c..0317dc85ba 100644
--- a/zebra/rt.h
+++ b/zebra/rt.h
@@ -86,7 +86,7 @@ extern int kernel_del_neigh(struct interface *ifp, struct ipaddr *ip);
  */
 extern void interface_list(struct zebra_ns *zns);
 extern void kernel_init(struct zebra_ns *zns);
-extern void kernel_terminate(struct zebra_ns *zns);
+extern void kernel_terminate(struct zebra_ns *zns, bool complete);
 extern void macfdb_read(struct zebra_ns *zns);
 extern void macfdb_read_for_bridge(struct zebra_ns *zns, struct interface *ifp,
 				   struct interface *br_if);
diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 95df243829..ba0f1b41aa 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -249,6 +249,8 @@ static struct zebra_dplane_globals {
 
 /* Prototypes */
 static int dplane_thread_loop(struct thread *event);
+static void dplane_info_from_zns(struct zebra_dplane_info *ns_info,
+				 struct zebra_ns *zns);
 
 /*
  * Public APIs
@@ -706,16 +708,17 @@ static int dplane_ctx_route_init(struct zebra_dplane_ctx *ctx,
 	zvrf = vrf_info_lookup(re->vrf_id);
 	zns = zvrf->zns;
 
-	zebra_dplane_info_from_zns(&(ctx->zd_ns_info), zns, true /*is_cmd*/);
+	/* Internal copy helper */
+	dplane_info_from_zns(&(ctx->zd_ns_info), zns);
 
 #if defined(HAVE_NETLINK)
 	/* Increment message counter after copying to context struct - may need
 	 * two messages in some 'update' cases.
 	 */
 	if (op == DPLANE_OP_ROUTE_UPDATE)
-		zns->netlink_cmd.seq += 2;
+		zns->netlink_dplane.seq += 2;
 	else
-		zns->netlink_cmd.seq++;
+		zns->netlink_dplane.seq++;
 #endif /* NETLINK*/
 
 	/* Copy nexthops; recursive info is included too */
@@ -1163,11 +1166,29 @@ void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov,
 				  memory_order_relaxed);
 }
 
+/*
+ * Accessor for provider object
+ */
 bool dplane_provider_is_threaded(const struct zebra_dplane_provider *prov)
 {
 	return (prov->dp_flags & DPLANE_PROV_FLAG_THREADED);
 }
 
+/*
+ * Internal helper that copies information from a zebra ns object; this is
+ * called in the zebra main pthread context as part of dplane ctx init.
+ */
+static void dplane_info_from_zns(struct zebra_dplane_info *ns_info,
+				 struct zebra_ns *zns)
+{
+	ns_info->ns_id = zns->ns_id;
+
+#if defined(HAVE_NETLINK)
+	ns_info->is_cmd = true;
+	ns_info->nls = zns->netlink_dplane;
+#endif /* NETLINK */
+}
+
 /*
  * Provider api to signal that work/events are available
  * for the dataplane pthread.
diff --git a/zebra/zebra_ns.c b/zebra/zebra_ns.c
index e65f23dc8a..965c8c206c 100644
--- a/zebra/zebra_ns.c
+++ b/zebra/zebra_ns.c
@@ -47,6 +47,7 @@ DEFINE_MTYPE(ZEBRA, ZEBRA_NS, "Zebra Name Space")
 static struct zebra_ns *dzns;
 
 static int logicalrouter_config_write(struct vty *vty);
+static int zebra_ns_disable_internal(struct zebra_ns *zns, bool complete);
 
 struct zebra_ns *zebra_ns_lookup(ns_id_t ns_id)
 {
@@ -111,7 +112,7 @@ int zebra_ns_disabled(struct ns *ns)
 		zlog_info("ZNS %s with id %u (disabled)", ns->name, ns->ns_id);
 	if (!zns)
 		return 0;
-	return zebra_ns_disable(ns->ns_id, (void **)&zns);
+	return zebra_ns_disable_internal(zns, true);
 }
 
 /* Do global enable actions - open sockets, read kernel config etc. */
@@ -135,17 +136,18 @@ int zebra_ns_enable(ns_id_t ns_id, void **info)
 	return 0;
 }
 
-int zebra_ns_disable(ns_id_t ns_id, void **info)
+/* Common handler for ns disable - this can be called during ns config,
+ * or during zebra shutdown.
+ */
+static int zebra_ns_disable_internal(struct zebra_ns *zns, bool complete)
 {
-	struct zebra_ns *zns = (struct zebra_ns *)(*info);
-
 	route_table_finish(zns->if_table);
 	zebra_vxlan_ns_disable(zns);
 #if defined(HAVE_RTADV)
 	rtadv_terminate(zns);
 #endif
 
-	kernel_terminate(zns);
+	kernel_terminate(zns, complete);
 
 	table_manager_disable(zns->ns_id);
 
@@ -154,6 +156,33 @@ int zebra_ns_disable(ns_id_t ns_id, void **info)
 	return 0;
 }
 
+/* During zebra shutdown, do partial cleanup while the async dataplane
+ * is still running.
+ */
+int zebra_ns_early_shutdown(struct ns *ns)
+{
+	struct zebra_ns *zns = ns->info;
+
+	if (zns == NULL)
+		return 0;
+
+	return zebra_ns_disable_internal(zns, false);
+}
+
+/* During zebra shutdown, do final cleanup
+ * after all dataplane work is complete.
+ */
+int zebra_ns_final_shutdown(struct ns *ns)
+{
+	struct zebra_ns *zns = ns->info;
+
+	if (zns == NULL)
+		return 0;
+
+	kernel_terminate(zns, true);
+
+	return 0;
+}
 
 int zebra_ns_init(void)
 {
diff --git a/zebra/zebra_ns.h b/zebra/zebra_ns.h
index c1a9b41b8d..d3592f8f30 100644
--- a/zebra/zebra_ns.h
+++ b/zebra/zebra_ns.h
@@ -46,8 +46,9 @@ struct zebra_ns {
 	ns_id_t ns_id;
 
 #ifdef HAVE_NETLINK
-	struct nlsock netlink;     /* kernel messages */
-	struct nlsock netlink_cmd; /* command channel */
+	struct nlsock netlink;        /* kernel messages */
+	struct nlsock netlink_cmd;    /* command channel */
+	struct nlsock netlink_dplane; /* dataplane channel */
 	struct thread *t_netlink;
 #endif
 
@@ -62,7 +63,8 @@ struct zebra_ns *zebra_ns_lookup(ns_id_t ns_id);
 int zebra_ns_init(void);
 int zebra_ns_enable(ns_id_t ns_id, void **info);
 int zebra_ns_disabled(struct ns *ns);
-int zebra_ns_disable(ns_id_t ns_id, void **info);
+int zebra_ns_early_shutdown(struct ns *ns);
+int zebra_ns_final_shutdown(struct ns *ns);
 
 int zebra_ns_config_write(struct vty *vty, struct ns *ns);
 

From 80776aec8147d643ca8aaa99f7320c7f22a70677 Mon Sep 17 00:00:00 2001
From: Mark Stapp <mjs@voltanet.io>
Date: Wed, 14 Nov 2018 14:45:30 -0500
Subject: [PATCH 10/10] zebra: add dataplane routing socket

To avoid conflicts between the zebra main pthread and the
dataplane pthread, use a separate routing socket (on non-netlink
platforms) for dataplane route updates to the OS.

Signed-off-by: Mark Stapp <mjs@voltanet.io>
---
 zebra/kernel_socket.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/zebra/kernel_socket.c b/zebra/kernel_socket.c
index ff739ae79b..dcc22d2162 100644
--- a/zebra/kernel_socket.c
+++ b/zebra/kernel_socket.c
@@ -278,6 +278,11 @@ static const struct message rtm_flag_str[] = {{RTF_UP, "UP"},
 /* Kernel routing update socket. */
 int routing_sock = -1;
 
+/* Kernel dataplane routing update socket, used in the dataplane pthread
+ * context.
+ */
+int dplane_routing_sock = -1;
+
 /* Yes I'm checking ugly routing socket behavior. */
 /* #define DEBUG */
 
@@ -1136,7 +1141,7 @@ int rtm_write(int message, union sockunion *dest, union sockunion *mask,
 		char buf[512];
 	} msg;
 
-	if (routing_sock < 0)
+	if (dplane_routing_sock < 0)
 		return ZEBRA_ERR_EPERM;
 
 	/* Clear and set rt_msghdr values */
@@ -1243,7 +1248,7 @@ int rtm_write(int message, union sockunion *dest, union sockunion *mask,
 
 	msg.rtm.rtm_msglen = pnt - (caddr_t)&msg;
 
-	ret = write(routing_sock, &msg, msg.rtm.rtm_msglen);
+	ret = write(dplane_routing_sock, &msg, msg.rtm.rtm_msglen);
 
 	if (ret != msg.rtm.rtm_msglen) {
 		if (errno == EEXIST)
@@ -1390,6 +1395,9 @@ static void routing_socket(struct zebra_ns *zns)
 {
 	frr_elevate_privs(&zserv_privs) {
 		routing_sock = ns_socket(AF_ROUTE, SOCK_RAW, 0, zns->ns_id);
+
+		dplane_routing_sock =
+			ns_socket(AF_ROUTE, SOCK_RAW, 0, zns->ns_id);
 	}
 
 	if (routing_sock < 0) {
@@ -1397,6 +1405,12 @@ static void routing_socket(struct zebra_ns *zns)
 		return;
 	}
 
+	if (dplane_routing_sock < 0) {
+		flog_err_sys(EC_LIB_SOCKET,
+			     "Can't init kernel dataplane routing socket");
+		return;
+	}
+
 	/* XXX: Socket should be NONBLOCK, however as we currently
 	 * discard failed writes, this will lead to inconsistencies.
 	 * For now, socket must be blocking.