Merge pull request #3267 from mjstapp/dplane_3

zebra async dataplane: phase 2
2025-08-04 18:25:00 +00:00 · 2018-11-27 13:56:30 -05:00 · 2018-11-27 13:56:30 -05:00 · 4fedcc6947
commit 4fedcc6947
parent 98d8359fe7 80776aec81
9 changed files with 992 additions and 180 deletions
--- a/zebra/kernel_netlink.c
+++ b/zebra/kernel_netlink.c
@ -396,7 +396,7 @@ static int kernel_read(struct thread *thread)

 /*
 * Filter out messages from self that occur on listener socket,
- * caused by our actions on the command socket
+ * caused by our actions on the command socket(s)
 *
 * When we add new Netlink message types we probably
 * do not need to add them here as that we are filtering
@ -407,7 +407,7 @@ static int kernel_read(struct thread *thread)
 * so that we only had to write one way to handle incoming
 * address add/delete changes.
 */
-static void netlink_install_filter(int sock, __u32 pid)
+static void netlink_install_filter(int sock, __u32 pid, __u32 dplane_pid)
 {
 	/*
 	 * BPF_JUMP instructions and where you jump to are based upon
@ -418,7 +418,8 @@ static void netlink_install_filter(int sock, __u32 pid)
 	struct sock_filter filter[] = {
 		/*
 		 * Logic:
-		 *   if (nlmsg_pid == pid) {
+		 *   if (nlmsg_pid == pid ||
+		 *       nlmsg_pid == dplane_pid) {
 		 *       if (the incoming nlmsg_type ==
 		 *           RTM_NEWADDR | RTM_DELADDR)
 		 *           keep this message
@ -435,26 +436,30 @@ static void netlink_install_filter(int sock, __u32 pid)
 		/*
 		 * 1: Compare to pid
 		 */
-		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 0, 4),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
 		/*
-		 * 2: Load the nlmsg_type into BPF register
+		 * 2: Compare to dplane pid
+		 */
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
+		/*
+		 * 3: Load the nlmsg_type into BPF register
 		 */
 		BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
 			 offsetof(struct nlmsghdr, nlmsg_type)),
 		/*
-		 * 3: Compare to RTM_NEWADDR
+		 * 4: Compare to RTM_NEWADDR
 		 */
 		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
 		/*
-		 * 4: Compare to RTM_DELADDR
+		 * 5: Compare to RTM_DELADDR
 		 */
 		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
 		/*
-		 * 5: This is the end state of we want to skip the
+		 * 6: This is the end state if we want to skip the
 		 *    message
 		 */
 		BPF_STMT(BPF_RET | BPF_K, 0),
-		/* 6: This is the end state of we want to keep
+		/* 7: This is the end state if we want to keep
 		 *     the message
 		 */
 		BPF_STMT(BPF_RET | BPF_K, 0xffff),
@ -1102,6 +1107,15 @@ void kernel_init(struct zebra_ns *zns)
 		exit(-1);
 	}

+	snprintf(zns->netlink_dplane.name, sizeof(zns->netlink_dplane.name),
+		 "netlink-dp (NS %u)", zns->ns_id);
+	zns->netlink_dplane.sock = -1;
+	if (netlink_socket(&zns->netlink_dplane, 0, zns->ns_id) < 0) {
+		zlog_err("Failure to create %s socket",
+			 zns->netlink_dplane.name);
+		exit(-1);
+	}
+
 	/*
 	 * SOL_NETLINK is not available on all platforms yet
 	 * apparently.  It's in bits/socket.h which I am not
@ -1110,14 +1124,22 @@ void kernel_init(struct zebra_ns *zns)
 #if defined SOL_NETLINK
 	/*
 	 * Let's tell the kernel that we want to receive extended
-	 * ACKS over our command socket
+	 * ACKS over our command socket(s)
 	 */
 	one = 1;
 	ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
 			 &one, sizeof(one));

 	if (ret < 0)
-		zlog_notice("Registration for extended ACK failed : %d %s",
+		zlog_notice("Registration for extended cmd ACK failed : %d %s",
+			    errno, safe_strerror(errno));
+
+	one = 1;
+	ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_EXT_ACK,
+			 &one, sizeof(one));
+
+	if (ret < 0)
+		zlog_notice("Registration for extended dp ACK failed : %d %s",
 			    errno, safe_strerror(errno));
 #endif

@ -1130,12 +1152,18 @@ void kernel_init(struct zebra_ns *zns)
 		zlog_err("Can't set %s socket error: %s(%d)",
 			 zns->netlink_cmd.name, safe_strerror(errno), errno);

+	if (fcntl(zns->netlink_dplane.sock, F_SETFL, O_NONBLOCK) < 0)
+		zlog_err("Can't set %s socket error: %s(%d)",
+			 zns->netlink_dplane.name, safe_strerror(errno), errno);
+
 	/* Set receive buffer size if it's set from command line */
 	if (nl_rcvbufsize)
 		netlink_recvbuf(&zns->netlink, nl_rcvbufsize);

 	netlink_install_filter(zns->netlink.sock,
-			       zns->netlink_cmd.snl.nl_pid);
+			       zns->netlink_cmd.snl.nl_pid,
+			       zns->netlink_dplane.snl.nl_pid);
+
 	zns->t_netlink = NULL;

 	thread_add_read(zebrad.master, kernel_read, zns,
@ -1144,7 +1172,7 @@ void kernel_init(struct zebra_ns *zns)
 	rt_netlink_init();
 }

-void kernel_terminate(struct zebra_ns *zns)
+void kernel_terminate(struct zebra_ns *zns, bool complete)
 {
 	THREAD_READ_OFF(zns->t_netlink);

@ -1157,6 +1185,15 @@ void kernel_terminate(struct zebra_ns *zns)
 		close(zns->netlink_cmd.sock);
 		zns->netlink_cmd.sock = -1;
 	}
-}

+	/* During zebra shutdown, we need to leave the dataplane socket
+	 * around until all work is done.
+	 */
+	if (complete) {
+		if (zns->netlink_dplane.sock >= 0) {
+			close(zns->netlink_dplane.sock);
+			zns->netlink_dplane.sock = -1;
+		}
+	}
+}
 #endif /* HAVE_NETLINK */
--- a/zebra/kernel_socket.c
+++ b/zebra/kernel_socket.c
@ -278,6 +278,11 @@ static const struct message rtm_flag_str[] = {{RTF_UP, "UP"},
 /* Kernel routing update socket. */
 int routing_sock = -1;

+/* Kernel dataplane routing update socket, used in the dataplane pthread
+ * context.
+ */
+int dplane_routing_sock = -1;
+
 /* Yes I'm checking ugly routing socket behavior. */
 /* #define DEBUG */

@ -1136,7 +1141,7 @@ int rtm_write(int message, union sockunion *dest, union sockunion *mask,
 		char buf[512];
 	} msg;

-	if (routing_sock < 0)
+	if (dplane_routing_sock < 0)
 		return ZEBRA_ERR_EPERM;

 	/* Clear and set rt_msghdr values */
@ -1243,7 +1248,7 @@ int rtm_write(int message, union sockunion *dest, union sockunion *mask,

 	msg.rtm.rtm_msglen = pnt - (caddr_t)&msg;

-	ret = write(routing_sock, &msg, msg.rtm.rtm_msglen);
+	ret = write(dplane_routing_sock, &msg, msg.rtm.rtm_msglen);

 	if (ret != msg.rtm.rtm_msglen) {
 		if (errno == EEXIST)
@ -1390,6 +1395,9 @@ static void routing_socket(struct zebra_ns *zns)
 {
 	frr_elevate_privs(&zserv_privs) {
 		routing_sock = ns_socket(AF_ROUTE, SOCK_RAW, 0, zns->ns_id);
+
+		dplane_routing_sock =
+			ns_socket(AF_ROUTE, SOCK_RAW, 0, zns->ns_id);
 	}

 	if (routing_sock < 0) {
@ -1397,6 +1405,12 @@ static void routing_socket(struct zebra_ns *zns)
 		return;
 	}

+	if (dplane_routing_sock < 0) {
+		flog_err_sys(EC_LIB_SOCKET,
+			     "Can't init kernel dataplane routing socket");
+		return;
+	}
+
 	/* XXX: Socket should be NONBLOCK, however as we currently
 	 * discard failed writes, this will lead to inconsistencies.
 	 * For now, socket must be blocking.
@ -1415,7 +1429,7 @@ void kernel_init(struct zebra_ns *zns)
 	routing_socket(zns);
 }

-void kernel_terminate(struct zebra_ns *zns)
+void kernel_terminate(struct zebra_ns *zns, bool complete)
 {
 	return;
 }
--- a/zebra/main.c
+++ b/zebra/main.c
@ -172,7 +172,7 @@ static void sigint(void)
 		work_queue_free_and_null(&zebrad.lsp_process_q);
 	vrf_terminate();

-	ns_walk_func(zebra_ns_disabled);
+	ns_walk_func(zebra_ns_early_shutdown);
 	zebra_ns_notify_close();

 	access_list_reset();
@ -196,6 +196,9 @@ int zebra_finalize(struct thread *dummy)
 {
 	zlog_info("Zebra final shutdown");

+	/* Final shutdown of ns resources */
+	ns_walk_func(zebra_ns_final_shutdown);
+
 	/* Stop dplane thread and finish any cleanup */
 	zebra_dplane_shutdown();

@ -390,6 +393,9 @@ int main(int argc, char **argv)
 	vty_config_lockless();
 	zebrad.master = frr_init();

+	/* Initialize pthread library */
+	frr_pthread_init();
+
 	/* Zebra related initialize. */
 	zebra_router_init();
 	zserv_init();
@ -445,8 +451,8 @@ int main(int argc, char **argv)
 	/* Needed for BSD routing socket. */
 	pid = getpid();

-	/* Intialize pthread library */
-	frr_pthread_init();
+	/* Start dataplane system */
+	zebra_dplane_start();

 	/* Start Zebra API server */
 	zserv_start(zserv_path);
--- a/zebra/rt.h
+++ b/zebra/rt.h
@ -86,7 +86,7 @@ extern int kernel_del_neigh(struct interface *ifp, struct ipaddr *ip);
 */
 extern void interface_list(struct zebra_ns *zns);
 extern void kernel_init(struct zebra_ns *zns);
-extern void kernel_terminate(struct zebra_ns *zns);
+extern void kernel_terminate(struct zebra_ns *zns, bool complete);
 extern void macfdb_read(struct zebra_ns *zns);
 extern void macfdb_read_for_bridge(struct zebra_ns *zns, struct interface *ifp,
 				   struct interface *br_if);
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
--- a/zebra/zebra_dplane.h
+++ b/zebra/zebra_dplane.h
@ -29,7 +29,6 @@
 #include "zebra/rib.h"
 #include "zebra/zserv.h"

-
 /* Key netlink info from zebra ns */
 struct zebra_dplane_info {
 	ns_id_t ns_id;
@ -121,20 +120,28 @@ TAILQ_HEAD(dplane_ctx_q, zebra_dplane_ctx);
 */
 void dplane_ctx_fini(struct zebra_dplane_ctx **pctx);

-/* Enqueue a context block to caller's tailq. This just exists so that the
+/* Enqueue a context block to caller's tailq. This exists so that the
 * context struct can remain opaque.
 */
 void dplane_ctx_enqueue_tail(struct dplane_ctx_q *q,
 			     const struct zebra_dplane_ctx *ctx);

+/* Append a list of context blocks to another list - again, just keeping
+ * the context struct opaque.
+ */
+void dplane_ctx_list_append(struct dplane_ctx_q *to_list,
+			    struct dplane_ctx_q *from_list);
+
 /* Dequeue a context block from the head of caller's tailq */
-void dplane_ctx_dequeue(struct dplane_ctx_q *q, struct zebra_dplane_ctx **ctxp);
+struct zebra_dplane_ctx *dplane_ctx_dequeue(struct dplane_ctx_q *q);

 /*
 * Accessors for information from the context object
 */
 enum zebra_dplane_result dplane_ctx_get_status(
 	const struct zebra_dplane_ctx *ctx);
+void dplane_ctx_set_status(struct zebra_dplane_ctx *ctx,
+			   enum zebra_dplane_result status);
 const char *dplane_res2str(enum zebra_dplane_result res);

 enum dplane_op_e dplane_ctx_get_op(const struct zebra_dplane_ctx *ctx);
@ -142,6 +149,15 @@ const char *dplane_op2str(enum dplane_op_e op);

 const struct prefix *dplane_ctx_get_dest(const struct zebra_dplane_ctx *ctx);

+/* Retrieve last/current provider id */
+uint32_t dplane_ctx_get_provider(const struct zebra_dplane_ctx *ctx);
+
+/* Providers running before the kernel can control whether a kernel
+ * update should be done.
+ */
+void dplane_ctx_set_skip_kernel(struct zebra_dplane_ctx *ctx);
+bool dplane_ctx_is_skip_kernel(const struct zebra_dplane_ctx *ctx);
+
 /* Source prefix is a little special - use convention to return NULL
 * to mean "no src prefix"
 */
@ -212,9 +228,11 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed);


 /*
- * Dataplane providers: modules that consume dataplane events.
+ * Dataplane providers: modules that process or consume dataplane events.
 */

+struct zebra_dplane_provider;
+
 /* Support string name for a dataplane provider */
 #define DPLANE_PROVIDER_NAMELEN 64

@ -223,7 +241,7 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed);
 * followed by the kernel, followed by some post-processing step (such as
 * the fpm output stream.)
 */
-enum dplane_provider_prio_e {
+enum dplane_provider_prio {
 	DPLANE_PRIO_NONE = 0,
 	DPLANE_PRIO_PREPROCESS,
 	DPLANE_PRIO_PRE_KERNEL,
@ -232,28 +250,81 @@ enum dplane_provider_prio_e {
 	DPLANE_PRIO_LAST
 };

-/* Provider's entry-point to process a context block */
-typedef int (*dplane_provider_process_fp)(struct zebra_dplane_ctx *ctx);
-
-/* Provider's entry-point for shutdown and cleanup */
-typedef int (*dplane_provider_fini_fp)(void);
-
-/* Provider registration */
-int dplane_provider_register(const char *name,
-			     enum dplane_provider_prio_e prio,
-			     dplane_provider_process_fp fp,
-			     dplane_provider_fini_fp fini_fp);
-
-/*
- * Results are returned to zebra core via a callback
+/* Provider's entry-point for incoming work, called in the context of the
+ * dataplane pthread. The dataplane pthread enqueues any new work to the
+ * provider's 'inbound' queue, then calls the callback. The dataplane
+ * then checks the provider's outbound queue.
 */
-typedef int (*dplane_results_fp)(const struct zebra_dplane_ctx *ctx);
+typedef int (*dplane_provider_process_fp)(struct zebra_dplane_provider *prov);
+
+/* Provider's entry-point for shutdown and cleanup. Called with 'early'
+ * during shutdown, to indicate that the dataplane subsystem is allowing
+ * work to move through the providers and finish. When called without 'early',
+ * the provider should release all resources (if it has any allocated).
+ */
+typedef int (*dplane_provider_fini_fp)(struct zebra_dplane_provider *prov,
+				       bool early);
+
+/* Flags values used during provider registration. */
+#define DPLANE_PROV_FLAGS_DEFAULT  0x0
+
+/* Provider will be spawning its own worker thread */
+#define DPLANE_PROV_FLAG_THREADED  0x1
+
+
+/* Provider registration: ordering or priority value, callbacks, and optional
+ * opaque data value.
+ */
+int dplane_provider_register(const char *name,
+			     enum dplane_provider_prio prio,
+			     int flags,
+			     dplane_provider_process_fp fp,
+			     dplane_provider_fini_fp fini_fp,
+			     void *data);
+
+/* Accessors for provider attributes */
+const char *dplane_provider_get_name(const struct zebra_dplane_provider *prov);
+uint32_t dplane_provider_get_id(const struct zebra_dplane_provider *prov);
+void *dplane_provider_get_data(const struct zebra_dplane_provider *prov);
+bool dplane_provider_is_threaded(const struct zebra_dplane_provider *prov);
+
+/* Lock/unlock a provider's mutex - iff the provider was registered with
+ * the THREADED flag.
+ */
+void dplane_provider_lock(struct zebra_dplane_provider *prov);
+void dplane_provider_unlock(struct zebra_dplane_provider *prov);
+
+/* Obtain thread_master for dataplane thread */
+struct thread_master *dplane_get_thread_master(void);
+
+/* Providers should (generally) limit number of updates per work cycle */
+int dplane_provider_get_work_limit(const struct zebra_dplane_provider *prov);
+
+/* Provider api to signal that work/events are available
+ * for the dataplane pthread.
+ */
+int dplane_provider_work_ready(void);
+
+/* Dequeue, maintain associated counter and locking */
+struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
+	struct zebra_dplane_provider *prov);
+
+/* Dequeue work to a list, maintain counter and locking, return count */
+int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
+				    struct dplane_ctx_q *listp);
+
+/* Enqueue, maintain associated counter and locking */
+void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov,
+				     struct zebra_dplane_ctx *ctx);

 /*
 * Zebra registers a results callback with the dataplane. The callback is
- * called in the dataplane thread context, so the expectation is that the
- * context is queued (or that processing is very limited).
+ * called in the dataplane pthread context, so the expectation is that the
+ * context is queued for the zebra main pthread or that processing
+ * is very limited.
 */
+typedef int (*dplane_results_fp)(struct zebra_dplane_ctx *ctx);
+
 int dplane_results_register(dplane_results_fp fp);

 /*
@ -262,9 +333,16 @@ int dplane_results_register(dplane_results_fp fp);
 */
 void zebra_dplane_init(void);

+/*
+ * Start the dataplane pthread. This step needs to be run later than the
+ * 'init' step, in case zebra has fork-ed.
+ */
+void zebra_dplane_start(void);
+
 /* Finalize/cleanup apis, one called early as shutdown is starting,
 * one called late at the end of zebra shutdown, and then one called
- * from the zebra main thread to stop the dplane thread free all resources.
+ * from the zebra main pthread to stop the dplane pthread and
+ * free all resources.
 *
 * Zebra expects to try to clean up all vrfs and all routes during
 * shutdown, so the dplane must be available until very late.
--- a/zebra/zebra_ns.c
+++ b/zebra/zebra_ns.c
@ -47,6 +47,7 @@ DEFINE_MTYPE(ZEBRA, ZEBRA_NS, "Zebra Name Space")
 static struct zebra_ns *dzns;

 static int logicalrouter_config_write(struct vty *vty);
+static int zebra_ns_disable_internal(struct zebra_ns *zns, bool complete);

 struct zebra_ns *zebra_ns_lookup(ns_id_t ns_id)
 {
@ -111,7 +112,7 @@ int zebra_ns_disabled(struct ns *ns)
 		zlog_info("ZNS %s with id %u (disabled)", ns->name, ns->ns_id);
 	if (!zns)
 		return 0;
-	return zebra_ns_disable(ns->ns_id, (void **)&zns);
+	return zebra_ns_disable_internal(zns, true);
 }

 /* Do global enable actions - open sockets, read kernel config etc. */
@ -135,17 +136,18 @@ int zebra_ns_enable(ns_id_t ns_id, void **info)
 	return 0;
 }

-int zebra_ns_disable(ns_id_t ns_id, void **info)
+/* Common handler for ns disable - this can be called during ns config,
+ * or during zebra shutdown.
+ */
+static int zebra_ns_disable_internal(struct zebra_ns *zns, bool complete)
 {
-	struct zebra_ns *zns = (struct zebra_ns *)(*info);
-
 	route_table_finish(zns->if_table);
 	zebra_vxlan_ns_disable(zns);
 #if defined(HAVE_RTADV)
 	rtadv_terminate(zns);
 #endif

-	kernel_terminate(zns);
+	kernel_terminate(zns, complete);

 	table_manager_disable(zns->ns_id);

@ -154,6 +156,33 @@ int zebra_ns_disable(ns_id_t ns_id, void **info)
 	return 0;
 }

+/* During zebra shutdown, do partial cleanup while the async dataplane
+ * is still running.
+ */
+int zebra_ns_early_shutdown(struct ns *ns)
+{
+	struct zebra_ns *zns = ns->info;
+
+	if (zns == NULL)
+		return 0;
+
+	return zebra_ns_disable_internal(zns, false);
+}
+
+/* During zebra shutdown, do final cleanup
+ * after all dataplane work is complete.
+ */
+int zebra_ns_final_shutdown(struct ns *ns)
+{
+	struct zebra_ns *zns = ns->info;
+
+	if (zns == NULL)
+		return 0;
+
+	kernel_terminate(zns, true);
+
+	return 0;
+}

 int zebra_ns_init(void)
 {
--- a/zebra/zebra_ns.h
+++ b/zebra/zebra_ns.h
@ -46,8 +46,9 @@ struct zebra_ns {
 	ns_id_t ns_id;

 #ifdef HAVE_NETLINK
-	struct nlsock netlink;     /* kernel messages */
-	struct nlsock netlink_cmd; /* command channel */
+	struct nlsock netlink;        /* kernel messages */
+	struct nlsock netlink_cmd;    /* command channel */
+	struct nlsock netlink_dplane; /* dataplane channel */
 	struct thread *t_netlink;
 #endif

@ -62,7 +63,8 @@ struct zebra_ns *zebra_ns_lookup(ns_id_t ns_id);
 int zebra_ns_init(void);
 int zebra_ns_enable(ns_id_t ns_id, void **info);
 int zebra_ns_disabled(struct ns *ns);
-int zebra_ns_disable(ns_id_t ns_id, void **info);
+int zebra_ns_early_shutdown(struct ns *ns);
+int zebra_ns_final_shutdown(struct ns *ns);

 int zebra_ns_config_write(struct vty *vty, struct ns *ns);

--- a/zebra/zebra_rib.c
+++ b/zebra/zebra_rib.c
@ -1932,11 +1932,10 @@ static void rib_process_after(struct zebra_dplane_ctx *ctx)
 	op = dplane_ctx_get_op(ctx);
 	status = dplane_ctx_get_status(ctx);

-	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) {
+	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
 		zlog_debug("%u:%s Processing dplane ctx %p, op %s result %s",
 			   dplane_ctx_get_vrf(ctx), dest_str, ctx,
 			   dplane_op2str(op), dplane_res2str(status));
-	}

 	if (op == DPLANE_OP_ROUTE_DELETE) {
 		/*
@ -3267,7 +3266,7 @@ static int rib_process_dplane_results(struct thread *thread)
 		pthread_mutex_lock(&dplane_mutex);
 		{
 			/* Dequeue context block */
-			dplane_ctx_dequeue(&rib_dplane_q, &ctx);
+			ctx = dplane_ctx_dequeue(&rib_dplane_q);
 		}
 		pthread_mutex_unlock(&dplane_mutex);

@ -3289,7 +3288,7 @@ static int rib_process_dplane_results(struct thread *thread)
 * the dataplane pthread. We enqueue the results here for processing by
 * the main thread later.
 */
-static int rib_dplane_results(const struct zebra_dplane_ctx *ctx)
+static int rib_dplane_results(struct zebra_dplane_ctx *ctx)
 {
 	/* Take lock controlling queue of results */
 	pthread_mutex_lock(&dplane_mutex);