diff --git a/pimd/pim_cmd.c b/pimd/pim_cmd.c index 6508fb4453..1fa674a6f7 100644 --- a/pimd/pim_cmd.c +++ b/pimd/pim_cmd.c @@ -5294,7 +5294,7 @@ static void pim_cmd_show_ip_multicast_helper(struct pim_instance *pim, pim = vrf->info; vty_out(vty, "Router MLAG Role: %s\n", - mlag_role2str(router->role, mlag_role, sizeof(mlag_role))); + mlag_role2str(router->mlag_role, mlag_role, sizeof(mlag_role))); vty_out(vty, "Mroute socket descriptor:"); vty_out(vty, " %d(%s)\n", pim->mroute_socket, vrf->name); @@ -10259,7 +10259,7 @@ DEFUN_HIDDEN (no_ip_pim_mlag, addr.s_addr = 0; pim_vxlan_mlag_update(true/*mlag_enable*/, - false/*peer_state*/, PIM_VXLAN_MLAG_ROLE_SECONDARY, + false/*peer_state*/, MLAG_ROLE_NONE, NULL/*peerlink*/, &addr); return CMD_SUCCESS; @@ -10299,9 +10299,9 @@ DEFUN_HIDDEN (ip_pim_mlag, idx += 2; if (!strcmp(argv[idx]->arg, "primary")) { - role = PIM_VXLAN_MLAG_ROLE_PRIMARY; + role = MLAG_ROLE_PRIMARY; } else if (!strcmp(argv[idx]->arg, "secondary")) { - role = PIM_VXLAN_MLAG_ROLE_SECONDARY; + role = MLAG_ROLE_SECONDARY; } else { vty_out(vty, "unknown MLAG role %s\n", argv[idx]->arg); return CMD_WARNING; diff --git a/pimd/pim_instance.h b/pimd/pim_instance.h index da0c75decb..7b1fd2e172 100644 --- a/pimd/pim_instance.h +++ b/pimd/pim_instance.h @@ -48,6 +48,46 @@ enum pim_spt_switchover { PIM_SPT_INFINITY, }; +/* stats for updates rxed from the MLAG component during the life of a + * session + */ +struct pim_mlag_msg_stats { + uint32_t mroute_add_rx; + uint32_t mroute_add_tx; + uint32_t mroute_del_rx; + uint32_t mroute_del_tx; + uint32_t mlag_status_updates; + uint32_t pim_status_updates; + uint32_t vxlan_updates; + uint32_t peer_zebra_status_updates; +}; + +struct pim_mlag_stats { + /* message stats are reset when the connection to mlagd flaps */ + struct pim_mlag_msg_stats msg; + uint32_t mlagd_session_downs; + uint32_t peer_session_downs; + uint32_t peer_zebra_downs; +}; + +enum pim_mlag_flags { + PIM_MLAGF_NONE = 0, + /* connection to the local MLAG daemon is up */ + PIM_MLAGF_LOCAL_CONN_UP = (1 << 0), + /* connection to the MLAG daemon on the peer switch is up. note + * that there is no direct connection between FRR and the peer MLAG + * daemon. this is just a peer-session status provided by the local + * MLAG daemon. + */ + PIM_MLAGF_PEER_CONN_UP = (1 << 1), + /* status update rxed from the local daemon */ + PIM_MLAGF_STATUS_RXED = (1 << 2), + /* initial dump of data done post peerlink flap */ + PIM_MLAGF_PEER_REPLAY_DONE = (1 << 3), + /* zebra is up on the peer */ + PIM_MLAGF_PEER_ZEBRA_UP = (1 << 4) +}; + struct pim_router { struct thread_master *master; @@ -65,7 +105,7 @@ struct pim_router { */ vrf_id_t vrf_id; - enum mlag_role role; + enum mlag_role mlag_role; uint32_t pim_mlag_intf_cnt; /* if true we have registered with MLAG */ bool mlag_process_register; @@ -77,6 +117,12 @@ struct pim_router { struct stream_fifo *mlag_fifo; struct stream *mlag_stream; struct thread *zpthread_mlag_write; + struct in_addr anycast_vtep_ip; + struct in_addr local_vtep_ip; + struct pim_mlag_stats mlag_stats; + enum pim_mlag_flags mlag_flags; + char peerlink_rif[INTERFACE_NAMSIZ]; + struct interface *peerlink_rif_p; }; /* Per VRF PIM DB */ diff --git a/pimd/pim_mlag.c b/pimd/pim_mlag.c index f60c18204b..1c2f7c563d 100644 --- a/pimd/pim_mlag.c +++ b/pimd/pim_mlag.c @@ -25,14 +25,462 @@ #include "pimd.h" #include "pim_mlag.h" +#include "pim_upstream.h" +#include "pim_vxlan.h" extern struct zclient *zclient; +#define PIM_MLAG_METADATA_LEN 4 + +/******************************* pim upstream sync **************************/ +/* Update DF role for the upstream entry and return true on role change */ +bool pim_mlag_up_df_role_update(struct pim_instance *pim, + struct pim_upstream *up, bool is_df, const char *reason) +{ + struct channel_oil *c_oil = up->channel_oil; + bool old_is_df = !PIM_UPSTREAM_FLAG_TEST_MLAG_NON_DF(up->flags); + struct pim_interface *vxlan_ifp; + + if (is_df == old_is_df) { + if (PIM_DEBUG_MLAG) + zlog_debug( + "%s: Ignoring Role update for %s, since no change", + __func__, up->sg_str); + return false; + } + + if (PIM_DEBUG_MLAG) + zlog_debug("local MLAG mroute %s role changed to %s based on %s", + up->sg_str, is_df ? "df" : "non-df", reason); + + if (is_df) + PIM_UPSTREAM_FLAG_UNSET_MLAG_NON_DF(up->flags); + else + PIM_UPSTREAM_FLAG_SET_MLAG_NON_DF(up->flags); + + + /* If the DF role has changed check if ipmr-lo needs to be + * muted/un-muted. Active-Active devices and vxlan termination + * devices (ipmr-lo) are suppressed on the non-DF. + * This may leave the mroute with the empty OIL in which case the + * the forwarding entry's sole purpose is to just blackhole the flow + * headed to the switch. + */ + if (c_oil) { + vxlan_ifp = pim_vxlan_get_term_ifp(pim); + if (vxlan_ifp) + pim_channel_update_oif_mute(c_oil, vxlan_ifp); + } + + /* If DF role changed on a (*,G) termination mroute update the + * associated DF role on the inherited (S,G) entries + */ + if ((up->sg.src.s_addr == INADDR_ANY) && + PIM_UPSTREAM_FLAG_TEST_MLAG_VXLAN(up->flags)) + pim_vxlan_inherit_mlag_flags(pim, up, true /* inherit */); + + return true; +} + +/* Run per-upstream entry DF election and return true on role change */ +static bool pim_mlag_up_df_role_elect(struct pim_instance *pim, + struct pim_upstream *up) +{ + bool is_df; + uint32_t peer_cost; + uint32_t local_cost; + bool rv; + + if (!pim_up_mlag_is_local(up)) + return false; + + /* We are yet to rx a status update from the local MLAG daemon so + * we will assume DF status. + */ + if (!(router->mlag_flags & PIM_MLAGF_STATUS_RXED)) + return pim_mlag_up_df_role_update(pim, up, + true /*is_df*/, "mlagd-down"); + + /* If not connected to peer assume DF role on the MLAG primary + * switch (and non-DF on the secondary switch. + */ + if (!(router->mlag_flags & PIM_MLAGF_PEER_CONN_UP)) { + is_df = (router->mlag_role == MLAG_ROLE_PRIMARY) ? true : false; + return pim_mlag_up_df_role_update(pim, up, + is_df, "peer-down"); + } + + /* If MLAG peer session is up but zebra is down on the peer + * assume DF role. + */ + if (!(router->mlag_flags & PIM_MLAGF_PEER_ZEBRA_UP)) + return pim_mlag_up_df_role_update(pim, up, + true /*is_df*/, "zebra-down"); + + /* If we are connected to peer switch but don't have a mroute + * from it we have to assume non-DF role to avoid duplicates. + * Note: When the peer connection comes up we wait for initial + * replay to complete before moving "strays" i.e. local-mlag-mroutes + * without a peer reference to non-df role. + */ + if (!PIM_UPSTREAM_FLAG_TEST_MLAG_PEER(up->flags)) + return pim_mlag_up_df_role_update(pim, up, + false /*is_df*/, "no-peer-mroute"); + + /* switch with the lowest RPF cost wins. if both switches have the same + * cost MLAG role is used as a tie breaker (MLAG primary wins). + */ + peer_cost = up->mlag.peer_mrib_metric; + local_cost = pim_up_mlag_local_cost(up); + if (local_cost == peer_cost) { + is_df = (router->mlag_role == MLAG_ROLE_PRIMARY) ? true : false; + rv = pim_mlag_up_df_role_update(pim, up, is_df, "equal-cost"); + } else { + is_df = (local_cost < peer_cost) ? true : false; + rv = pim_mlag_up_df_role_update(pim, up, is_df, "cost"); + } + + return rv; +} + +/* Handle upstream entry add from the peer MLAG switch - + * - if a local entry doesn't exist one is created with reference + * _MLAG_PEER + * - if a local entry exists and has a MLAG OIF DF election is run. + * the non-DF switch stop forwarding traffic to MLAG devices. + */ +static void pim_mlag_up_peer_add(struct mlag_mroute_add *msg) +{ + struct pim_upstream *up; + struct pim_instance *pim; + int flags = 0; + struct prefix_sg sg; + struct vrf *vrf; + char sg_str[PIM_SG_LEN]; + + memset(&sg, 0, sizeof(struct prefix_sg)); + sg.src.s_addr = htonl(msg->source_ip); + sg.grp.s_addr = htonl(msg->group_ip); + if (PIM_DEBUG_MLAG) + pim_str_sg_set(&sg, sg_str); + + if (PIM_DEBUG_MLAG) + zlog_debug("peer MLAG mroute add %s:%s cost %d", + msg->vrf_name, sg_str, msg->cost_to_rp); + + /* XXX - this is not correct. we MUST cache updates to avoid losing + * an entry because of race conditions with the peer switch. + */ + vrf = vrf_lookup_by_name(msg->vrf_name); + if (!vrf) { + if (PIM_DEBUG_MLAG) + zlog_debug("peer MLAG mroute add failed %s:%s; no vrf", + msg->vrf_name, sg_str); + return; + } + pim = vrf->info; + + up = pim_upstream_find(pim, &sg); + if (up) { + /* upstream already exists; create peer reference if it + * doesn't already exist. + */ + if (!PIM_UPSTREAM_FLAG_TEST_MLAG_PEER(up->flags)) + pim_upstream_ref(up, + PIM_UPSTREAM_FLAG_MASK_MLAG_PEER, + __PRETTY_FUNCTION__); + } else { + PIM_UPSTREAM_FLAG_SET_MLAG_PEER(flags); + up = pim_upstream_add(pim, &sg, NULL /*iif*/, flags, + __PRETTY_FUNCTION__, NULL /*if_ch*/); + + if (!up) { + if (PIM_DEBUG_MLAG) + zlog_debug("peer MLAG mroute add failed %s:%s", + vrf->name, sg_str); + return; + } + } + up->mlag.peer_mrib_metric = msg->cost_to_rp; + pim_mlag_up_df_role_elect(pim, up); +} + +/* Handle upstream entry del from the peer MLAG switch - + * - peer reference is removed. this can result in the upstream + * being deleted altogether. + * - if a local entry continues to exisy and has a MLAG OIF DF election + * is re-run (at the end of which the local entry will be the DF). + */ +static void pim_mlag_up_peer_deref(struct pim_instance *pim, + struct pim_upstream *up) +{ + if (!PIM_UPSTREAM_FLAG_TEST_MLAG_PEER(up->flags)) + return; + + PIM_UPSTREAM_FLAG_UNSET_MLAG_PEER(up->flags); + up = pim_upstream_del(pim, up, __PRETTY_FUNCTION__); + if (up) + pim_mlag_up_df_role_elect(pim, up); +} +static void pim_mlag_up_peer_del(struct mlag_mroute_del *msg) +{ + struct pim_upstream *up; + struct pim_instance *pim; + struct prefix_sg sg; + struct vrf *vrf; + char sg_str[PIM_SG_LEN]; + + memset(&sg, 0, sizeof(struct prefix_sg)); + sg.src.s_addr = htonl(msg->source_ip); + sg.grp.s_addr = htonl(msg->group_ip); + if (PIM_DEBUG_MLAG) + pim_str_sg_set(&sg, sg_str); + + if (PIM_DEBUG_MLAG) + zlog_debug("peer MLAG mroute del %s:%s", msg->vrf_name, + sg_str); + + vrf = vrf_lookup_by_name(msg->vrf_name); + if (!vrf) { + if (PIM_DEBUG_MLAG) + zlog_debug("peer MLAG mroute del skipped %s:%s; no vrf", + msg->vrf_name, sg_str); + return; + } + pim = vrf->info; + + up = pim_upstream_find(pim, &sg); + if (!up) { + if (PIM_DEBUG_MLAG) + zlog_debug("peer MLAG mroute del skipped %s:%s; no up", + vrf->name, sg_str); + return; + } + + pim_mlag_up_peer_deref(pim, up); +} + +/* When we lose connection to the local MLAG daemon we can drop all peer + * references. + */ +static void pim_mlag_up_peer_del_all(void) +{ + struct list *temp = list_new(); + struct pim_upstream *up; + struct vrf *vrf; + struct pim_instance *pim; + + /* + * So why these gyrations? + * pim->upstream_head has the list of *,G and S,G + * that are in the system. The problem of course + * is that it is an ordered list: + * (*,G1) -> (S1,G1) -> (S2,G2) -> (S3, G2) -> (*,G2) -> (S1,G2) + * And the *,G1 has pointers to S1,G1 and S2,G1 + * if we delete *,G1 then we have a situation where + * S1,G1 and S2,G2 can be deleted as well. Then a + * simple ALL_LIST_ELEMENTS will have the next listnode + * pointer become invalid and we crash. + * So let's grab the list of MLAG_PEER upstreams + * add a refcount put on another list and delete safely + */ + RB_FOREACH(vrf, vrf_name_head, &vrfs_by_name) { + pim = vrf->info; + frr_each (rb_pim_upstream, &pim->upstream_head, up) { + if (!PIM_UPSTREAM_FLAG_TEST_MLAG_PEER(up->flags)) + continue; + listnode_add(temp, up); + /* + * Add a reference since we are adding to this + * list for deletion + */ + up->ref_count++; + } + + while (temp->count) { + up = listnode_head(temp); + listnode_delete(temp, up); + + pim_mlag_up_peer_deref(pim, up); + /* + * This is the deletion of the reference added + * above + */ + pim_upstream_del(pim, up, __PRETTY_FUNCTION__); + } + } + + list_delete(&temp); +} + +static int pim_mlag_signal_zpthread(void) +{ + /* XXX - This is a temporary stub; the MLAG thread code is planned for + * a separate commit + */ + return (0); +} + +/* Send upstream entry to the local MLAG daemon (which will subsequently + * send it to the peer MLAG switch). + */ +static void pim_mlag_up_local_add_send(struct pim_instance *pim, + struct pim_upstream *up) +{ + struct stream *s = NULL; + struct vrf *vrf = pim->vrf; + + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) + return; + + s = stream_new(sizeof(struct mlag_mroute_add) + PIM_MLAG_METADATA_LEN); + if (!s) + return; + + if (PIM_DEBUG_MLAG) + zlog_debug("local MLAG mroute add %s:%s", + vrf->name, up->sg_str); + + ++router->mlag_stats.msg.mroute_add_tx; + + stream_putl(s, MLAG_MROUTE_ADD); + stream_put(s, vrf->name, VRF_NAMSIZ); + stream_putl(s, ntohl(up->sg.src.s_addr)); + stream_putl(s, ntohl(up->sg.grp.s_addr)); + + stream_putl(s, pim_up_mlag_local_cost(up)); + /* XXX - who is addding*/ + stream_putl(s, MLAG_OWNER_VXLAN); + /* XXX - am_i_DR field should be removed */ + stream_putc(s, false); + stream_putc(s, !(PIM_UPSTREAM_FLAG_TEST_MLAG_NON_DF(up->flags))); + stream_putl(s, vrf->vrf_id); + /* XXX - this field is a No-op for VXLAN*/ + stream_put(s, NULL, INTERFACE_NAMSIZ); + + stream_fifo_push_safe(router->mlag_fifo, s); + pim_mlag_signal_zpthread(); +} + +static void pim_mlag_up_local_del_send(struct pim_instance *pim, + struct pim_upstream *up) +{ + struct stream *s = NULL; + struct vrf *vrf = pim->vrf; + + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) + return; + + s = stream_new(sizeof(struct mlag_mroute_del) + PIM_MLAG_METADATA_LEN); + if (!s) + return; + + if (PIM_DEBUG_MLAG) + zlog_debug("local MLAG mroute del %s:%s", + vrf->name, up->sg_str); + + ++router->mlag_stats.msg.mroute_del_tx; + + stream_putl(s, MLAG_MROUTE_DEL); + stream_put(s, vrf->name, VRF_NAMSIZ); + stream_putl(s, ntohl(up->sg.src.s_addr)); + stream_putl(s, ntohl(up->sg.grp.s_addr)); + /* XXX - who is adding */ + stream_putl(s, MLAG_OWNER_VXLAN); + stream_putl(s, vrf->vrf_id); + /* XXX - this field is a No-op for VXLAN */ + stream_put(s, NULL, INTERFACE_NAMSIZ); + + /* XXX - is this the the most optimal way to do things */ + stream_fifo_push_safe(router->mlag_fifo, s); + pim_mlag_signal_zpthread(); +} + + +/* Called when a local upstream entry is created or if it's cost changes */ +void pim_mlag_up_local_add(struct pim_instance *pim, + struct pim_upstream *up) +{ + pim_mlag_up_df_role_elect(pim, up); + /* XXX - need to add some dup checks here */ + pim_mlag_up_local_add_send(pim, up); +} + +/* Called when local MLAG reference is removed from an upstream entry */ +void pim_mlag_up_local_del(struct pim_instance *pim, + struct pim_upstream *up) +{ + pim_mlag_up_df_role_elect(pim, up); + pim_mlag_up_local_del_send(pim, up); +} + +/* When connection to local MLAG daemon is established all the local + * MLAG upstream entries are replayed to it. + */ +static void pim_mlag_up_local_replay(void) +{ + struct pim_upstream *up; + struct vrf *vrf; + struct pim_instance *pim; + + RB_FOREACH(vrf, vrf_name_head, &vrfs_by_name) { + pim = vrf->info; + frr_each (rb_pim_upstream, &pim->upstream_head, up) { + if (pim_up_mlag_is_local(up)) + pim_mlag_up_local_add_send(pim, up); + } + } +} + +/* on local/peer mlag connection and role changes the DF status needs + * to be re-evaluated + */ +static void pim_mlag_up_local_reeval(bool mlagd_send, const char *reason_code) +{ + struct pim_upstream *up; + struct vrf *vrf; + struct pim_instance *pim; + + if (PIM_DEBUG_MLAG) + zlog_debug("%s re-run DF election because of %s", + __func__, reason_code); + RB_FOREACH(vrf, vrf_name_head, &vrfs_by_name) { + pim = vrf->info; + frr_each (rb_pim_upstream, &pim->upstream_head, up) { + if (!pim_up_mlag_is_local(up)) + continue; + /* if role changes re-send to peer */ + if (pim_mlag_up_df_role_elect(pim, up) && + mlagd_send) + pim_mlag_up_local_add_send(pim, up); + } + } +} + +/*****************PIM Actions for MLAG state changes**********************/ + +/* notify the anycast VTEP component about state changes */ +static inline void pim_mlag_vxlan_state_update(void) +{ + bool enable = !!(router->mlag_flags & PIM_MLAGF_STATUS_RXED); + bool peer_state = !!(router->mlag_flags & PIM_MLAGF_PEER_CONN_UP); + + pim_vxlan_mlag_update(enable, peer_state, router->mlag_role, + router->peerlink_rif_p, &router->local_vtep_ip); + +} + +/**************End of PIM Actions for MLAG State changes******************/ + /********************API to process PIM MLAG Data ************************/ static void pim_mlag_process_mlagd_state_change(struct mlag_status msg) { + bool role_chg = false; + bool state_chg = false; + bool notify_vxlan = false; + struct interface *peerlink_rif_p; char buf[MLAG_ROLE_STRSIZE]; if (PIM_DEBUG_MLAG) @@ -41,6 +489,84 @@ static void pim_mlag_process_mlagd_state_change(struct mlag_status msg) mlag_role2str(msg.my_role, buf, sizeof(buf)), (msg.peer_state == MLAG_STATE_RUNNING ? "RUNNING" : "DOWN")); + + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) { + if (PIM_DEBUG_MLAG) + zlog_debug("%s: msg ignored mlagd process state down", + __func__); + return; + } + ++router->mlag_stats.msg.mlag_status_updates; + + /* evaluate the changes first */ + if (router->mlag_role != msg.my_role) { + role_chg = true; + notify_vxlan = true; + router->mlag_role = msg.my_role; + } + + strcpy(router->peerlink_rif, msg.peerlink_rif); + /* XXX - handle the case where we may rx the interface name from the + * MLAG daemon before we get the interface from zebra. + */ + peerlink_rif_p = if_lookup_by_name(router->peerlink_rif, VRF_DEFAULT); + if (router->peerlink_rif_p != peerlink_rif_p) { + router->peerlink_rif_p = peerlink_rif_p; + notify_vxlan = true; + } + + if (msg.peer_state == MLAG_STATE_RUNNING) { + if (!(router->mlag_flags & PIM_MLAGF_PEER_CONN_UP)) { + state_chg = true; + notify_vxlan = true; + router->mlag_flags |= PIM_MLAGF_PEER_CONN_UP; + } + router->connected_to_mlag = true; + } else { + if (router->mlag_flags & PIM_MLAGF_PEER_CONN_UP) { + ++router->mlag_stats.peer_session_downs; + state_chg = true; + notify_vxlan = true; + router->mlag_flags &= ~PIM_MLAGF_PEER_CONN_UP; + } + router->connected_to_mlag = false; + } + + /* apply the changes */ + /* when connection to mlagd comes up we hold send mroutes till we have + * rxed the status and had a chance to re-valuate DF state + */ + if (!(router->mlag_flags & PIM_MLAGF_STATUS_RXED)) { + router->mlag_flags |= PIM_MLAGF_STATUS_RXED; + pim_mlag_vxlan_state_update(); + /* on session up re-eval DF status */ + pim_mlag_up_local_reeval(false /*mlagd_send*/, "mlagd_up"); + /* replay all the upstream entries to the local MLAG daemon */ + pim_mlag_up_local_replay(); + return; + } + + if (notify_vxlan) + pim_mlag_vxlan_state_update(); + + if (state_chg) { + if (!(router->mlag_flags & PIM_MLAGF_PEER_CONN_UP)) + /* when a connection goes down the primary takes over + * DF role for all entries + */ + pim_mlag_up_local_reeval(true /*mlagd_send*/, + "peer_down"); + else + /* XXX - when session comes up we need to wait for + * PEER_REPLAY_DONE before running re-election on + * local-mlag entries that are missing peer reference + */ + pim_mlag_up_local_reeval(true /*mlagd_send*/, + "peer_up"); + } else if (role_chg) { + /* MLAG role changed without a state change */ + pim_mlag_up_local_reeval(true /*mlagd_send*/, "role_chg"); + } } static void pim_mlag_process_peer_frr_state_change(struct mlag_frr_status msg) @@ -49,37 +575,116 @@ static void pim_mlag_process_peer_frr_state_change(struct mlag_frr_status msg) zlog_debug( "%s: msg dump: peer_frr_state: %s", __func__, (msg.frr_state == MLAG_FRR_STATE_UP ? "UP" : "DOWN")); + + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) { + if (PIM_DEBUG_MLAG) + zlog_debug("%s: msg ignored mlagd process state down", + __func__); + return; + } + ++router->mlag_stats.msg.peer_zebra_status_updates; + + /* evaluate the changes first */ + if (msg.frr_state == MLAG_FRR_STATE_UP) { + if (!(router->mlag_flags & PIM_MLAGF_PEER_ZEBRA_UP)) { + router->mlag_flags |= PIM_MLAGF_PEER_ZEBRA_UP; + /* XXX - when peer zebra comes up we need to wait for + * for some time to let the peer setup MDTs before + * before relinquishing DF status + */ + pim_mlag_up_local_reeval(true /*mlagd_send*/, + "zebra_up"); + } + } else { + if (router->mlag_flags & PIM_MLAGF_PEER_ZEBRA_UP) { + ++router->mlag_stats.peer_zebra_downs; + router->mlag_flags &= ~PIM_MLAGF_PEER_ZEBRA_UP; + /* when a peer zebra goes down we assume DF role */ + pim_mlag_up_local_reeval(true /*mlagd_send*/, + "zebra_down"); + } + } } static void pim_mlag_process_vxlan_update(struct mlag_vxlan *msg) { + char addr_buf1[INET_ADDRSTRLEN]; + char addr_buf2[INET_ADDRSTRLEN]; + uint32_t local_ip; + + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) { + if (PIM_DEBUG_MLAG) + zlog_debug("%s: msg ignored mlagd process state down", + __func__); + return; + } + + ++router->mlag_stats.msg.vxlan_updates; + router->anycast_vtep_ip.s_addr = htonl(msg->anycast_ip); + local_ip = htonl(msg->local_ip); + if (router->local_vtep_ip.s_addr != local_ip) { + router->local_vtep_ip.s_addr = local_ip; + pim_mlag_vxlan_state_update(); + } + + if (PIM_DEBUG_MLAG) { + inet_ntop(AF_INET, &router->local_vtep_ip, + addr_buf1, INET_ADDRSTRLEN); + inet_ntop(AF_INET, &router->anycast_vtep_ip, + addr_buf2, INET_ADDRSTRLEN); + + zlog_debug("%s: msg dump: local-ip:%s, anycast-ip:%s", + __func__, addr_buf1, addr_buf2); + } } static void pim_mlag_process_mroute_add(struct mlag_mroute_add msg) { if (PIM_DEBUG_MLAG) { zlog_debug( - "%s: msg dump: vrf_name: %s, s.ip: 0x%x, g.ip: 0x%x cost: %u", - __func__, msg.vrf_name, msg.source_ip, msg.group_ip, - msg.cost_to_rp); + "%s: msg dump: vrf_name: %s, s.ip: 0x%x, g.ip: 0x%x cost: %u", + __func__, msg.vrf_name, msg.source_ip, + msg.group_ip, msg.cost_to_rp); zlog_debug( - "owner_id: %d, DR: %d, Dual active: %d, vrf_id: 0x%x intf_name: %s", - msg.owner_id, msg.am_i_dr, msg.am_i_dual_active, - msg.vrf_id, msg.intf_name); + "owner_id: %d, DR: %d, Dual active: %d, vrf_id: 0x%x intf_name: %s", + msg.owner_id, msg.am_i_dr, msg.am_i_dual_active, + msg.vrf_id, msg.intf_name); } + + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) { + if (PIM_DEBUG_MLAG) + zlog_debug("%s: msg ignored mlagd process state down", + __func__); + return; + } + + ++router->mlag_stats.msg.mroute_add_rx; + + pim_mlag_up_peer_add(&msg); } static void pim_mlag_process_mroute_del(struct mlag_mroute_del msg) { if (PIM_DEBUG_MLAG) { zlog_debug( - "%s: msg dump: vrf_name: %s, s.ip: 0x%x, g.ip: 0x%x ", - __func__, msg.vrf_name, msg.source_ip, msg.group_ip); + "%s: msg dump: vrf_name: %s, s.ip: 0x%x, g.ip: 0x%x ", + __func__, msg.vrf_name, msg.source_ip, + msg.group_ip); zlog_debug("owner_id: %d, vrf_id: 0x%x intf_name: %s", - msg.owner_id, msg.vrf_id, msg.intf_name); + msg.owner_id, msg.vrf_id, msg.intf_name); } -} + if (!(router->mlag_flags & PIM_MLAGF_LOCAL_CONN_UP)) { + if (PIM_DEBUG_MLAG) + zlog_debug("%s: msg ignored mlagd process state down", + __func__); + return; + } + + ++router->mlag_stats.msg.mroute_del_rx; + + pim_mlag_up_peer_del(&msg); +} int pim_zebra_mlag_handle_msg(struct stream *s, int len) { @@ -179,11 +784,40 @@ int pim_zebra_mlag_process_up(void) return 0; } +static void pim_mlag_param_reset(void) +{ + /* reset the cached params and stats */ + router->mlag_flags &= ~(PIM_MLAGF_STATUS_RXED | + PIM_MLAGF_LOCAL_CONN_UP | + PIM_MLAGF_PEER_CONN_UP | + PIM_MLAGF_PEER_ZEBRA_UP); + router->local_vtep_ip.s_addr = INADDR_ANY; + router->anycast_vtep_ip.s_addr = INADDR_ANY; + router->mlag_role = MLAG_ROLE_NONE; + memset(&router->mlag_stats.msg, 0, sizeof(router->mlag_stats.msg)); + router->peerlink_rif[0] = '\0'; +} + int pim_zebra_mlag_process_down(void) { if (PIM_DEBUG_MLAG) zlog_debug("%s: Received Process-Down from Mlag", __func__); + /* Local CLAG is down, reset peer data and forward the traffic if + * we are DR + */ + if (router->mlag_flags & PIM_MLAGF_PEER_CONN_UP) + ++router->mlag_stats.peer_session_downs; + if (router->mlag_flags & PIM_MLAGF_PEER_ZEBRA_UP) + ++router->mlag_stats.peer_zebra_downs; + router->connected_to_mlag = false; + pim_mlag_param_reset(); + /* on mlagd session down re-eval DF status */ + pim_mlag_up_local_reeval(false /*mlagd_send*/, "mlagd_down"); + /* flush all peer references */ + pim_mlag_up_peer_del_all(); + /* notify the vxlan component */ + pim_mlag_vxlan_state_update(); return 0; } @@ -339,6 +973,7 @@ void pim_instance_mlag_terminate(struct pim_instance *pim) void pim_mlag_init(void) { + pim_mlag_param_reset(); router->pim_mlag_intf_cnt = 0; router->connected_to_mlag = false; router->mlag_fifo = stream_fifo_new(); diff --git a/pimd/pim_mlag.h b/pimd/pim_mlag.h index e86fdae78f..dab29cc9a2 100644 --- a/pimd/pim_mlag.h +++ b/pimd/pim_mlag.h @@ -37,4 +37,10 @@ extern void pim_mlag_deregister(void); extern int pim_zebra_mlag_process_up(void); extern int pim_zebra_mlag_process_down(void); extern int pim_zebra_mlag_handle_msg(struct stream *msg, int len); +extern void pim_mlag_up_local_add(struct pim_instance *pim, + struct pim_upstream *upstream); +extern void pim_mlag_up_local_del(struct pim_instance *pim, + struct pim_upstream *upstream); +extern bool pim_mlag_up_df_role_update(struct pim_instance *pim, + struct pim_upstream *up, bool is_df, const char *reason); #endif diff --git a/pimd/pim_rpf.c b/pimd/pim_rpf.c index 24519adb1e..889e0704c4 100644 --- a/pimd/pim_rpf.c +++ b/pimd/pim_rpf.c @@ -194,6 +194,32 @@ static int nexthop_mismatch(const struct pim_nexthop *nh1, || (nh1->mrib_route_metric != nh2->mrib_route_metric); } +static void pim_rpf_cost_change(struct pim_instance *pim, + struct pim_upstream *up, uint32_t old_cost) +{ + struct pim_rpf *rpf = &up->rpf; + uint32_t new_cost; + + new_cost = pim_up_mlag_local_cost(up); + if (PIM_DEBUG_MLAG) + zlog_debug( + "%s: Cost_to_rp of upstream-%s changed to:%u, from:%u", + __func__, up->sg_str, new_cost, old_cost); + + if (old_cost == new_cost) + return; + + /* Cost changed, it might Impact MLAG DF election, update */ + if (PIM_DEBUG_MLAG) + zlog_debug( + "%s: Cost_to_rp of upstream-%s changed to:%u", + __func__, up->sg_str, + rpf->source_nexthop.mrib_route_metric); + + if (pim_up_mlag_is_local(up)) + pim_mlag_up_local_add(pim, up); +} + enum pim_rpf_result pim_rpf_update(struct pim_instance *pim, struct pim_upstream *up, struct pim_rpf *old, const char *caller) @@ -203,6 +229,7 @@ enum pim_rpf_result pim_rpf_update(struct pim_instance *pim, struct prefix nht_p; struct prefix src, grp; bool neigh_needed = true; + uint32_t saved_mrib_route_metric; if (PIM_UPSTREAM_FLAG_TEST_STATIC_IIF(up->flags)) return PIM_RPF_OK; @@ -215,6 +242,7 @@ enum pim_rpf_result pim_rpf_update(struct pim_instance *pim, saved.source_nexthop = rpf->source_nexthop; saved.rpf_addr = rpf->rpf_addr; + saved_mrib_route_metric = pim_up_mlag_local_cost(up); if (old) { old->source_nexthop = saved.source_nexthop; old->rpf_addr = saved.rpf_addr; @@ -236,8 +264,12 @@ enum pim_rpf_result pim_rpf_update(struct pim_instance *pim, neigh_needed = false; pim_find_or_track_nexthop(pim, &nht_p, up, NULL, false, NULL); if (!pim_ecmp_nexthop_lookup(pim, &rpf->source_nexthop, &src, &grp, - neigh_needed)) + neigh_needed)) { + /* Route is Deleted in Zebra, reset the stored NH data */ + pim_upstream_rpf_clear(pim, up); + pim_rpf_cost_change(pim, up, saved_mrib_route_metric); return PIM_RPF_FAILURE; + } rpf->rpf_addr.family = AF_INET; rpf->rpf_addr.u.prefix4 = pim_rpf_find_rpf_addr(up); @@ -290,10 +322,18 @@ enum pim_rpf_result pim_rpf_update(struct pim_instance *pim, if (saved.rpf_addr.u.prefix4.s_addr != rpf->rpf_addr.u.prefix4.s_addr || saved.source_nexthop .interface != rpf->source_nexthop.interface) { - + pim_rpf_cost_change(pim, up, saved_mrib_route_metric); return PIM_RPF_CHANGED; } + if (PIM_DEBUG_MLAG) + zlog_debug( + "%s(%s): Cost_to_rp of upstream-%s changed to:%u", + __func__, caller, up->sg_str, + rpf->source_nexthop.mrib_route_metric); + + pim_rpf_cost_change(pim, up, saved_mrib_route_metric); + return PIM_RPF_OK; } diff --git a/pimd/pim_upstream.c b/pimd/pim_upstream.c index c899e403c8..2d3a44b646 100644 --- a/pimd/pim_upstream.c +++ b/pimd/pim_upstream.c @@ -52,6 +52,7 @@ #include "pim_nht.h" #include "pim_ssm.h" #include "pim_vxlan.h" +#include "pim_mlag.h" static void join_timer_stop(struct pim_upstream *up); static void @@ -883,6 +884,13 @@ static struct pim_upstream *pim_upstream_new(struct pim_instance *pim, } } + /* send the entry to the MLAG peer */ + /* XXX - duplicate send is possible here if pim_rpf_update + * successfully resolved the nexthop + */ + if (pim_up_mlag_is_local(up)) + pim_mlag_up_local_add(pim, up); + if (PIM_DEBUG_PIM_TRACE) { zlog_debug( "%s: Created Upstream %s upstream_addr %s ref count %d increment", @@ -893,6 +901,22 @@ static struct pim_upstream *pim_upstream_new(struct pim_instance *pim, return up; } +uint32_t pim_up_mlag_local_cost(struct pim_upstream *up) +{ + if (!(pim_up_mlag_is_local(up))) + return router->infinite_assert_metric.route_metric; + + return up->rpf.source_nexthop.mrib_route_metric; +} + +uint32_t pim_up_mlag_peer_cost(struct pim_upstream *up) +{ + if (!(up->flags & PIM_UPSTREAM_FLAG_MASK_MLAG_PEER)) + return router->infinite_assert_metric.route_metric; + + return up->mlag.peer_mrib_metric; +} + struct pim_upstream *pim_upstream_find(struct pim_instance *pim, struct prefix_sg *sg) { @@ -916,6 +940,15 @@ struct pim_upstream *pim_upstream_find_or_add(struct prefix_sg *sg, void pim_upstream_ref(struct pim_upstream *up, int flags, const char *name) { + /* if a local MLAG reference is being created we need to send the mroute + * to the peer + */ + if (!PIM_UPSTREAM_FLAG_TEST_MLAG_VXLAN(up->flags) && + PIM_UPSTREAM_FLAG_TEST_MLAG_VXLAN(flags)) { + PIM_UPSTREAM_FLAG_SET_MLAG_VXLAN(up->flags); + pim_mlag_up_local_add(up->pim, up); + } + /* when we go from non-FHR to FHR we need to re-eval traffic * forwarding path */ @@ -1950,8 +1983,9 @@ static void pim_upstream_sg_running(void *arg) "source reference created on kat restart %s[%s]", up->sg_str, pim->vrf->name); - pim_upstream_ref(up, PIM_UPSTREAM_FLAG_MASK_SRC_STREAM, - __PRETTY_FUNCTION__); + pim_upstream_ref(up, + PIM_UPSTREAM_FLAG_MASK_SRC_STREAM, + __PRETTY_FUNCTION__); PIM_UPSTREAM_FLAG_SET_SRC_STREAM(up->flags); pim_upstream_fhr_kat_start(up); } diff --git a/pimd/pim_upstream.h b/pimd/pim_upstream.h index 1eb2052bb3..1c1f180083 100644 --- a/pimd/pim_upstream.h +++ b/pimd/pim_upstream.h @@ -74,6 +74,8 @@ * blackholing the traffic pulled down to the LHR. */ #define PIM_UPSTREAM_FLAG_MASK_MLAG_NON_DF (1 << 17) +/* MLAG mroute rxed from the peer MLAG switch */ +#define PIM_UPSTREAM_FLAG_MASK_MLAG_PEER (1 << 18) /* * We are creating a non-joined upstream data structure * for this S,G as that we want to have a channel oil @@ -108,6 +110,7 @@ #define PIM_UPSTREAM_FLAG_TEST_SRC_VXLAN(flags) ((flags) & (PIM_UPSTREAM_FLAG_MASK_SRC_VXLAN_ORIG | PIM_UPSTREAM_FLAG_MASK_SRC_VXLAN_TERM)) #define PIM_UPSTREAM_FLAG_TEST_MLAG_VXLAN(flags) ((flags) & PIM_UPSTREAM_FLAG_MASK_MLAG_VXLAN) #define PIM_UPSTREAM_FLAG_TEST_MLAG_NON_DF(flags) ((flags) & PIM_UPSTREAM_FLAG_MASK_MLAG_NON_DF) +#define PIM_UPSTREAM_FLAG_TEST_MLAG_PEER(flags) ((flags) & PIM_UPSTREAM_FLAG_MASK_MLAG_PEER) #define PIM_UPSTREAM_FLAG_TEST_SRC_NOCACHE(flags) ((flags) &PIM_UPSTREAM_FLAG_MASK_SRC_NOCACHE) #define PIM_UPSTREAM_FLAG_TEST_USE_RPT(flags) ((flags) & PIM_UPSTREAM_FLAG_MASK_USE_RPT) @@ -129,6 +132,7 @@ #define PIM_UPSTREAM_FLAG_SET_SRC_VXLAN_TERM(flags) ((flags) |= PIM_UPSTREAM_FLAG_MASK_SRC_VXLAN_TERM) #define PIM_UPSTREAM_FLAG_SET_MLAG_VXLAN(flags) ((flags) |= PIM_UPSTREAM_FLAG_MASK_MLAG_VXLAN) #define PIM_UPSTREAM_FLAG_SET_MLAG_NON_DF(flags) ((flags) |= PIM_UPSTREAM_FLAG_MASK_MLAG_NON_DF) +#define PIM_UPSTREAM_FLAG_SET_MLAG_PEER(flags) ((flags) |= PIM_UPSTREAM_FLAG_MASK_MLAG_PEER) #define PIM_UPSTREAM_FLAG_SET_USE_RPT(flags) ((flags) |= PIM_UPSTREAM_FLAG_MASK_USE_RPT) #define PIM_UPSTREAM_FLAG_UNSET_DR_JOIN_DESIRED(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_DR_JOIN_DESIRED) @@ -149,6 +153,7 @@ #define PIM_UPSTREAM_FLAG_UNSET_SRC_VXLAN_TERM(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_SRC_VXLAN_TERM) #define PIM_UPSTREAM_FLAG_UNSET_MLAG_VXLAN(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_MLAG_VXLAN) #define PIM_UPSTREAM_FLAG_UNSET_MLAG_NON_DF(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_MLAG_NON_DF) +#define PIM_UPSTREAM_FLAG_UNSET_MLAG_PEER(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_MLAG_PEER) #define PIM_UPSTREAM_FLAG_UNSET_SRC_NOCACHE(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_SRC_NOCACHE) #define PIM_UPSTREAM_FLAG_UNSET_USE_RPT(flags) ((flags) &= ~PIM_UPSTREAM_FLAG_MASK_USE_RPT) @@ -169,6 +174,13 @@ enum pim_upstream_sptbit { PIM_UPSTREAM_SPTBIT_TRUE }; +struct pim_up_mlag { + /* MRIB.metric(S) from the peer switch. This is used for DF election + * and switch with the lowest cost wins. + */ + uint32_t peer_mrib_metric; +}; + PREDECL_RBTREE_UNIQ(rb_pim_upstream); /* Upstream (S,G) channel in Joined state @@ -218,6 +230,8 @@ struct pim_upstream { struct pim_rpf rpf; + struct pim_up_mlag mlag; + struct thread *t_join_timer; /* @@ -249,6 +263,14 @@ static inline bool pim_upstream_is_kat_running(struct pim_upstream *up) return (up->t_ka_timer != NULL); } +static inline bool pim_up_mlag_is_local(struct pim_upstream *up) +{ + /* XXX: extend this to also return true if the channel-oil has + * any AA devices + */ + return (up->flags & PIM_UPSTREAM_FLAG_MASK_MLAG_VXLAN); +} + struct pim_upstream *pim_upstream_find(struct pim_instance *pim, struct prefix_sg *sg); struct pim_upstream *pim_upstream_find_or_add(struct prefix_sg *sg, @@ -259,7 +281,8 @@ struct pim_upstream *pim_upstream_add(struct pim_instance *pim, struct interface *ifp, int flags, const char *name, struct pim_ifchannel *ch); -void pim_upstream_ref(struct pim_upstream *up, int flags, const char *name); +void pim_upstream_ref(struct pim_upstream *up, + int flags, const char *name); struct pim_upstream *pim_upstream_del(struct pim_instance *pim, struct pim_upstream *up, const char *name); @@ -350,5 +373,7 @@ void pim_upstream_fill_static_iif(struct pim_upstream *up, struct interface *incoming); void pim_upstream_update_use_rpt(struct pim_upstream *up, bool update_mroute); +uint32_t pim_up_mlag_local_cost(struct pim_upstream *up); +uint32_t pim_up_mlag_peer_cost(struct pim_upstream *up); void pim_upstream_reeval_use_rpt(struct pim_instance *pim); #endif /* PIM_UPSTREAM_H */ diff --git a/pimd/pim_vty.c b/pimd/pim_vty.c index c48ec373f8..b5a5089ae7 100644 --- a/pimd/pim_vty.c +++ b/pimd/pim_vty.c @@ -242,8 +242,6 @@ int pim_global_config_write_worker(struct pim_instance *pim, struct vty *vty) } } - pim_vxlan_config_write(vty, spaces, &writes); - return writes; } diff --git a/pimd/pim_vxlan.c b/pimd/pim_vxlan.c index 4d8fe779ae..f1f5c81c00 100644 --- a/pimd/pim_vxlan.c +++ b/pimd/pim_vxlan.c @@ -38,6 +38,7 @@ #include "pim_nht.h" #include "pim_zebra.h" #include "pim_vxlan.h" +#include "pim_mlag.h" /* pim-vxlan global info */ struct pim_vxlan vxlan_info, *pim_vxlan_p = &vxlan_info; @@ -594,7 +595,7 @@ static void pim_vxlan_term_mr_up_del(struct pim_vxlan_sg *vxlan_sg) /* clear out all the vxlan related flags */ up->flags &= ~(PIM_UPSTREAM_FLAG_MASK_SRC_VXLAN_TERM | PIM_UPSTREAM_FLAG_MASK_MLAG_VXLAN); - + pim_mlag_up_local_del(vxlan_sg->pim, up); pim_upstream_del(vxlan_sg->pim, up, __PRETTY_FUNCTION__); } @@ -825,27 +826,6 @@ void pim_vxlan_mlag_update(bool enable, bool peer_state, uint32_t role, } /****************************** misc callbacks *******************************/ -void pim_vxlan_config_write(struct vty *vty, char *spaces, int *writes) -{ - char addr_buf[INET_ADDRSTRLEN]; - - if ((vxlan_mlag.flags & PIM_VXLAN_MLAGF_ENABLED) && - vxlan_mlag.peerlink_rif) { - - inet_ntop(AF_INET, &vxlan_mlag.reg_addr, - addr_buf, sizeof(addr_buf)); - vty_out(vty, - "%sip pim mlag %s role %s state %s addr %s\n", - spaces, - vxlan_mlag.peerlink_rif->name, - (vxlan_mlag.role == PIM_VXLAN_MLAG_ROLE_PRIMARY) ? - "primary":"secondary", - vxlan_mlag.peer_state ? "up" : "down", - addr_buf); - *writes += 1; - } -} - static void pim_vxlan_set_default_iif(struct pim_instance *pim, struct interface *ifp) { diff --git a/pimd/pim_vxlan.h b/pimd/pim_vxlan.h index 22ed1f761a..4495dca6d7 100644 --- a/pimd/pim_vxlan.h +++ b/pimd/pim_vxlan.h @@ -70,14 +70,10 @@ enum pim_vxlan_mlag_flags { PIM_VXLAN_MLAGF_DO_REG = (1 << 1) }; -enum pim_vxlan_mlag_role { - PIM_VXLAN_MLAG_ROLE_SECONDARY = 0, - PIM_VXLAN_MLAG_ROLE_PRIMARY -}; - struct pim_vxlan_mlag { enum pim_vxlan_mlag_flags flags; - enum pim_vxlan_mlag_role role; + /* XXX - remove this variable from here */ + int role; bool peer_state; /* routed interface setup on top of MLAG peerlink */ struct interface *peerlink_rif; @@ -142,7 +138,6 @@ extern bool pim_vxlan_get_register_src(struct pim_instance *pim, extern void pim_vxlan_mlag_update(bool enable, bool peer_state, uint32_t role, struct interface *peerlink_rif, struct in_addr *reg_addr); -extern void pim_vxlan_config_write(struct vty *vty, char *spaces, int *writes); extern bool pim_vxlan_do_mlag_reg(void); #endif /* PIM_VXLAN_H */ diff --git a/pimd/pim_zebra.c b/pimd/pim_zebra.c index 06507b1f4c..7eb648ab86 100644 --- a/pimd/pim_zebra.c +++ b/pimd/pim_zebra.c @@ -452,7 +452,7 @@ static void pim_zebra_connected(struct zclient *zclient) static void pim_zebra_capabilities(struct zclient_capabilities *cap) { - router->role = cap->role; + router->mlag_role = cap->role; } void pim_zebra_init(void)