bgpd: add support for l3vpn per-nexthop label

This commit introduces a new method to associate a label to
prefixes to export to a VPNv4 backbone. All the methods to
associate a label to a BGP update are documented in RFC 4364,
chapter 4.3.2. Initially, the "single label for an entire
VRF" method was available. This commit adds "single label
for each attachment circuit" method.

The change impacts the control-plane, because each BGP update
is checked to know if the nexthop has reachability in the VRF
or not. If this is the case, then a unique label for a given
destination IP in the VRF will be picked up. This label will
be reused for another BGP update that will have the same
nexthop IP address.

The change impacts the data-plane, because the MPLS pop
mechanism applied to incoming labelled packets changes: the
MPLS label is popped, and the packet is directly sent to the
connected nexthop described in the previous outgoing BGP VPN
update.

By default per-vrf mode is done, but the user may choose
the per-nexthop mode, by using the vty command from the
previous commit. In the latter case, a per-vrf label
will however be allocated to handle networks that are not directly
connected. This is the case for local traffic for instance.

The change also includes the following:

-  ECMP case
A route learnt in a given VRF may be resolved via an
ECMP nexthop. This implies that when exporting the route as a BGP
update, if label allocation per nexthop is used, then two possible
MPLS values could be picked up, which is not possible with the
current implementation. Actually, the NLRI for VPNv4 stores one
prefix, and one single label value, not two. Today, RFC8277 with
multiple label capability is not yet available.
To avoid this corner case, when a route is resolved via more than one
nexthop, the label allocation per nexthop will not apply, and the
default per-vrf label will be chosen.
Let us imagine BGP redistributes a static route using the `172.31.0.20`
nexthop. The nexthop resolution will find two different nexthops for a
unique BGP update.

 > r1# show running-config
 > [..]
 > vrf vrf1
 >  ip route 172.31.0.30/32 172.31.0.20
 > r1# show bgp vrf vrf1 nexthop
 > [..]
 > 172.31.0.20 valid [IGP metric 0], #paths 1
 >  gate 192.0.2.11
 >  gate 192.0.2.12
 >  Last update: Mon Jan 16 09:27:09 2023
 >  Paths:
 >    1/1 172.31.0.30/32 VRF vrf1 flags 0x20018

To avoid this situation, BGP updates that resolve over multiple
nexthops are using the unique per-vrf label.

- recursive route case

Prefixes that need a recursive route to be resolved can
also be eligible for mpls allocation per nexthop. In that
case, the nexthop will be the recursive nexthop calculated.

To achieve this, all nexthop types in bnc contexts are valid,
except for the blackhole nexthops.

- network declared prefixes

Nexthop tracking is used to look for the reachability of the
prefixes. When the 'no bgp network import-check' command
is used, network declared prefixes are maintained active,
even if there is no active nexthop.

Signed-off-by: Philippe Guibert <philippe.guibert@6wind.com>
This commit is contained in:
Philippe Guibert 2023-02-28 14:25:02 +01:00
parent 546d58702e
commit 577be36a41
7 changed files with 327 additions and 2 deletions

View File

@ -1336,6 +1336,248 @@ leak_update(struct bgp *to_bgp, struct bgp_dest *bn,
return new;
}
/* Detach a path from the per-nexthop label cache entry it is linked to.
 *
 * Nothing is done when pi is NULL or when the path is not linked to any
 * entry. Otherwise the path is removed from the entry's path list, the
 * entry's path counter is decremented and the back pointer stored in the
 * path is cleared. When the entry's path list is empty afterwards, the
 * entry is handed to bgp_label_per_nexthop_free().
 */
void bgp_mplsvpn_path_nh_label_unlink(struct bgp_path_info *pi)
{
	struct bgp_label_per_nexthop_cache *entry;

	if (pi == NULL || pi->label_nexthop_cache == NULL)
		return;

	/* keep our own reference: the back pointer is cleared below */
	entry = pi->label_nexthop_cache;

	LIST_REMOVE(pi, label_nh_thread);
	entry->path_count--;
	pi->label_nexthop_cache = NULL;

	if (!LIST_EMPTY(&(entry->paths)))
		return;

	/* that was the last path using this entry */
	bgp_label_per_nexthop_free(entry);
}
/* Callback registered through bgp_lp_get() for a per-nexthop label request.
 * Called upon reception of a ZAPI message from zebra, about a new available
 * label.
 *
 * label:     label value being notified
 * context:   the struct bgp_label_per_nexthop_cache the request was made for
 * allocated: true when 'label' is now usable; false when a previously
 *            allocated label is no longer valid (eg: zebra deallocated the
 *            labels and notifies it)
 *
 * The label stored in the cache entry is refreshed. If it changed, and the
 * new value is valid, a ZEBRA_MPLS_LABELS_ADD is sent for the entry's
 * nexthop; then vpn_leak_from_vrf_update() is run for every path linked to
 * the entry.
 *
 * Always returns 0.
 */
static int bgp_mplsvpn_get_label_per_nexthop_cb(mpls_label_t label,
						void *context, bool allocated)
{
	struct bgp_label_per_nexthop_cache *blnc = context;
	const mpls_label_t previous = blnc->label;
	struct bgp_path_info *path;
	struct bgp_table *tbl;

	if (BGP_DEBUG(vpn, VPN_LEAK_LABEL))
		zlog_debug("%s: label=%u, allocated=%d, nexthop=%pFX", __func__,
			   label, allocated, &blnc->nexthop);

	/* store the new label, or invalidate the entry on deallocation */
	blnc->label = allocated ? label : MPLS_INVALID_LABEL;

	if (blnc->label == previous)
		return 0; /* no change */

	if (blnc->label != MPLS_INVALID_LABEL)
		bgp_zebra_send_nexthop_label(ZEBRA_MPLS_LABELS_ADD, blnc->label,
					     ZEBRA_LSP_BGP, &blnc->nexthop);

	/* Re-export every path that relies on this entry.
	 * NOTE(review): the export path can call
	 * bgp_mplsvpn_path_nh_label_unlink() on the path being visited;
	 * confirm that iterating with LIST_FOREACH is safe in that case.
	 */
	LIST_FOREACH (path, &(blnc->paths), label_nh_thread) {
		tbl = path->net ? bgp_dest_table(path->net) : NULL;
		if (tbl)
			vpn_leak_from_vrf_update(blnc->to_bgp, tbl->bgp, path);
	}

	return 0;
}
/* Get a per label nexthop value:
 * - Find and return a per label nexthop from the cache
 * - else allocate a new per label nexthop cache entry and request a
 *   label to zebra. The answer is delivered later to
 *   bgp_mplsvpn_get_label_per_nexthop_cb().
 *
 * The path is linked to the cache entry that matches its resolved nexthop,
 * and unlinked from any other entry it was attached to.
 *
 * pi:       path being exported; pi->nexthop and pi->nexthop->nexthop are
 *           dereferenced without check, the caller validates them
 * to_bgp:   stored in a newly created cache entry
 * from_bgp: instance owning the per-nexthop label cache trees
 * afi:      only used to select the per-VRF label in the defensive
 *           blackhole fallback
 * safi:     unused
 *
 * Returns the label currently stored in the cache entry.
 * NOTE(review): for a freshly created entry this is presumably
 * MPLS_INVALID_LABEL until the callback runs - confirm against
 * bgp_label_per_nexthop_new().
 */
static mpls_label_t _vpn_leak_from_vrf_get_per_nexthop_label(
	struct bgp_path_info *pi, struct bgp *to_bgp, struct bgp *from_bgp,
	afi_t afi, safi_t safi)
{
	struct bgp_nexthop_cache *bnc = pi->nexthop;
	struct bgp_label_per_nexthop_cache *blnc;
	struct bgp_label_per_nexthop_cache_head *tree;
	struct prefix *nh_pfx = NULL;
	struct prefix nh_gate = {0};

	/* extract the nexthop from the BNC nexthop cache */
	switch (bnc->nexthop->type) {
	case NEXTHOP_TYPE_IPV4:
	case NEXTHOP_TYPE_IPV4_IFINDEX:
		/* the nexthop is recursive */
		nh_gate.family = AF_INET;
		nh_gate.prefixlen = IPV4_MAX_BITLEN;
		IPV4_ADDR_COPY(&nh_gate.u.prefix4, &bnc->nexthop->gate.ipv4);
		nh_pfx = &nh_gate;
		break;
	case NEXTHOP_TYPE_IPV6:
	case NEXTHOP_TYPE_IPV6_IFINDEX:
		/* the nexthop is recursive */
		nh_gate.family = AF_INET6;
		nh_gate.prefixlen = IPV6_MAX_BITLEN;
		IPV6_ADDR_COPY(&nh_gate.u.prefix6, &bnc->nexthop->gate.ipv6);
		nh_pfx = &nh_gate;
		break;
	case NEXTHOP_TYPE_IFINDEX:
		/* the nexthop is directly connected */
		nh_pfx = &bnc->prefix;
		break;
	case NEXTHOP_TYPE_BLACKHOLE:
		assert(!"Blackhole nexthop. Already checked by the caller.");
		break;
	}

	if (!nh_pfx) {
		/* Only reachable for a blackhole nexthop when the assert
		 * above does not abort. Never dereference a NULL prefix:
		 * behave as the caller does for blackhole routes and fall
		 * back to the per-VRF label.
		 */
		bgp_mplsvpn_path_nh_label_unlink(pi);
		return from_bgp->vpn_policy[afi].tovpn_label;
	}

	/* find or allocate a nexthop label cache entry */
	tree = &from_bgp->mpls_labels_per_nexthop[family2afi(nh_pfx->family)];
	blnc = bgp_label_per_nexthop_find(tree, nh_pfx);
	if (!blnc) {
		blnc = bgp_label_per_nexthop_new(tree, nh_pfx);
		blnc->to_bgp = to_bgp;
		/* request a label to zebra for this nexthop
		 * the response from zebra will trigger the callback
		 */
		bgp_lp_get(LP_TYPE_NEXTHOP, blnc,
			   bgp_mplsvpn_get_label_per_nexthop_cb);
	}

	if (pi->label_nexthop_cache == blnc)
		/* no change */
		return blnc->label;

	/* Unlink from any existing nexthop cache. Free the entry if unused.
	 */
	bgp_mplsvpn_path_nh_label_unlink(pi);

	/* updates NHT pi list reference.
	 * blnc cannot be NULL here: it was either found or just created
	 * and is already dereferenced above.
	 */
	LIST_INSERT_HEAD(&(blnc->paths), pi, label_nh_thread);
	pi->label_nexthop_cache = blnc;
	blnc->path_count++;

	return blnc->label;
}
/* Select the label to use for a path exported from a VRF when the
 * per-nexthop label mode is active.
 *
 * Possible outcomes:
 * - the per-VRF label (from_bgp->vpn_policy[afi].tovpn_label) when a
 *   per-nexthop label is not applicable: badly formatted nexthop address,
 *   unreachable "network" prefix with 'no bgp network import-check',
 *   ECMP or blackhole nexthop;
 * - MPLS_INVALID_LABEL when the nexthop is not valid or not resolved;
 * - otherwise, the value returned by
 *   _vpn_leak_from_vrf_get_per_nexthop_label().
 *
 * In the first two outcomes the path is unlinked from any per-nexthop
 * label cache entry.
 */
static mpls_label_t vpn_leak_from_vrf_get_per_nexthop_label(
	afi_t afi, safi_t safi, struct bgp_path_info *pi, struct bgp *from_bgp,
	struct bgp *to_bgp)
{
	struct bgp_path_info *ultimate = bgp_get_imported_bpi_ultimate(pi);
	struct bgp *origin_bgp;
	afi_t nexthop_afi;
	bool static_route;
	bool reachable;
	bool bad_address = false;

	static_route = (ultimate->sub_type == BGP_ROUTE_STATIC &&
			ultimate->type == ZEBRA_ROUTE_BGP);

	/* Nexthop address sanity checks; skipped for BGP static routes. */
	if (!static_route && afi == AFI_IP) {
		if (CHECK_FLAG(pi->attr->flag,
			       ATTR_FLAG_BIT(BGP_ATTR_NEXT_HOP)) &&
		    (pi->attr->nexthop.s_addr == INADDR_ANY ||
		     !ipv4_unicast_valid(&pi->attr->nexthop))) {
			/* IPv4 nexthop in standard BGP encoding format:
			 * unspecified or not a valid unicast address.
			 */
			bad_address = true;
		} else if (pi->attr->mp_nexthop_len == BGP_ATTR_NHLEN_IPV4 &&
			   (pi->attr->mp_nexthop_global_in.s_addr ==
				    INADDR_ANY ||
			    !ipv4_unicast_valid(
				    &pi->attr->mp_nexthop_global_in))) {
			/* IPv4 nexthop in MP-BGP encoding format:
			 * unspecified or not a valid unicast address.
			 */
			bad_address = true;
		}
	} else if (!static_route && afi == AFI_IP6) {
		if ((pi->attr->mp_nexthop_len == BGP_ATTR_NHLEN_IPV6_GLOBAL ||
		     pi->attr->mp_nexthop_len ==
			     BGP_ATTR_NHLEN_IPV6_GLOBAL_AND_LL) &&
		    (IN6_IS_ADDR_UNSPECIFIED(&pi->attr->mp_nexthop_global) ||
		     IN6_IS_ADDR_LOOPBACK(&pi->attr->mp_nexthop_global) ||
		     IN6_IS_ADDR_MULTICAST(&pi->attr->mp_nexthop_global))) {
			/* IPv6 nexthop in MP-BGP encoding format:
			 * unspecified, loopback or multicast address.
			 */
			bad_address = true;
		}
	}

	if (bad_address)
		goto use_per_vrf_label;

	/* Check the next-hop reachability, from the bgp instance where
	 * the bgp_path_info originates.
	 */
	origin_bgp = (pi->extra && pi->extra->bgp_orig) ? pi->extra->bgp_orig
							 : from_bgp;
	nexthop_afi = BGP_ATTR_NH_AFI(afi, pi->attr);
	reachable = bgp_find_or_add_nexthop(from_bgp, origin_bgp, nexthop_afi,
					    safi, pi, NULL, 0, NULL);

	if (!reachable && static_route &&
	    !CHECK_FLAG(from_bgp->flags, BGP_FLAG_IMPORT_CHECK)) {
		/* "network" prefixes are not routable, but with 'no bgp
		 * network import-check' they are always valid in the BGP
		 * table: use the per-VRF label.
		 */
		goto use_per_vrf_label;
	}

	if (!reachable || !pi->nexthop || pi->nexthop->nexthop_num == 0 ||
	    !pi->nexthop->nexthop) {
		/* Invalid next-hop: do not hand out the per-VRF label.
		 * Otherwise, once the next-hop becomes valid, two BGP
		 * updates would be sent: one with the per-VRF label, then
		 * one with the per-nexthop label.
		 */
		bgp_mplsvpn_path_nh_label_unlink(pi);
		return MPLS_INVALID_LABEL;
	}

	/* Blackhole or ECMP routes are not compatible with a per-nexthop
	 * label.
	 */
	if (pi->nexthop->nexthop_num > 1 ||
	    pi->nexthop->nexthop->type == NEXTHOP_TYPE_BLACKHOLE)
		goto use_per_vrf_label;

	return _vpn_leak_from_vrf_get_per_nexthop_label(pi, to_bgp, from_bgp,
							afi, safi);

use_per_vrf_label:
	/* fallback: drop any per-nexthop link and use the per-VRF label */
	bgp_mplsvpn_path_nh_label_unlink(pi);
	return from_bgp->vpn_policy[afi].tovpn_label;
}
/* cf vnc_import_bgp_add_route_mode_nvegroup() and add_vnc_route() */
void vpn_leak_from_vrf_update(struct bgp *to_bgp, /* to */
struct bgp *from_bgp, /* from */
@ -1528,7 +1770,28 @@ void vpn_leak_from_vrf_update(struct bgp *to_bgp, /* to */
nexthop_self_flag = 1;
}
label_val = from_bgp->vpn_policy[afi].tovpn_label;
if (CHECK_FLAG(from_bgp->vpn_policy[afi].flags,
BGP_VPN_POLICY_TOVPN_LABEL_PER_NEXTHOP))
/* per nexthop label mode */
label_val = vpn_leak_from_vrf_get_per_nexthop_label(
afi, safi, path_vrf, from_bgp, to_bgp);
else
/* per VRF label mode */
label_val = from_bgp->vpn_policy[afi].tovpn_label;
if (label_val == MPLS_INVALID_LABEL &&
CHECK_FLAG(from_bgp->vpn_policy[afi].flags,
BGP_VPN_POLICY_TOVPN_LABEL_PER_NEXTHOP)) {
/* no valid label for the moment
* when the 'bgp_mplsvpn_get_label_per_nexthop_cb' callback gets
* a valid label value, it will call the current function again.
*/
if (debug)
zlog_debug(
"%s: %s skipping: waiting for a valid per-label nexthop.",
__func__, from_bgp->name_pretty);
return;
}
if (label_val == MPLS_LABEL_NONE)
encode_label(MPLS_LABEL_IMPLICIT_NULL, &label);
else
@ -1769,6 +2032,8 @@ void vpn_leak_from_vrf_withdraw_all(struct bgp *to_bgp, struct bgp *from_bgp,
bpi, afi, safi);
bgp_path_info_delete(bn, bpi);
bgp_process(to_bgp, bn, afi, safi);
bgp_mplsvpn_path_nh_label_unlink(
bpi->extra->parent);
}
}
}

View File

@ -31,6 +31,7 @@
#define BGP_PREFIX_SID_SRV6_MAX_FUNCTION_LENGTH 20
extern void bgp_mplsvpn_init(void);
extern void bgp_mplsvpn_path_nh_label_unlink(struct bgp_path_info *pi);
extern int bgp_nlri_parse_vpn(struct peer *, struct attr *, struct bgp_nlri *);
extern uint32_t decode_label(mpls_label_t *);
extern void encode_label(mpls_label_t, mpls_label_t *);

View File

@ -31,6 +31,7 @@
#include "bgpd/bgp_fsm.h"
#include "bgpd/bgp_vty.h"
#include "bgpd/bgp_rd.h"
#include "bgpd/bgp_mplsvpn.h"
DEFINE_MTYPE_STATIC(BGPD, MARTIAN_STRING, "BGP Martian Addr Intf String");
@ -119,6 +120,8 @@ static void bgp_nexthop_cache_reset(struct bgp_nexthop_cache_head *tree)
while (!LIST_EMPTY(&(bnc->paths))) {
struct bgp_path_info *path = LIST_FIRST(&(bnc->paths));
bgp_mplsvpn_path_nh_label_unlink(path);
path_nh_map(path, bnc, false);
}

View File

@ -31,6 +31,7 @@
#include "bgpd/bgp_flowspec_util.h"
#include "bgpd/bgp_evpn.h"
#include "bgpd/bgp_rd.h"
#include "bgpd/bgp_mplsvpn.h"
extern struct zclient *zclient;
@ -149,6 +150,8 @@ void bgp_unlink_nexthop(struct bgp_path_info *path)
{
struct bgp_nexthop_cache *bnc = path->nexthop;
bgp_mplsvpn_path_nh_label_unlink(path);
if (!bnc)
return;
@ -1230,7 +1233,16 @@ void evaluate_paths(struct bgp_nexthop_cache *bnc)
SET_FLAG(path->flags, BGP_PATH_IGP_CHANGED);
path_valid = CHECK_FLAG(path->flags, BGP_PATH_VALID);
if (path_valid != bnc_is_valid_nexthop) {
if (path->type == ZEBRA_ROUTE_BGP &&
path->sub_type == BGP_ROUTE_STATIC &&
!CHECK_FLAG(bgp_path->flags, BGP_FLAG_IMPORT_CHECK))
/* static routes with 'no bgp network import-check' are
* always valid. if nht is called with static routes,
* the vpn exportation needs to be triggered
*/
vpn_leak_from_vrf_update(bgp_get_default(), bgp_path,
path);
else if (path_valid != bnc_is_valid_nexthop) {
if (path_valid) {
/* No longer valid, clear flag; also for EVPN
* routes, unimport from VRFs if needed.
@ -1243,6 +1255,12 @@ void evaluate_paths(struct bgp_nexthop_cache *bnc)
bgp_evpn_is_prefix_nht_supported(bgp_dest_get_prefix(dest)))
bgp_evpn_unimport_route(bgp_path,
afi, safi, bgp_dest_get_prefix(dest), path);
if (safi == SAFI_UNICAST &&
(bgp_path->inst_type !=
BGP_INSTANCE_TYPE_VIEW))
vpn_leak_from_vrf_withdraw(
bgp_get_default(), bgp_path,
path);
} else {
/* Path becomes valid, set flag; also for EVPN
* routes, import from VRFs if needed.
@ -1255,6 +1273,12 @@ void evaluate_paths(struct bgp_nexthop_cache *bnc)
bgp_evpn_is_prefix_nht_supported(bgp_dest_get_prefix(dest)))
bgp_evpn_import_route(bgp_path,
afi, safi, bgp_dest_get_prefix(dest), path);
if (safi == SAFI_UNICAST &&
(bgp_path->inst_type !=
BGP_INSTANCE_TYPE_VIEW))
vpn_leak_from_vrf_update(
bgp_get_default(), bgp_path,
path);
}
}

View File

@ -319,6 +319,12 @@ struct bgp_path_info {
/* Addpath identifiers */
uint32_t addpath_rx_id;
struct bgp_addpath_info_data tx_addpath;
/* For nexthop per label linked list */
LIST_ENTRY(bgp_path_info) label_nh_thread;
/* Back pointer to the bgp label per nexthop structure */
struct bgp_label_per_nexthop_cache *label_nexthop_cache;
};
/* Structure used in BGP path selection */

View File

@ -3911,3 +3911,26 @@ int bgp_zebra_srv6_manager_release_locator_chunk(const char *name)
{
return srv6_manager_release_locator_chunk(zclient, name);
}
/* Build a zapi_labels message holding a single nexthop and pass it to
 * zebra_send_mpls_labels().
 *
 * cmd:   command forwarded as-is to zebra_send_mpls_labels()
 * label: stored as the local label of the message
 * ltype: stored as the type of the message
 * p:     prefix whose address is used as the nexthop gateway; AF_INET
 *        gives an IPv4 nexthop, any other family is handled as IPv6
 */
void bgp_zebra_send_nexthop_label(int cmd, mpls_label_t label,
				  enum lsp_types_t ltype, struct prefix *p)
{
	struct zapi_labels labels_msg = {};
	struct zapi_nexthop *nh = &labels_msg.nexthops[0];

	labels_msg.type = ltype;
	labels_msg.local_label = label;
	labels_msg.nexthop_num = 1;

	if (p->family == AF_INET) {
		nh->type = NEXTHOP_TYPE_IPV4;
		IPV4_ADDR_COPY(&nh->gate.ipv4, &p->u.prefix4);
	} else {
		nh->type = NEXTHOP_TYPE_IPV6;
		IPV6_ADDR_COPY(&nh->gate.ipv6, &p->u.prefix6);
	}

	/* no outgoing interface and no outgoing label on the nexthop */
	nh->ifindex = 0;
	nh->label_num = 0;

	/* vrf_id is DEFAULT_VRF (field left at its zero-initialized value) */
	zebra_send_mpls_labels(zclient, cmd, &labels_msg);
}

View File

@ -118,4 +118,7 @@ extern int bgp_zebra_update(struct bgp *bgp, afi_t afi, safi_t safi,
extern int bgp_zebra_stale_timer_update(struct bgp *bgp);
extern int bgp_zebra_srv6_manager_get_locator_chunk(const char *name);
extern int bgp_zebra_srv6_manager_release_locator_chunk(const char *name);
extern void bgp_zebra_send_nexthop_label(int cmd, mpls_label_t label,
enum lsp_types_t ltype,
struct prefix *p);
#endif /* _QUAGGA_BGP_ZEBRA_H */