diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c index a3d40849ce..f66f56cb49 100644 --- a/bgpd/bgp_mpath.c +++ b/bgpd/bgp_mpath.c @@ -419,6 +419,10 @@ static void bgp_path_info_mpath_lb_update(struct bgp_path_info *path, bool set, return; } if (set) { + if (cum_bw) + SET_FLAG(mpath->mp_flags, BGP_MP_LB_PRESENT); + else + UNSET_FLAG(mpath->mp_flags, BGP_MP_LB_PRESENT); if (all_paths_lb) SET_FLAG(mpath->mp_flags, BGP_MP_LB_ALL); else @@ -446,14 +450,24 @@ struct attr *bgp_path_info_mpath_attr(struct bgp_path_info *path) /* * bgp_path_info_chkwtd * - * Given bestpath bgp_path_info, return if we should attempt to - * do weighted ECMP or not + * Return if we should attempt to do weighted ECMP or not + * The path passed in is the bestpath. */ -bool bgp_path_info_mpath_chkwtd(struct bgp_path_info *path) +bool bgp_path_info_mpath_chkwtd(struct bgp *bgp, struct bgp_path_info *path) { - if (!path->mpath) + /* Check if told to ignore weights or not multipath */ + if (bgp->lb_handling == BGP_LINK_BW_IGNORE_BW || !path->mpath) return false; - return (path->mpath->mp_flags & BGP_MP_LB_ALL); + + /* All paths in multipath should have associated weight (bandwidth) + * unless told explicitly otherwise. + */ + if (bgp->lb_handling != BGP_LINK_BW_SKIP_MISSING && + bgp->lb_handling != BGP_LINK_BW_DEFWT_4_MISSING) + return (path->mpath->mp_flags & BGP_MP_LB_ALL); + + /* At least one path should have bandwidth. */ + return (path->mpath->mp_flags & BGP_MP_LB_PRESENT); } /* diff --git a/bgpd/bgp_mpath.h b/bgpd/bgp_mpath.h index 6d3df29415..34f94b256b 100644 --- a/bgpd/bgp_mpath.h +++ b/bgpd/bgp_mpath.h @@ -40,8 +40,8 @@ struct bgp_path_info_mpath { /* Flags - relevant as noted. */ uint16_t mp_flags; -/* Attached to best path, indicates that all multipaths have link-bandwidth */ -#define BGP_MP_LB_ALL 0x1 +#define BGP_MP_LB_PRESENT 0x1 /* Link-bandwidth present for >= 1 path */ +#define BGP_MP_LB_ALL 0x2 /* Link-bandwidth present for all multipaths */ /* Aggregated attribute for advertising multipath route */ struct attr *mp_attr; @@ -86,7 +86,8 @@ bgp_path_info_mpath_next(struct bgp_path_info *path); /* Accessors for multipath information */ extern uint32_t bgp_path_info_mpath_count(struct bgp_path_info *path); extern struct attr *bgp_path_info_mpath_attr(struct bgp_path_info *path); -extern bool bgp_path_info_mpath_chkwtd(struct bgp_path_info *path); +extern bool bgp_path_info_mpath_chkwtd(struct bgp *bgp, + struct bgp_path_info *path); extern uint64_t bgp_path_info_mpath_cumbw(struct bgp_path_info *path); #endif /* _QUAGGA_BGP_MPATH_H */ diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c index 8bb4bb075b..2009375c89 100644 --- a/bgpd/bgp_route.c +++ b/bgpd/bgp_route.c @@ -2066,7 +2066,7 @@ bool subgroup_announce_check(struct bgp_node *rn, struct bgp_path_info *pi, * been explicitly set by user policy. */ if (nh_reset && - bgp_path_info_mpath_chkwtd(pi) && + bgp_path_info_mpath_chkwtd(bgp, pi) && (cum_bw = bgp_path_info_mpath_cumbw(pi)) != 0 && !CHECK_FLAG(attr->rmap_change_flags, BATTR_RMAP_LINK_BW_SET)) attr->ecommunity = ecommunity_replace_linkbw( diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c index aea6dbbf4e..544bee2a54 100644 --- a/bgpd/bgp_vty.c +++ b/bgpd/bgp_vty.c @@ -2973,6 +2973,49 @@ DEFUN (no_bgp_bestpath_med, return CMD_SUCCESS; } +/* "bgp bestpath bandwidth" configuration. */ +DEFPY (bgp_bestpath_bw, + bgp_bestpath_bw_cmd, + "[no$no] bgp bestpath bandwidth [$bw_cfg]", + NO_STR + "BGP specific commands\n" + "Change the default bestpath selection\n" + "Link Bandwidth attribute\n" + "Ignore link bandwidth (i.e., do regular ECMP, not weighted)\n" + "Ignore paths without link bandwidth for ECMP (if other paths have it)\n" + "Assign a low default weight (value 1) to paths not having link bandwidth\n") +{ + VTY_DECLVAR_CONTEXT(bgp, bgp); + afi_t afi; + safi_t safi; + + if (no) { + bgp->lb_handling = BGP_LINK_BW_ECMP; + } else { + if (!bw_cfg) { + vty_out(vty, "%% Bandwidth configuration must be specified\n"); + return CMD_ERR_INCOMPLETE; + } + if (!strcmp(bw_cfg, "ignore")) + bgp->lb_handling = BGP_LINK_BW_IGNORE_BW; + else if (!strcmp(bw_cfg, "skip-missing")) + bgp->lb_handling = BGP_LINK_BW_SKIP_MISSING; + else if (!strcmp(bw_cfg, "default-weight-for-missing")) + bgp->lb_handling = BGP_LINK_BW_DEFWT_4_MISSING; + else + return CMD_ERR_NO_MATCH; + } + + /* This config is used in route install, so redo that. */ + FOREACH_AFI_SAFI (afi, safi) { + if (!bgp_fibupd_safi(safi)) + continue; + bgp_zebra_announce_table(bgp, afi, safi); + } + + return CMD_SUCCESS; +} + /* "no bgp default ipv4-unicast". */ DEFUN (no_bgp_default_ipv4_unicast, no_bgp_default_ipv4_unicast_cmd, @@ -15161,6 +15204,14 @@ int bgp_config_write(struct vty *vty) vty_out(vty, "\n"); } + /* Link bandwidth handling. */ + if (bgp->lb_handling == BGP_LINK_BW_IGNORE_BW) + vty_out(vty, " bgp bestpath bandwidth ignore\n"); + else if (bgp->lb_handling == BGP_LINK_BW_SKIP_MISSING) + vty_out(vty, " bgp bestpath bandwidth skip-missing\n"); + else if (bgp->lb_handling == BGP_LINK_BW_DEFWT_4_MISSING) + vty_out(vty, " bgp bestpath bandwidth default-weight-for-missing\n"); + /* BGP network import check. */ if (!!CHECK_FLAG(bgp->flags, BGP_FLAG_IMPORT_CHECK) != SAVE_BGP_IMPORT_CHECK) @@ -15567,6 +15618,9 @@ void bgp_vty_init(void) install_element(BGP_NODE, &bgp_bestpath_med_cmd); install_element(BGP_NODE, &no_bgp_bestpath_med_cmd); + /* "bgp bestpath bandwidth" commands */ + install_element(BGP_NODE, &bgp_bestpath_bw_cmd); + /* "no bgp default ipv4-unicast" commands. */ install_element(BGP_NODE, &no_bgp_default_ipv4_unicast_cmd); install_element(BGP_NODE, &bgp_default_ipv4_unicast_cmd); diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c index db99811d5c..b01bb788a5 100644 --- a/bgpd/bgp_zebra.c +++ b/bgpd/bgp_zebra.c @@ -1148,10 +1148,29 @@ static bool update_ipv6nh_for_route_install(int nh_othervrf, struct bgp *nh_bgp, return true; } -static uint32_t bgp_zebra_nhop_weight(uint32_t bw, uint64_t tot_bw) +static bool bgp_zebra_use_nhop_weighted(struct bgp *bgp, struct attr *attr, + uint64_t tot_bw, uint32_t *nh_weight) { - uint64_t tmp = (uint64_t)bw * 100; - return ((uint32_t)(tmp / tot_bw)); + uint32_t bw; + uint64_t tmp; + + bw = attr->link_bw; + /* zero link-bandwidth and link-bandwidth not present are treated + * as the same situation. + */ + if (!bw) { + /* the only situations should be if we're either told + * to skip or use default weight. + */ + if (bgp->lb_handling == BGP_LINK_BW_SKIP_MISSING) + return false; + *nh_weight = BGP_ZEBRA_DEFAULT_NHOP_WEIGHT; + } else { + tmp = (uint64_t)bw * 100; + *nh_weight = ((uint32_t)(tmp / tot_bw)); + } + + return true; } void bgp_zebra_announce(struct bgp_node *rn, const struct prefix *p, @@ -1250,15 +1269,18 @@ void bgp_zebra_announce(struct bgp_node *rn, const struct prefix *p, metric = info->attr->med; /* Determine if we're doing weighted ECMP or not */ - do_wt_ecmp = bgp_path_info_mpath_chkwtd(info); + do_wt_ecmp = bgp_path_info_mpath_chkwtd(bgp, info); if (do_wt_ecmp) cum_bw = bgp_path_info_mpath_cumbw(info); for (mpinfo = info; mpinfo; mpinfo = bgp_path_info_mpath_next(mpinfo)) { + uint32_t nh_weight; + if (valid_nh_count >= multipath_num) break; *mpinfo_cp = *mpinfo; + nh_weight = 0; /* Get nexthop address-family */ if (p->family == AF_INET @@ -1271,6 +1293,15 @@ void bgp_zebra_announce(struct bgp_node *rn, const struct prefix *p, else continue; + /* If processing for weighted ECMP, determine the next hop's + * weight. Based on user setting, we may skip the next hop + * in some situations. + */ + if (do_wt_ecmp) { + if (!bgp_zebra_use_nhop_weighted(bgp, mpinfo->attr, + cum_bw, &nh_weight)) + continue; + } api_nh = &api.nexthops[valid_nh_count]; if (nh_family == AF_INET) { if (bgp_debug_zebra(&api.prefix)) { @@ -1370,11 +1401,8 @@ void bgp_zebra_announce(struct bgp_node *rn, const struct prefix *p, } memcpy(&api_nh->rmac, &(mpinfo->attr->rmac), sizeof(struct ethaddr)); + api_nh->weight = nh_weight; - /* Update next hop's weight for weighted ECMP */ - if (do_wt_ecmp) - api_nh->weight = bgp_zebra_nhop_weight( - mpinfo->attr->link_bw, cum_bw); valid_nh_count++; } diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h index e546cd5da7..a069d01503 100644 --- a/bgpd/bgp_zebra.h +++ b/bgpd/bgp_zebra.h @@ -23,6 +23,9 @@ #include "vxlan.h" +/* Default weight for next hop, if doing weighted ECMP. */ +#define BGP_ZEBRA_DEFAULT_NHOP_WEIGHT 1 + extern void bgp_zebra_init(struct thread_master *master, unsigned short instance); extern void bgp_zebra_init_tm_connect(struct bgp *bgp); diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index d1363b70a4..3124efbb65 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -2971,6 +2971,7 @@ static struct bgp *bgp_create(as_t *as, const char *name, bgp->dynamic_neighbors_limit = BGP_DYNAMIC_NEIGHBORS_LIMIT_DEFAULT; bgp->dynamic_neighbors_count = 0; bgp->lb_ref_bw = BGP_LINK_BW_REF_BW; + bgp->lb_handling = BGP_LINK_BW_ECMP; bgp->ebgp_requires_policy = DEFAULT_EBGP_POLICY_DISABLED; bgp->reject_as_sets = BGP_REJECT_AS_SETS_DISABLED; bgp_addpath_init_bgp_data(&bgp->tx_addpath); diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 24ea66b5a1..f6f9687783 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -284,6 +284,20 @@ enum global_gr_command { #define BGP_GR_SUCCESS 0 #define BGP_GR_FAILURE 1 +/* Handling of BGP link bandwidth (LB) on receiver - whether and how to + * do weighted ECMP. Note: This applies after multipath computation. + */ +enum bgp_link_bw_handling { + /* Do ECMP if some paths don't have LB - default */ + BGP_LINK_BW_ECMP, + /* Completely ignore LB, just do regular ECMP */ + BGP_LINK_BW_IGNORE_BW, + /* Skip paths without LB, do wECMP on others */ + BGP_LINK_BW_SKIP_MISSING, + /* Do wECMP with default weight for paths not having LB */ + BGP_LINK_BW_DEFWT_4_MISSING +}; + /* BGP instance structure. */ struct bgp { /* AS number of this BGP instance. */ @@ -658,6 +672,9 @@ struct bgp { /* Count of peers in established state */ uint32_t established_peers; + /* Weighted ECMP related config. */ + enum bgp_link_bw_handling lb_handling; + QOBJ_FIELDS }; DECLARE_QOBJ_TYPE(bgp)