diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c index c8b7e5155f..9e1c89b71c 100644 --- a/bgpd/bgp_io.c +++ b/bgpd/bgp_io.c @@ -22,6 +22,7 @@ /* clang-format off */ #include #include // for pthread_mutex_unlock, pthread_mutex_lock +#include // for writev #include "frr_pthread.h" #include "linklist.h" // for list_delete, list_delete_all_node, lis... @@ -275,35 +276,96 @@ static uint16_t bgp_write(struct peer *peer) { uint8_t type; struct stream *s; - int num; int update_last_write = 0; - unsigned int count = 0; + unsigned int count; uint32_t uo = 0; uint16_t status = 0; uint32_t wpkt_quanta_old; + int writenum = 0; + int num; + unsigned int iovsz; + unsigned int strmsz; + unsigned int total_written; + wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta, memory_order_relaxed); + struct stream *ostreams[wpkt_quanta_old]; + struct stream **streams = ostreams; + struct iovec iov[wpkt_quanta_old]; - while (count < wpkt_quanta_old && (s = stream_fifo_head(peer->obuf))) { - int writenum; - do { - writenum = stream_get_endp(s) - stream_get_getp(s); - num = write(peer->fd, stream_pnt(s), writenum); + s = stream_fifo_head(peer->obuf); - if (num < 0) { - if (!ERRNO_IO_RETRY(errno)) { - BGP_EVENT_ADD(peer, TCP_fatal_error); - SET_FLAG(status, BGP_IO_FATAL_ERR); - } else { - SET_FLAG(status, BGP_IO_TRANS_ERR); - } + if (!s) + goto done; - goto done; - } else if (num != writenum) - stream_forward_getp(s, num); + count = iovsz = 0; + while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) { + ostreams[iovsz] = s; + iov[iovsz].iov_base = stream_pnt(s); + iov[iovsz].iov_len = STREAM_READABLE(s); + writenum += STREAM_READABLE(s); + s = s->next; + ++iovsz; + ++count; + } - } while (num != writenum); + strmsz = iovsz; + total_written = 0; + + do { + num = writev(peer->fd, iov, iovsz); + + if (num < 0) { + if (!ERRNO_IO_RETRY(errno)) { + BGP_EVENT_ADD(peer, TCP_fatal_error); + SET_FLAG(status, BGP_IO_FATAL_ERR); + } else { + SET_FLAG(status, BGP_IO_TRANS_ERR); + } + + break; + } else if (num != writenum) { + unsigned int msg_written = 0; + unsigned int ic = iovsz; + + for (unsigned int i = 0; i < ic; i++) { + size_t ss = iov[i].iov_len; + + if (ss > (unsigned int) num) + break; + + msg_written++; + iovsz--; + writenum -= ss; + num -= ss; + } + + total_written += msg_written; + + assert(total_written < count); + + memmove(&iov, &iov[msg_written], + sizeof(iov[0]) * iovsz); + streams = &streams[msg_written]; + stream_forward_getp(streams[0], num); + iov[0].iov_base = stream_pnt(streams[0]); + iov[0].iov_len = STREAM_READABLE(streams[0]); + + writenum -= num; + num = 0; + assert(writenum > 0); + } else { + total_written = strmsz; + } + + } while (num != writenum); + + /* Handle statistics */ + for (unsigned int i = 0; i < total_written; i++) { + s = stream_fifo_pop(peer->obuf); + + assert(s == ostreams[i]); /* Retrieve BGP packet type. */ stream_set_getp(s, BGP_MARKER_SIZE + 2); @@ -351,9 +413,8 @@ static uint16_t bgp_write(struct peer *peer) break; } - count++; - - stream_free(stream_fifo_pop(peer->obuf)); + stream_free(s); + ostreams[i] = NULL; update_last_write = 1; } diff --git a/bgpd/bgp_io.h b/bgpd/bgp_io.h index 14a12d3705..75d014f38e 100644 --- a/bgpd/bgp_io.h +++ b/bgpd/bgp_io.h @@ -22,7 +22,7 @@ #ifndef _FRR_BGP_IO_H #define _FRR_BGP_IO_H -#define BGP_WRITE_PACKET_MAX 10U +#define BGP_WRITE_PACKET_MAX 64U #define BGP_READ_PACKET_MAX 10U #include "bgpd/bgpd.h" diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c index dd880768b8..eae9db5a61 100644 --- a/bgpd/bgp_vty.c +++ b/bgpd/bgp_vty.c @@ -1586,36 +1586,24 @@ DEFUN (no_bgp_update_delay, } -static int bgp_wpkt_quanta_config_vty(struct vty *vty, const char *num, - char set) +static int bgp_wpkt_quanta_config_vty(struct vty *vty, uint32_t quanta, + bool set) { VTY_DECLVAR_CONTEXT(bgp, bgp); - if (set) { - uint32_t quanta = strtoul(num, NULL, 10); - atomic_store_explicit(&bgp->wpkt_quanta, quanta, - memory_order_relaxed); - } else { - atomic_store_explicit(&bgp->wpkt_quanta, BGP_WRITE_PACKET_MAX, - memory_order_relaxed); - } + quanta = set ? quanta : BGP_WRITE_PACKET_MAX; + atomic_store_explicit(&bgp->wpkt_quanta, quanta, memory_order_relaxed); return CMD_SUCCESS; } -static int bgp_rpkt_quanta_config_vty(struct vty *vty, const char *num, - char set) +static int bgp_rpkt_quanta_config_vty(struct vty *vty, uint32_t quanta, + bool set) { VTY_DECLVAR_CONTEXT(bgp, bgp); - if (set) { - uint32_t quanta = strtoul(num, NULL, 10); - atomic_store_explicit(&bgp->rpkt_quanta, quanta, - memory_order_relaxed); - } else { - atomic_store_explicit(&bgp->rpkt_quanta, BGP_READ_PACKET_MAX, - memory_order_relaxed); - } + quanta = set ? quanta : BGP_READ_PACKET_MAX; + atomic_store_explicit(&bgp->rpkt_quanta, quanta, memory_order_relaxed); return CMD_SUCCESS; } @@ -1636,47 +1624,32 @@ void bgp_config_write_rpkt_quanta(struct vty *vty, struct bgp *bgp) vty_out(vty, " read-quanta %d\n", quanta); } -/* Packet quanta configuration */ -DEFUN (bgp_wpkt_quanta, +/* Packet quanta configuration + * + * XXX: The value set here controls the size of a stack buffer in the IO + * thread. When changing these limits be careful to prevent stack overflow. + * + * Furthermore, the maximums used here should correspond to + * BGP_WRITE_PACKET_MAX and BGP_READ_PACKET_MAX. + */ +DEFPY (bgp_wpkt_quanta, bgp_wpkt_quanta_cmd, - "write-quanta (1-10)", + "[no] write-quanta (1-64)$quanta", + NO_STR "How many packets to write to peer socket per run\n" "Number of packets\n") { - int idx_number = 1; - return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1); + return bgp_wpkt_quanta_config_vty(vty, quanta, !no); } -DEFUN (no_bgp_wpkt_quanta, - no_bgp_wpkt_quanta_cmd, - "no write-quanta (1-10)", - NO_STR - "How many packets to write to peer socket per I/O cycle\n" - "Number of packets\n") -{ - int idx_number = 2; - return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0); -} - -DEFUN (bgp_rpkt_quanta, +DEFPY (bgp_rpkt_quanta, bgp_rpkt_quanta_cmd, - "read-quanta (1-10)", - "How many packets to read from peer socket per I/O cycle\n" - "Number of packets\n") -{ - int idx_number = 1; - return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1); -} - -DEFUN (no_bgp_rpkt_quanta, - no_bgp_rpkt_quanta_cmd, - "no read-quanta (1-10)", + "[no] read-quanta (1-10)$quanta", NO_STR "How many packets to read from peer socket per I/O cycle\n" "Number of packets\n") { - int idx_number = 2; - return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0); + return bgp_rpkt_quanta_config_vty(vty, quanta, !no); } void bgp_config_write_coalesce_time(struct vty *vty, struct bgp *bgp) @@ -13072,9 +13045,7 @@ void bgp_vty_init(void) install_element(BGP_NODE, &bgp_update_delay_establish_wait_cmd); install_element(BGP_NODE, &bgp_wpkt_quanta_cmd); - install_element(BGP_NODE, &no_bgp_wpkt_quanta_cmd); install_element(BGP_NODE, &bgp_rpkt_quanta_cmd); - install_element(BGP_NODE, &no_bgp_rpkt_quanta_cmd); install_element(BGP_NODE, &bgp_coalesce_time_cmd); install_element(BGP_NODE, &no_bgp_coalesce_time_cmd); diff --git a/doc/user/bgp.rst b/doc/user/bgp.rst index c99a5c49a3..bb60bad947 100644 --- a/doc/user/bgp.rst +++ b/doc/user/bgp.rst @@ -2167,6 +2167,8 @@ Dumping Messages and Routing Tables Other BGP Commands ------------------ +The following are available in the top level *enable* mode: + .. index:: clear bgp \* .. clicmd:: clear bgp \* @@ -2202,6 +2204,24 @@ Other BGP Commands Clear peer using soft reconfiguration in this address-family and sub-address-family. +The following are available in the ``router bgp`` mode: + +.. index:: write-quanta (1-64) +.. clicmd:: write-quanta (1-64) + + BGP message Tx I/O is vectored. This means that multiple packets are written + to the peer socket at the same time each I/O cycle, in order to minimize + system call overhead. This value controls how many are written at a time. + Under certain load conditions, reducing this value could make peer traffic + less 'bursty'. In practice, leave this settings on the default (64) unless + you truly know what you are doing. + +.. index:: read-quanta (1-10) +.. index:: read-quanta (1-10) + + Unlike Tx, BGP Rx traffic is not vectored. Packets are read off the wire one + at a time in a loop. This setting controls how many iterations the loop runs + for. As with write-quanta, it is best to leave this setting on the default. .. _bgp-displaying-bgp-information: