From 421a7dfc934f43176174ca4553b0fb2bea35b23b Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Mon, 14 Oct 2019 16:09:36 +0000 Subject: [PATCH 1/5] bgpd: move assert out of error case bgp_process_packets has an assert to make sure an appropriate amount of working space in the input buffer has been freed up for future reads. However, this assert shouldn't be made when we have encountered an error that's going to tear down the session, because in this case we may not be able to process the full contents of the input buffer. Signed-off-by: Quentin Young --- bgpd/bgp_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c index c2d06a4b5a..c8b7e5155f 100644 --- a/bgpd/bgp_io.c +++ b/bgpd/bgp_io.c @@ -242,13 +242,13 @@ static int bgp_process_reads(struct thread *thread) break; } - assert(ringbuf_space(peer->ibuf_work) >= BGP_MAX_PACKET_SIZE); - /* handle invalid header */ if (fatal) { /* wipe buffer just in case someone screwed up */ ringbuf_wipe(peer->ibuf_work); } else { + assert(ringbuf_space(peer->ibuf_work) >= BGP_MAX_PACKET_SIZE); + thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd, &peer->t_read); if (added_pkt) From 093279cd028299b771a96dea30fd8d193b2e1d80 Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Fri, 4 Oct 2019 18:52:24 +0000 Subject: [PATCH 2/5] bgpd: vector I/O Signed-off-by: Quentin Young --- bgpd/bgp_io.c | 101 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 21 deletions(-) diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c index c8b7e5155f..c4555afc52 100644 --- a/bgpd/bgp_io.c +++ b/bgpd/bgp_io.c @@ -22,6 +22,7 @@ /* clang-format off */ #include #include // for pthread_mutex_unlock, pthread_mutex_lock +#include // for writev #include "frr_pthread.h" #include "linklist.h" // for list_delete, list_delete_all_node, lis... @@ -275,35 +276,94 @@ static uint16_t bgp_write(struct peer *peer) { uint8_t type; struct stream *s; - int num; int update_last_write = 0; - unsigned int count = 0; + unsigned int count; uint32_t uo = 0; uint16_t status = 0; uint32_t wpkt_quanta_old; + int writenum = 0; + int num; + unsigned int iovsz; + unsigned int strmsz; + unsigned int total_written; + wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta, memory_order_relaxed); + struct stream *ostreams[wpkt_quanta_old]; + struct stream **streams = ostreams; + struct iovec iov[wpkt_quanta_old]; - while (count < wpkt_quanta_old && (s = stream_fifo_head(peer->obuf))) { - int writenum; - do { - writenum = stream_get_endp(s) - stream_get_getp(s); - num = write(peer->fd, stream_pnt(s), writenum); + s = stream_fifo_head(peer->obuf); - if (num < 0) { - if (!ERRNO_IO_RETRY(errno)) { - BGP_EVENT_ADD(peer, TCP_fatal_error); - SET_FLAG(status, BGP_IO_FATAL_ERR); - } else { - SET_FLAG(status, BGP_IO_TRANS_ERR); - } + if (!s) + goto done; - goto done; - } else if (num != writenum) - stream_forward_getp(s, num); + count = iovsz = 0; + while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) { + ostreams[iovsz] = s; + iov[iovsz].iov_base = stream_pnt(s); + iov[iovsz].iov_len = STREAM_READABLE(s); + writenum += STREAM_READABLE(s); + s = s->next; + ++iovsz; + ++count; + } - } while (num != writenum); + strmsz = iovsz; + total_written = 0; + + do { + num = writev(peer->fd, iov, iovsz); + + if (num < 0) { + if (!ERRNO_IO_RETRY(errno)) { + BGP_EVENT_ADD(peer, TCP_fatal_error); + SET_FLAG(status, BGP_IO_FATAL_ERR); + } else { + SET_FLAG(status, BGP_IO_TRANS_ERR); + } + + break; + } else if (num != writenum) { + unsigned int msg_written = 0; + unsigned int ic = iovsz; + + for (unsigned int i = 0; i < ic; i++) { + size_t ss = iov[i].iov_len; + + if (ss > (unsigned int) num) + break; + + msg_written++; + iovsz--; + writenum -= ss; + num -= ss; + } + + total_written += msg_written; + + memmove(&iov, &iov[msg_written], + sizeof(iov[0]) * iovsz); + streams = &streams[msg_written]; + stream_forward_getp(streams[0], num); + iov[0].iov_base = stream_pnt(streams[0]); + iov[0].iov_len = STREAM_READABLE(streams[0]); + + writenum -= num; + num = 0; + assert(writenum > 0); + } else { + total_written = strmsz; + } + + } while (num != writenum); + + /* Handle statistics */ + for (unsigned int i = 0; i < total_written; i++) { + s = stream_fifo_pop(peer->obuf); + + assert(s == ostreams[i]); /* Retrieve BGP packet type. */ stream_set_getp(s, BGP_MARKER_SIZE + 2); @@ -351,9 +411,8 @@ static uint16_t bgp_write(struct peer *peer) break; } - count++; - - stream_free(stream_fifo_pop(peer->obuf)); + stream_free(s); + ostreams[i] = NULL; update_last_write = 1; } From 8fa7732f5da38eaaaf49b442b67030c5483cee0d Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Mon, 7 Oct 2019 17:33:39 +0000 Subject: [PATCH 3/5] bgpd: raise default & max r/w quanta to 64 Vectored writes are more efficient with a higher quantum. Signed-off-by: Quentin Young --- bgpd/bgp_io.h | 2 +- bgpd/bgp_vty.c | 75 ++++++++++++++++---------------------------------- 2 files changed, 24 insertions(+), 53 deletions(-) diff --git a/bgpd/bgp_io.h b/bgpd/bgp_io.h index 14a12d3705..75d014f38e 100644 --- a/bgpd/bgp_io.h +++ b/bgpd/bgp_io.h @@ -22,7 +22,7 @@ #ifndef _FRR_BGP_IO_H #define _FRR_BGP_IO_H -#define BGP_WRITE_PACKET_MAX 10U +#define BGP_WRITE_PACKET_MAX 64U #define BGP_READ_PACKET_MAX 10U #include "bgpd/bgpd.h" diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c index dd880768b8..eae9db5a61 100644 --- a/bgpd/bgp_vty.c +++ b/bgpd/bgp_vty.c @@ -1586,36 +1586,24 @@ DEFUN (no_bgp_update_delay, } -static int bgp_wpkt_quanta_config_vty(struct vty *vty, const char *num, - char set) +static int bgp_wpkt_quanta_config_vty(struct vty *vty, uint32_t quanta, + bool set) { VTY_DECLVAR_CONTEXT(bgp, bgp); - if (set) { - uint32_t quanta = strtoul(num, NULL, 10); - atomic_store_explicit(&bgp->wpkt_quanta, quanta, - memory_order_relaxed); - } else { - atomic_store_explicit(&bgp->wpkt_quanta, BGP_WRITE_PACKET_MAX, - memory_order_relaxed); - } + quanta = set ? quanta : BGP_WRITE_PACKET_MAX; + atomic_store_explicit(&bgp->wpkt_quanta, quanta, memory_order_relaxed); return CMD_SUCCESS; } -static int bgp_rpkt_quanta_config_vty(struct vty *vty, const char *num, - char set) +static int bgp_rpkt_quanta_config_vty(struct vty *vty, uint32_t quanta, + bool set) { VTY_DECLVAR_CONTEXT(bgp, bgp); - if (set) { - uint32_t quanta = strtoul(num, NULL, 10); - atomic_store_explicit(&bgp->rpkt_quanta, quanta, - memory_order_relaxed); - } else { - atomic_store_explicit(&bgp->rpkt_quanta, BGP_READ_PACKET_MAX, - memory_order_relaxed); - } + quanta = set ? quanta : BGP_READ_PACKET_MAX; + atomic_store_explicit(&bgp->rpkt_quanta, quanta, memory_order_relaxed); return CMD_SUCCESS; } @@ -1636,47 +1624,32 @@ void bgp_config_write_rpkt_quanta(struct vty *vty, struct bgp *bgp) vty_out(vty, " read-quanta %d\n", quanta); } -/* Packet quanta configuration */ -DEFUN (bgp_wpkt_quanta, +/* Packet quanta configuration + * + * XXX: The value set here controls the size of a stack buffer in the IO + * thread. When changing these limits be careful to prevent stack overflow. + * + * Furthermore, the maximums used here should correspond to + * BGP_WRITE_PACKET_MAX and BGP_READ_PACKET_MAX. + */ +DEFPY (bgp_wpkt_quanta, bgp_wpkt_quanta_cmd, - "write-quanta (1-10)", + "[no] write-quanta (1-64)$quanta", + NO_STR "How many packets to write to peer socket per run\n" "Number of packets\n") { - int idx_number = 1; - return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1); + return bgp_wpkt_quanta_config_vty(vty, quanta, !no); } -DEFUN (no_bgp_wpkt_quanta, - no_bgp_wpkt_quanta_cmd, - "no write-quanta (1-10)", - NO_STR - "How many packets to write to peer socket per I/O cycle\n" - "Number of packets\n") -{ - int idx_number = 2; - return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0); -} - -DEFUN (bgp_rpkt_quanta, +DEFPY (bgp_rpkt_quanta, bgp_rpkt_quanta_cmd, - "read-quanta (1-10)", - "How many packets to read from peer socket per I/O cycle\n" - "Number of packets\n") -{ - int idx_number = 1; - return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1); -} - -DEFUN (no_bgp_rpkt_quanta, - no_bgp_rpkt_quanta_cmd, - "no read-quanta (1-10)", + "[no] read-quanta (1-10)$quanta", NO_STR "How many packets to read from peer socket per I/O cycle\n" "Number of packets\n") { - int idx_number = 2; - return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0); + return bgp_rpkt_quanta_config_vty(vty, quanta, !no); } void bgp_config_write_coalesce_time(struct vty *vty, struct bgp *bgp) @@ -13072,9 +13045,7 @@ void bgp_vty_init(void) install_element(BGP_NODE, &bgp_update_delay_establish_wait_cmd); install_element(BGP_NODE, &bgp_wpkt_quanta_cmd); - install_element(BGP_NODE, &no_bgp_wpkt_quanta_cmd); install_element(BGP_NODE, &bgp_rpkt_quanta_cmd); - install_element(BGP_NODE, &no_bgp_rpkt_quanta_cmd); install_element(BGP_NODE, &bgp_coalesce_time_cmd); install_element(BGP_NODE, &no_bgp_coalesce_time_cmd); From 185553660fb5f24b0a30e746a484a0d958cbe37f Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Tue, 15 Oct 2019 18:25:02 +0000 Subject: [PATCH 4/5] bgpd: speak soothing words to scanbuild Signed-off-by: Quentin Young --- bgpd/bgp_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c index c4555afc52..9e1c89b71c 100644 --- a/bgpd/bgp_io.c +++ b/bgpd/bgp_io.c @@ -343,6 +343,8 @@ static uint16_t bgp_write(struct peer *peer) total_written += msg_written; + assert(total_written < count); + memmove(&iov, &iov[msg_written], sizeof(iov[0]) * iovsz); streams = &streams[msg_written]; From e312b6c6c78ccec15a10d1be9fde49eb938cd693 Mon Sep 17 00:00:00 2001 From: Quentin Young Date: Wed, 16 Oct 2019 14:41:54 +0000 Subject: [PATCH 5/5] doc: add documentation for write- and read-quanta Signed-off-by: Quentin Young --- doc/user/bgp.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/user/bgp.rst b/doc/user/bgp.rst index c99a5c49a3..bb60bad947 100644 --- a/doc/user/bgp.rst +++ b/doc/user/bgp.rst @@ -2167,6 +2167,8 @@ Dumping Messages and Routing Tables Other BGP Commands ------------------ +The following are available in the top level *enable* mode: + .. index:: clear bgp \* .. clicmd:: clear bgp \* @@ -2202,6 +2204,24 @@ Other BGP Commands Clear peer using soft reconfiguration in this address-family and sub-address-family. +The following are available in the ``router bgp`` mode: + +.. index:: write-quanta (1-64) +.. clicmd:: write-quanta (1-64) + + BGP message Tx I/O is vectored. This means that multiple packets are written + to the peer socket at the same time each I/O cycle, in order to minimize + system call overhead. This value controls how many are written at a time. + Under certain load conditions, reducing this value could make peer traffic + less 'bursty'. In practice, leave this settings on the default (64) unless + you truly know what you are doing. + +.. index:: read-quanta (1-10) +.. index:: read-quanta (1-10) + + Unlike Tx, BGP Rx traffic is not vectored. Packets are read off the wire one + at a time in a loop. This setting controls how many iterations the loop runs + for. As with write-quanta, it is best to leave this setting on the default. .. _bgp-displaying-bgp-information: