Merge pull request #5150 from qlyoung/bgp-vector-io-4

BGP vector I/O - Redux
Mark Stapp 2019-10-17 10:57:47 -04:00 committed by GitHub
commit 10e75ceb6a
4 changed files with 126 additions and 74 deletions

bgpd/bgp_io.c

@@ -22,6 +22,7 @@
 /* clang-format off */
 #include <zebra.h>
 #include <pthread.h>	// for pthread_mutex_unlock, pthread_mutex_lock
+#include <sys/uio.h>	// for writev
 #include "frr_pthread.h"
 #include "linklist.h"	// for list_delete, list_delete_all_node, lis...
@@ -275,35 +276,96 @@ static uint16_t bgp_write(struct peer *peer)
 {
 	uint8_t type;
 	struct stream *s;
-	int num;
 	int update_last_write = 0;
-	unsigned int count = 0;
+	unsigned int count;
 	uint32_t uo = 0;
 	uint16_t status = 0;
 	uint32_t wpkt_quanta_old;
+	int writenum = 0;
+	int num;
+	unsigned int iovsz;
+	unsigned int strmsz;
+	unsigned int total_written;
 
 	wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
 					       memory_order_relaxed);
+	struct stream *ostreams[wpkt_quanta_old];
+	struct stream **streams = ostreams;
+	struct iovec iov[wpkt_quanta_old];
 
-	while (count < wpkt_quanta_old && (s = stream_fifo_head(peer->obuf))) {
-		int writenum;
-		do {
-			writenum = stream_get_endp(s) - stream_get_getp(s);
-			num = write(peer->fd, stream_pnt(s), writenum);
+	s = stream_fifo_head(peer->obuf);
 
-			if (num < 0) {
-				if (!ERRNO_IO_RETRY(errno)) {
-					BGP_EVENT_ADD(peer, TCP_fatal_error);
-					SET_FLAG(status, BGP_IO_FATAL_ERR);
-				} else {
-					SET_FLAG(status, BGP_IO_TRANS_ERR);
-				}
+	if (!s)
+		goto done;
 
-				goto done;
-			} else if (num != writenum)
-				stream_forward_getp(s, num);
-		} while (num != writenum);
+	count = iovsz = 0;
+	while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) {
+		ostreams[iovsz] = s;
+		iov[iovsz].iov_base = stream_pnt(s);
+		iov[iovsz].iov_len = STREAM_READABLE(s);
+		writenum += STREAM_READABLE(s);
+		s = s->next;
+		++iovsz;
+		++count;
+	}
+
+	strmsz = iovsz;
+	total_written = 0;
+
+	do {
+		num = writev(peer->fd, iov, iovsz);
+
+		if (num < 0) {
+			if (!ERRNO_IO_RETRY(errno)) {
+				BGP_EVENT_ADD(peer, TCP_fatal_error);
+				SET_FLAG(status, BGP_IO_FATAL_ERR);
+			} else {
+				SET_FLAG(status, BGP_IO_TRANS_ERR);
+			}
+
+			break;
+		} else if (num != writenum) {
+			unsigned int msg_written = 0;
+			unsigned int ic = iovsz;
+
+			for (unsigned int i = 0; i < ic; i++) {
+				size_t ss = iov[i].iov_len;
+
+				if (ss > (unsigned int)num)
+					break;
+
+				msg_written++;
+				iovsz--;
+				writenum -= ss;
+				num -= ss;
+			}
+
+			total_written += msg_written;
+
+			assert(total_written < count);
+
+			memmove(&iov, &iov[msg_written],
+				sizeof(iov[0]) * iovsz);
+			streams = &streams[msg_written];
+			stream_forward_getp(streams[0], num);
+			iov[0].iov_base = stream_pnt(streams[0]);
+			iov[0].iov_len = STREAM_READABLE(streams[0]);
+
+			writenum -= num;
+			num = 0;
+			assert(writenum > 0);
+		} else {
+			total_written = strmsz;
+		}
+	} while (num != writenum);
+
+	/* Handle statistics */
+	for (unsigned int i = 0; i < total_written; i++) {
+		s = stream_fifo_pop(peer->obuf);
+
+		assert(s == ostreams[i]);
 
 		/* Retrieve BGP packet type. */
 		stream_set_getp(s, BGP_MARKER_SIZE + 2);
@@ -351,9 +413,8 @@ static uint16_t bgp_write(struct peer *peer)
 			break;
 		}
 
-		count++;
-		stream_free(stream_fifo_pop(peer->obuf));
+		stream_free(s);
+		ostreams[i] = NULL;
 		update_last_write = 1;
 	}
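The heart of this change is the bookkeeping after a short writev(): entries that were sent in full are counted off the front of the iovec array, and the first partially sent entry is advanced past the bytes already on the wire. Below is a standalone sketch of that technique against plain memory buffers; it is illustrative only, and advance_iov plus the demo buffers are hypothetical names, not part of the FRR sources.

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/*
 * Hypothetical helper: given that writev() reported 'written' bytes,
 * drop fully-written entries from the front of 'iov' and trim the
 * first partially-written entry. Returns how many entries remain;
 * '*done' receives the count of entries sent in full.
 */
static unsigned int advance_iov(struct iovec *iov, unsigned int iovsz,
				size_t written, unsigned int *done)
{
	unsigned int msg_written = 0;

	/* Count off the entries that writev() consumed completely. */
	while (msg_written < iovsz && iov[msg_written].iov_len <= written) {
		written -= iov[msg_written].iov_len;
		msg_written++;
	}

	/* Shift the unfinished entries to the front of the array. */
	memmove(&iov[0], &iov[msg_written],
		sizeof(iov[0]) * (iovsz - msg_written));
	iovsz -= msg_written;

	/* Advance the leading entry past the bytes already on the wire. */
	if (iovsz > 0 && written > 0) {
		iov[0].iov_base = (char *)iov[0].iov_base + written;
		iov[0].iov_len -= written;
	}

	*done = msg_written;
	return iovsz;
}

int main(void)
{
	char a[] = "aaaa", b[] = "bbbb", c[] = "cccc";
	struct iovec iov[3] = { {a, 4}, {b, 4}, {c, 4} };
	unsigned int done;

	/* Pretend writev() returned 6: all of 'a' plus half of 'b'. */
	unsigned int left = advance_iov(iov, 3, 6, &done);

	assert(done == 1 && left == 2);
	printf("fully written: %u, next chunk: %.*s\n", done,
	       (int)iov[0].iov_len, (char *)iov[0].iov_base);
	return 0;
}

With this scheme each batch of packets costs one syscall rather than one per message, which is where the system-call savings described in the PR come from.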

bgpd/bgp_io.h

@@ -22,7 +22,7 @@
 #ifndef _FRR_BGP_IO_H
 #define _FRR_BGP_IO_H
 
-#define BGP_WRITE_PACKET_MAX 10U
+#define BGP_WRITE_PACKET_MAX 64U
 #define BGP_READ_PACKET_MAX 10U
 
 #include "bgpd/bgpd.h"

bgpd/bgp_vty.c

@@ -1586,36 +1586,24 @@ DEFUN (no_bgp_update_delay,
 }
 
-static int bgp_wpkt_quanta_config_vty(struct vty *vty, const char *num,
-				      char set)
+static int bgp_wpkt_quanta_config_vty(struct vty *vty, uint32_t quanta,
+				      bool set)
 {
 	VTY_DECLVAR_CONTEXT(bgp, bgp);
 
-	if (set) {
-		uint32_t quanta = strtoul(num, NULL, 10);
-		atomic_store_explicit(&bgp->wpkt_quanta, quanta,
-				      memory_order_relaxed);
-	} else {
-		atomic_store_explicit(&bgp->wpkt_quanta, BGP_WRITE_PACKET_MAX,
-				      memory_order_relaxed);
-	}
+	quanta = set ? quanta : BGP_WRITE_PACKET_MAX;
+	atomic_store_explicit(&bgp->wpkt_quanta, quanta, memory_order_relaxed);
 
 	return CMD_SUCCESS;
 }
 
-static int bgp_rpkt_quanta_config_vty(struct vty *vty, const char *num,
-				      char set)
+static int bgp_rpkt_quanta_config_vty(struct vty *vty, uint32_t quanta,
+				      bool set)
 {
 	VTY_DECLVAR_CONTEXT(bgp, bgp);
 
-	if (set) {
-		uint32_t quanta = strtoul(num, NULL, 10);
-		atomic_store_explicit(&bgp->rpkt_quanta, quanta,
-				      memory_order_relaxed);
-	} else {
-		atomic_store_explicit(&bgp->rpkt_quanta, BGP_READ_PACKET_MAX,
-				      memory_order_relaxed);
-	}
+	quanta = set ? quanta : BGP_READ_PACKET_MAX;
+	atomic_store_explicit(&bgp->rpkt_quanta, quanta, memory_order_relaxed);
 
 	return CMD_SUCCESS;
 }
@@ -1636,47 +1624,32 @@ void bgp_config_write_rpkt_quanta(struct vty *vty, struct bgp *bgp)
 	vty_out(vty, " read-quanta %d\n", quanta);
 }
 
-/* Packet quanta configuration */
-DEFUN (bgp_wpkt_quanta,
+/* Packet quanta configuration
+ *
+ * XXX: The value set here controls the size of a stack buffer in the IO
+ * thread. When changing these limits be careful to prevent stack overflow.
+ *
+ * Furthermore, the maximums used here should correspond to
+ * BGP_WRITE_PACKET_MAX and BGP_READ_PACKET_MAX.
+ */
+DEFPY (bgp_wpkt_quanta,
        bgp_wpkt_quanta_cmd,
-       "write-quanta (1-10)",
+       "[no] write-quanta (1-64)$quanta",
+       NO_STR
        "How many packets to write to peer socket per run\n"
        "Number of packets\n")
 {
-	int idx_number = 1;
-	return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1);
+	return bgp_wpkt_quanta_config_vty(vty, quanta, !no);
 }
 
-DEFUN (no_bgp_wpkt_quanta,
-       no_bgp_wpkt_quanta_cmd,
-       "no write-quanta (1-10)",
-       NO_STR
-       "How many packets to write to peer socket per I/O cycle\n"
-       "Number of packets\n")
-{
-	int idx_number = 2;
-	return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0);
-}
-
-DEFUN (bgp_rpkt_quanta,
+DEFPY (bgp_rpkt_quanta,
        bgp_rpkt_quanta_cmd,
-       "read-quanta (1-10)",
+       "[no] read-quanta (1-10)$quanta",
+       NO_STR
        "How many packets to read from peer socket per I/O cycle\n"
        "Number of packets\n")
 {
-	int idx_number = 1;
-	return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1);
-}
-
-DEFUN (no_bgp_rpkt_quanta,
-       no_bgp_rpkt_quanta_cmd,
-       "no read-quanta (1-10)",
-       NO_STR
-       "How many packets to read from peer socket per I/O cycle\n"
-       "Number of packets\n")
-{
-	int idx_number = 2;
-	return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0);
+	return bgp_rpkt_quanta_config_vty(vty, quanta, !no);
 }
 
 void bgp_config_write_coalesce_time(struct vty *vty, struct bgp *bgp)
@@ -13072,9 +13045,7 @@ void bgp_vty_init(void)
 	install_element(BGP_NODE, &bgp_update_delay_establish_wait_cmd);
 
 	install_element(BGP_NODE, &bgp_wpkt_quanta_cmd);
-	install_element(BGP_NODE, &no_bgp_wpkt_quanta_cmd);
 	install_element(BGP_NODE, &bgp_rpkt_quanta_cmd);
-	install_element(BGP_NODE, &no_bgp_rpkt_quanta_cmd);
 
 	install_element(BGP_NODE, &bgp_coalesce_time_cmd);
 	install_element(BGP_NODE, &no_bgp_coalesce_time_cmd);
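Note the pattern in both config helpers above: bgp->wpkt_quanta and bgp->rpkt_quanta are single words shared with the I/O thread, so the vty side publishes them with a relaxed atomic store rather than taking a lock. A minimal C11 sketch of the same idea (names here are illustrative, not FRR's):

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Shared tunable: written by the CLI thread, read by the I/O thread. */
static _Atomic uint32_t wpkt_quanta = 64;

/* CLI side: publish a new value; a single-word store needs no lock. */
static void quanta_set(uint32_t quanta)
{
	atomic_store_explicit(&wpkt_quanta, quanta, memory_order_relaxed);
}

/* I/O side: sample the tunable once at the start of each write pass. */
static uint32_t quanta_get(void)
{
	return atomic_load_explicit(&wpkt_quanta, memory_order_relaxed);
}

int main(void)
{
	quanta_set(32);
	printf("quanta now %" PRIu32 "\n", quanta_get());
	return 0;
}

Relaxed ordering suffices here because no other data is published along with the value; a write pass that observes a stale quanta simply batches at the old size one more time.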

doc/user/bgp.rst

@@ -2167,6 +2167,8 @@ Dumping Messages and Routing Tables
 Other BGP Commands
 ------------------
 
+The following are available in the top level *enable* mode:
+
 .. index:: clear bgp \*
 .. clicmd:: clear bgp \*
@@ -2202,6 +2204,24 @@ Other BGP Commands
    Clear peer using soft reconfiguration in this address-family and sub-address-family.
 
+The following are available in the ``router bgp`` mode:
+
+.. index:: write-quanta (1-64)
+.. clicmd:: write-quanta (1-64)
+
+   BGP message Tx I/O is vectored. This means that multiple packets are
+   written to the peer socket at the same time each I/O cycle, in order to
+   minimize system call overhead. This value controls how many are written
+   at a time. Under certain load conditions, reducing this value could make
+   peer traffic less 'bursty'. In practice, leave this setting at the
+   default (64) unless you truly know what you are doing.
+
+.. index:: read-quanta (1-10)
+.. clicmd:: read-quanta (1-10)
+
+   Unlike Tx, BGP Rx traffic is not vectored. Packets are read off the wire
+   one at a time in a loop. This setting controls how many iterations the
+   loop runs for. As with write-quanta, it is best to leave this setting at
+   the default.
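As a usage sketch (the ASN and counts are hypothetical; with the DEFPY grammar above, the no form still takes a count, which is ignored while the default is restored):

    router bgp 65001
     write-quanta 32
     read-quanta 5
    !
    ! restore the defaults (64 / 10)
    router bgp 65001
     no write-quanta 32
     no read-quanta 5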
 .. _bgp-displaying-bgp-information: