mirror of
https://git.proxmox.com/git/mirror_frr
synced 2025-08-07 12:44:55 +00:00
Merge pull request #5150 from qlyoung/bgp-vector-io-4
BGP vector I/O - Redux
This commit is contained in:
commit
10e75ceb6a
103
bgpd/bgp_io.c
103
bgpd/bgp_io.c
@ -22,6 +22,7 @@
|
|||||||
/* clang-format off */
|
/* clang-format off */
|
||||||
#include <zebra.h>
|
#include <zebra.h>
|
||||||
#include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
|
#include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
|
||||||
|
#include <sys/uio.h> // for writev
|
||||||
|
|
||||||
#include "frr_pthread.h"
|
#include "frr_pthread.h"
|
||||||
#include "linklist.h" // for list_delete, list_delete_all_node, lis...
|
#include "linklist.h" // for list_delete, list_delete_all_node, lis...
|
||||||
@ -275,35 +276,96 @@ static uint16_t bgp_write(struct peer *peer)
|
|||||||
{
|
{
|
||||||
uint8_t type;
|
uint8_t type;
|
||||||
struct stream *s;
|
struct stream *s;
|
||||||
int num;
|
|
||||||
int update_last_write = 0;
|
int update_last_write = 0;
|
||||||
unsigned int count = 0;
|
unsigned int count;
|
||||||
uint32_t uo = 0;
|
uint32_t uo = 0;
|
||||||
uint16_t status = 0;
|
uint16_t status = 0;
|
||||||
uint32_t wpkt_quanta_old;
|
uint32_t wpkt_quanta_old;
|
||||||
|
|
||||||
|
int writenum = 0;
|
||||||
|
int num;
|
||||||
|
unsigned int iovsz;
|
||||||
|
unsigned int strmsz;
|
||||||
|
unsigned int total_written;
|
||||||
|
|
||||||
wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
|
wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
|
||||||
memory_order_relaxed);
|
memory_order_relaxed);
|
||||||
|
struct stream *ostreams[wpkt_quanta_old];
|
||||||
|
struct stream **streams = ostreams;
|
||||||
|
struct iovec iov[wpkt_quanta_old];
|
||||||
|
|
||||||
while (count < wpkt_quanta_old && (s = stream_fifo_head(peer->obuf))) {
|
s = stream_fifo_head(peer->obuf);
|
||||||
int writenum;
|
|
||||||
do {
|
|
||||||
writenum = stream_get_endp(s) - stream_get_getp(s);
|
|
||||||
num = write(peer->fd, stream_pnt(s), writenum);
|
|
||||||
|
|
||||||
if (num < 0) {
|
if (!s)
|
||||||
if (!ERRNO_IO_RETRY(errno)) {
|
goto done;
|
||||||
BGP_EVENT_ADD(peer, TCP_fatal_error);
|
|
||||||
SET_FLAG(status, BGP_IO_FATAL_ERR);
|
|
||||||
} else {
|
|
||||||
SET_FLAG(status, BGP_IO_TRANS_ERR);
|
|
||||||
}
|
|
||||||
|
|
||||||
goto done;
|
count = iovsz = 0;
|
||||||
} else if (num != writenum)
|
while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) {
|
||||||
stream_forward_getp(s, num);
|
ostreams[iovsz] = s;
|
||||||
|
iov[iovsz].iov_base = stream_pnt(s);
|
||||||
|
iov[iovsz].iov_len = STREAM_READABLE(s);
|
||||||
|
writenum += STREAM_READABLE(s);
|
||||||
|
s = s->next;
|
||||||
|
++iovsz;
|
||||||
|
++count;
|
||||||
|
}
|
||||||
|
|
||||||
} while (num != writenum);
|
strmsz = iovsz;
|
||||||
|
total_written = 0;
|
||||||
|
|
||||||
|
do {
|
||||||
|
num = writev(peer->fd, iov, iovsz);
|
||||||
|
|
||||||
|
if (num < 0) {
|
||||||
|
if (!ERRNO_IO_RETRY(errno)) {
|
||||||
|
BGP_EVENT_ADD(peer, TCP_fatal_error);
|
||||||
|
SET_FLAG(status, BGP_IO_FATAL_ERR);
|
||||||
|
} else {
|
||||||
|
SET_FLAG(status, BGP_IO_TRANS_ERR);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
} else if (num != writenum) {
|
||||||
|
unsigned int msg_written = 0;
|
||||||
|
unsigned int ic = iovsz;
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < ic; i++) {
|
||||||
|
size_t ss = iov[i].iov_len;
|
||||||
|
|
||||||
|
if (ss > (unsigned int) num)
|
||||||
|
break;
|
||||||
|
|
||||||
|
msg_written++;
|
||||||
|
iovsz--;
|
||||||
|
writenum -= ss;
|
||||||
|
num -= ss;
|
||||||
|
}
|
||||||
|
|
||||||
|
total_written += msg_written;
|
||||||
|
|
||||||
|
assert(total_written < count);
|
||||||
|
|
||||||
|
memmove(&iov, &iov[msg_written],
|
||||||
|
sizeof(iov[0]) * iovsz);
|
||||||
|
streams = &streams[msg_written];
|
||||||
|
stream_forward_getp(streams[0], num);
|
||||||
|
iov[0].iov_base = stream_pnt(streams[0]);
|
||||||
|
iov[0].iov_len = STREAM_READABLE(streams[0]);
|
||||||
|
|
||||||
|
writenum -= num;
|
||||||
|
num = 0;
|
||||||
|
assert(writenum > 0);
|
||||||
|
} else {
|
||||||
|
total_written = strmsz;
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (num != writenum);
|
||||||
|
|
||||||
|
/* Handle statistics */
|
||||||
|
for (unsigned int i = 0; i < total_written; i++) {
|
||||||
|
s = stream_fifo_pop(peer->obuf);
|
||||||
|
|
||||||
|
assert(s == ostreams[i]);
|
||||||
|
|
||||||
/* Retrieve BGP packet type. */
|
/* Retrieve BGP packet type. */
|
||||||
stream_set_getp(s, BGP_MARKER_SIZE + 2);
|
stream_set_getp(s, BGP_MARKER_SIZE + 2);
|
||||||
@ -351,9 +413,8 @@ static uint16_t bgp_write(struct peer *peer)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
count++;
|
stream_free(s);
|
||||||
|
ostreams[i] = NULL;
|
||||||
stream_free(stream_fifo_pop(peer->obuf));
|
|
||||||
update_last_write = 1;
|
update_last_write = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@
|
|||||||
#ifndef _FRR_BGP_IO_H
|
#ifndef _FRR_BGP_IO_H
|
||||||
#define _FRR_BGP_IO_H
|
#define _FRR_BGP_IO_H
|
||||||
|
|
||||||
#define BGP_WRITE_PACKET_MAX 10U
|
#define BGP_WRITE_PACKET_MAX 64U
|
||||||
#define BGP_READ_PACKET_MAX 10U
|
#define BGP_READ_PACKET_MAX 10U
|
||||||
|
|
||||||
#include "bgpd/bgpd.h"
|
#include "bgpd/bgpd.h"
|
||||||
|
@ -1586,36 +1586,24 @@ DEFUN (no_bgp_update_delay,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int bgp_wpkt_quanta_config_vty(struct vty *vty, const char *num,
|
static int bgp_wpkt_quanta_config_vty(struct vty *vty, uint32_t quanta,
|
||||||
char set)
|
bool set)
|
||||||
{
|
{
|
||||||
VTY_DECLVAR_CONTEXT(bgp, bgp);
|
VTY_DECLVAR_CONTEXT(bgp, bgp);
|
||||||
|
|
||||||
if (set) {
|
quanta = set ? quanta : BGP_WRITE_PACKET_MAX;
|
||||||
uint32_t quanta = strtoul(num, NULL, 10);
|
atomic_store_explicit(&bgp->wpkt_quanta, quanta, memory_order_relaxed);
|
||||||
atomic_store_explicit(&bgp->wpkt_quanta, quanta,
|
|
||||||
memory_order_relaxed);
|
|
||||||
} else {
|
|
||||||
atomic_store_explicit(&bgp->wpkt_quanta, BGP_WRITE_PACKET_MAX,
|
|
||||||
memory_order_relaxed);
|
|
||||||
}
|
|
||||||
|
|
||||||
return CMD_SUCCESS;
|
return CMD_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int bgp_rpkt_quanta_config_vty(struct vty *vty, const char *num,
|
static int bgp_rpkt_quanta_config_vty(struct vty *vty, uint32_t quanta,
|
||||||
char set)
|
bool set)
|
||||||
{
|
{
|
||||||
VTY_DECLVAR_CONTEXT(bgp, bgp);
|
VTY_DECLVAR_CONTEXT(bgp, bgp);
|
||||||
|
|
||||||
if (set) {
|
quanta = set ? quanta : BGP_READ_PACKET_MAX;
|
||||||
uint32_t quanta = strtoul(num, NULL, 10);
|
atomic_store_explicit(&bgp->rpkt_quanta, quanta, memory_order_relaxed);
|
||||||
atomic_store_explicit(&bgp->rpkt_quanta, quanta,
|
|
||||||
memory_order_relaxed);
|
|
||||||
} else {
|
|
||||||
atomic_store_explicit(&bgp->rpkt_quanta, BGP_READ_PACKET_MAX,
|
|
||||||
memory_order_relaxed);
|
|
||||||
}
|
|
||||||
|
|
||||||
return CMD_SUCCESS;
|
return CMD_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -1636,47 +1624,32 @@ void bgp_config_write_rpkt_quanta(struct vty *vty, struct bgp *bgp)
|
|||||||
vty_out(vty, " read-quanta %d\n", quanta);
|
vty_out(vty, " read-quanta %d\n", quanta);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Packet quanta configuration */
|
/* Packet quanta configuration
|
||||||
DEFUN (bgp_wpkt_quanta,
|
*
|
||||||
|
* XXX: The value set here controls the size of a stack buffer in the IO
|
||||||
|
* thread. When changing these limits be careful to prevent stack overflow.
|
||||||
|
*
|
||||||
|
* Furthermore, the maximums used here should correspond to
|
||||||
|
* BGP_WRITE_PACKET_MAX and BGP_READ_PACKET_MAX.
|
||||||
|
*/
|
||||||
|
DEFPY (bgp_wpkt_quanta,
|
||||||
bgp_wpkt_quanta_cmd,
|
bgp_wpkt_quanta_cmd,
|
||||||
"write-quanta (1-10)",
|
"[no] write-quanta (1-64)$quanta",
|
||||||
|
NO_STR
|
||||||
"How many packets to write to peer socket per run\n"
|
"How many packets to write to peer socket per run\n"
|
||||||
"Number of packets\n")
|
"Number of packets\n")
|
||||||
{
|
{
|
||||||
int idx_number = 1;
|
return bgp_wpkt_quanta_config_vty(vty, quanta, !no);
|
||||||
return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFUN (no_bgp_wpkt_quanta,
|
DEFPY (bgp_rpkt_quanta,
|
||||||
no_bgp_wpkt_quanta_cmd,
|
|
||||||
"no write-quanta (1-10)",
|
|
||||||
NO_STR
|
|
||||||
"How many packets to write to peer socket per I/O cycle\n"
|
|
||||||
"Number of packets\n")
|
|
||||||
{
|
|
||||||
int idx_number = 2;
|
|
||||||
return bgp_wpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
DEFUN (bgp_rpkt_quanta,
|
|
||||||
bgp_rpkt_quanta_cmd,
|
bgp_rpkt_quanta_cmd,
|
||||||
"read-quanta (1-10)",
|
"[no] read-quanta (1-10)$quanta",
|
||||||
"How many packets to read from peer socket per I/O cycle\n"
|
|
||||||
"Number of packets\n")
|
|
||||||
{
|
|
||||||
int idx_number = 1;
|
|
||||||
return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
DEFUN (no_bgp_rpkt_quanta,
|
|
||||||
no_bgp_rpkt_quanta_cmd,
|
|
||||||
"no read-quanta (1-10)",
|
|
||||||
NO_STR
|
NO_STR
|
||||||
"How many packets to read from peer socket per I/O cycle\n"
|
"How many packets to read from peer socket per I/O cycle\n"
|
||||||
"Number of packets\n")
|
"Number of packets\n")
|
||||||
{
|
{
|
||||||
int idx_number = 2;
|
return bgp_rpkt_quanta_config_vty(vty, quanta, !no);
|
||||||
return bgp_rpkt_quanta_config_vty(vty, argv[idx_number]->arg, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void bgp_config_write_coalesce_time(struct vty *vty, struct bgp *bgp)
|
void bgp_config_write_coalesce_time(struct vty *vty, struct bgp *bgp)
|
||||||
@ -13072,9 +13045,7 @@ void bgp_vty_init(void)
|
|||||||
install_element(BGP_NODE, &bgp_update_delay_establish_wait_cmd);
|
install_element(BGP_NODE, &bgp_update_delay_establish_wait_cmd);
|
||||||
|
|
||||||
install_element(BGP_NODE, &bgp_wpkt_quanta_cmd);
|
install_element(BGP_NODE, &bgp_wpkt_quanta_cmd);
|
||||||
install_element(BGP_NODE, &no_bgp_wpkt_quanta_cmd);
|
|
||||||
install_element(BGP_NODE, &bgp_rpkt_quanta_cmd);
|
install_element(BGP_NODE, &bgp_rpkt_quanta_cmd);
|
||||||
install_element(BGP_NODE, &no_bgp_rpkt_quanta_cmd);
|
|
||||||
|
|
||||||
install_element(BGP_NODE, &bgp_coalesce_time_cmd);
|
install_element(BGP_NODE, &bgp_coalesce_time_cmd);
|
||||||
install_element(BGP_NODE, &no_bgp_coalesce_time_cmd);
|
install_element(BGP_NODE, &no_bgp_coalesce_time_cmd);
|
||||||
|
@ -2167,6 +2167,8 @@ Dumping Messages and Routing Tables
|
|||||||
Other BGP Commands
|
Other BGP Commands
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
The following are available in the top level *enable* mode:
|
||||||
|
|
||||||
.. index:: clear bgp \*
|
.. index:: clear bgp \*
|
||||||
.. clicmd:: clear bgp \*
|
.. clicmd:: clear bgp \*
|
||||||
|
|
||||||
@ -2202,6 +2204,24 @@ Other BGP Commands
|
|||||||
|
|
||||||
Clear peer using soft reconfiguration in this address-family and sub-address-family.
|
Clear peer using soft reconfiguration in this address-family and sub-address-family.
|
||||||
|
|
||||||
|
The following are available in the ``router bgp`` mode:
|
||||||
|
|
||||||
|
.. index:: write-quanta (1-64)
|
||||||
|
.. clicmd:: write-quanta (1-64)
|
||||||
|
|
||||||
|
BGP message Tx I/O is vectored. This means that multiple packets are written
|
||||||
|
to the peer socket at the same time each I/O cycle, in order to minimize
|
||||||
|
system call overhead. This value controls how many are written at a time.
|
||||||
|
Under certain load conditions, reducing this value could make peer traffic
|
||||||
|
less 'bursty'. In practice, leave this setting on the default (64) unless
|
||||||
|
you truly know what you are doing.
|
||||||
|
|
||||||
|
.. index:: read-quanta (1-10)
|
||||||
|
.. clicmd:: read-quanta (1-10)
|
||||||
|
|
||||||
|
Unlike Tx, BGP Rx traffic is not vectored. Packets are read off the wire one
|
||||||
|
at a time in a loop. This setting controls how many iterations the loop runs
|
||||||
|
for. As with write-quanta, it is best to leave this setting on the default.
|
||||||
|
|
||||||
.. _bgp-displaying-bgp-information:
|
.. _bgp-displaying-bgp-information:
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user