linux-loongson/include/uapi/linux/if_xdp.h
Jason Xing 45e359be1c net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt
This patch provides a setsockopt method to let applications leverage to
adjust how many descs to be handled at most in one send syscall. It
mitigates the situation where the default value (32) that is too small
leads to higher frequency of triggering send syscall.

Considering the prosperity/complexity the applications have, there is no
absolutely ideal suggestion fitting all cases. So keep 32 as its default
value like before.

The patch does the following things:
- Add XDP_MAX_TX_SKB_BUDGET socket option.
- Set max_tx_budget to 32 by default in the initialization phase as a
  per-socket granular control.
- Set the range of max_tx_budget as [32, xs->tx->nentries].

The idea behind this comes out of real workloads in production. We use a
user-level stack with xsk support to accelerate sending packets and
minimize triggering syscalls. When the packets are aggregated, it's not
hard to hit the upper bound (namely, 32). The moment user-space stack
fetches the -EAGAIN error number passed from sendto(), it will loop to try
again until all the expected descs from tx ring are sent out to the driver.
Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to less frequency of
sendto() and higher throughput/PPS.

Here is what I did in production, along with some numbers as follows:
For one application I saw lately, I suggested using 128 as max_tx_budget
because I saw two limitations without changing any default configuration:
1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by
net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind
this was I counted how many descs are transmitted to the driver at one
time of sendto() based on [1] patch and then I calculated the
possibility of hitting the upper bound. Finally I chose 128 as a
suitable value because 1) it covers most of the cases, 2) a higher
number would not bring evident results. After twisting the parameters,
a stable improvement of around 4% for both PPS and throughput and less
resources consumption were found to be observed by strace -c -p xxx:
1) %time was decreased by 7.8%
2) error counter was decreased from 18367 to 572

[1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-07-10 14:48:29 +02:00

185 lines
5.3 KiB
C

/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* if_xdp: XDP socket user-space interface
* Copyright(c) 2018 Intel Corporation.
*
* Author(s): Björn Töpel <bjorn.topel@intel.com>
* Magnus Karlsson <magnus.karlsson@intel.com>
*/
#ifndef _UAPI_LINUX_IF_XDP_H
#define _UAPI_LINUX_IF_XDP_H
#include <linux/types.h>
/* Options for the sxdp_flags field */
#define XDP_SHARED_UMEM (1 << 0)
#define XDP_COPY (1 << 1) /* Force copy-mode */
#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */
/* If this option is set, the driver might go sleep and in that case
* the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
* set. If it is set, the application need to explicitly wake up the
* driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
* running the driver and the application on the same core, you should
* use this option so that the kernel will yield to the user space
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
/* By setting this option, userspace application indicates that it can
* handle multiple descriptors per packet thus enabling AF_XDP to split
* multi-buffer XDP frames into multiple Rx descriptors. Without this set
* such frames will be dropped.
*/
#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
/* Force checksum calculation in software. Can be used for testing or
* working around potential HW issues. This option causes performance
* degradation and only works in XDP_COPY mode.
*/
#define XDP_UMEM_TX_SW_CSUM (1 << 1)
/* Request to reserve tx_metadata_len bytes of per-chunk metadata.
*/
#define XDP_UMEM_TX_METADATA_LEN (1 << 2)
struct sockaddr_xdp {
__u16 sxdp_family;
__u16 sxdp_flags;
__u32 sxdp_ifindex;
__u32 sxdp_queue_id;
__u32 sxdp_shared_umem_fd;
};
/* XDP_RING flags */
#define XDP_RING_NEED_WAKEUP (1 << 0)
struct xdp_ring_offset {
__u64 producer;
__u64 consumer;
__u64 desc;
__u64 flags;
};
struct xdp_mmap_offsets {
struct xdp_ring_offset rx;
struct xdp_ring_offset tx;
struct xdp_ring_offset fr; /* Fill */
struct xdp_ring_offset cr; /* Completion */
};
/* XDP socket options */
#define XDP_MMAP_OFFSETS 1
#define XDP_RX_RING 2
#define XDP_TX_RING 3
#define XDP_UMEM_REG 4
#define XDP_UMEM_FILL_RING 5
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8
#define XDP_MAX_TX_SKB_BUDGET 9
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
__u64 len; /* Length of packet data area */
__u32 chunk_size;
__u32 headroom;
__u32 flags;
__u32 tx_metadata_len;
};
struct xdp_statistics {
__u64 rx_dropped; /* Dropped for other reasons */
__u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
__u64 rx_ring_full; /* Dropped due to rx ring being full */
__u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */
__u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */
};
struct xdp_options {
__u32 flags;
};
/* Flags for the flags field of struct xdp_options */
#define XDP_OPTIONS_ZEROCOPY (1 << 0)
/* Pgoff for mmaping the rings */
#define XDP_PGOFF_RX_RING 0
#define XDP_PGOFF_TX_RING 0x80000000
#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL
#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
/* Masks for unaligned chunks mode */
#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
/* Request transmit timestamp. Upon completion, put it into tx_timestamp
* field of struct xsk_tx_metadata.
*/
#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)
/* Request transmit checksum offload. Checksum start position and offset
* are communicated via csum_start and csum_offset fields of struct
* xsk_tx_metadata.
*/
#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
/* Request launch time hardware offload. The device will schedule the packet for
* transmission at a pre-determined time called launch time. The value of
* launch time is communicated via launch_time field of struct xsk_tx_metadata.
*/
#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2)
/* AF_XDP offloads request. 'request' union member is consumed by the driver
* when the packet is being transmitted. 'completion' union member is
* filled by the driver when the transmit completion arrives.
*/
struct xsk_tx_metadata {
__u64 flags;
union {
struct {
/* XDP_TXMD_FLAGS_CHECKSUM */
/* Offset from desc->addr where checksumming should start. */
__u16 csum_start;
/* Offset from csum_start where checksum should be stored. */
__u16 csum_offset;
/* XDP_TXMD_FLAGS_LAUNCH_TIME */
/* Launch time in nanosecond against the PTP HW Clock */
__u64 launch_time;
} request;
struct {
/* XDP_TXMD_FLAGS_TIMESTAMP */
__u64 tx_timestamp;
} completion;
};
};
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
__u32 len;
__u32 options;
};
/* UMEM descriptor is __u64 */
/* Flag indicating that the packet continues with the buffer pointed out by the
* next frame in the ring. The end of the packet is signalled by setting this
* bit to zero. For single buffer packets, every descriptor has 'options' set
* to 0 and this maintains backward compatibility.
*/
#define XDP_PKT_CONTD (1 << 0)
/* TX packet carries valid metadata. */
#define XDP_TX_METADATA (1 << 1)
#endif /* _UAPI_LINUX_IF_XDP_H */