
Require that iter->batch always contains a full bucket snapshot. This
invariant is important to avoid skipping or repeating sockets during
iteration when combined with the next few patches. Before, there were
two cases where a call to bpf_iter_udp_batch may only capture part of a
bucket:

1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].

2. When more sockets are added to the bucket while calling
   bpf_iter_udp_realloc_batch(), making the updated batch size
   insufficient [2].

In cases where the batch size only covers part of a bucket, it is
possible to forget which sockets were already visited, especially if we
have to process a bucket in more than two batches. This forces us to
choose between repeating or skipping sockets, so don't allow this:

1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
   fails instead of continuing with a partial batch.

2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but
   if we still aren't able to capture the full bucket, call
   bpf_iter_udp_realloc_batch() again while holding the bucket lock to
   guarantee the bucket does not change. On the second attempt use
   GFP_NOWAIT, since we hold onto the spin lock.

Introduce the udp_portaddr_for_each_entry_from macro and use it instead
of udp_portaddr_for_each_entry to make it possible to continue
iteration from an arbitrary socket. This is required both for this
patch, where the GFP_NOWAIT retry fills the rest of a batch starting
from the middle of a bucket, and for the later patch that skips sockets
that were already seen.

Testing all scenarios directly is a bit difficult, but I did some
manual testing to exercise the code paths where GFP_NOWAIT is used and
where ERR_PTR(err) is returned. I used the realloc test case included
later in this series to trigger a scenario where a realloc happens
inside bpf_iter_udp_batch and made a small code tweak to force the
first realloc attempt to allocate a too-small batch, thus requiring
another attempt with GFP_NOWAIT. Some printks showed both reallocs with
the tests passing:

Apr 25 23:16:24 crow kernel: go again GFP_USER
Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT

With this setup, I also forced each of the bpf_iter_udp_realloc_batch
calls to return -ENOMEM to ensure that iteration ends and that the
read() in userspace fails.

[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/

Signed-off-by: Jordan Rife <jordan@jrife.io>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
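For illustration, the sketch below shows the property the new macro
provides. It assumes a kernel context; fill_batch_from() and its
signature are hypothetical, made up for this example, and are not the
in-tree bpf_iter_udp_batch(). udp_portaddr_for_each_entry_from() begins
at, and includes, the socket it is given, so a refill pass can pick up
from the middle of a bucket rather than restarting at the list head.

/*
 * Hypothetical helper, not the in-tree bpf_iter_udp_batch(): collect
 * 'start_sk' and the sockets after it in the same bucket, stopping
 * once the batch array is full. The _from variant starts at (and
 * includes) the given socket instead of the bucket's list head.
 */
static unsigned int fill_batch_from(struct sock *start_sk,
                                    struct sock **batch, unsigned int max)
{
        struct sock *sk = start_sk;
        unsigned int n = 0;

        udp_portaddr_for_each_entry_from(sk) {
                if (n == max)
                        break;  /* batch full; caller grows it and resumes */
                batch[n++] = sk;
        }
        return n;
}

The plain udp_portaddr_for_each_entry(), by contrast, can only walk a
bucket from its head, which is why a partial batch previously forced a
choice between repeating or skipping sockets.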
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system. INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Definitions for the UDP protocol.
 *
 * Version:     @(#)udp.h       1.0.2   04/28/93
 *
 * Author:      Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_UDP_H
#define _LINUX_UDP_H

#include <net/inet_sock.h>
#include <linux/skbuff.h>
#include <net/netns/hash.h>
#include <uapi/linux/udp.h>

static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
{
        return (struct udphdr *)skb_transport_header(skb);
}

#define UDP_HTABLE_SIZE_MIN_PERNET      128
#define UDP_HTABLE_SIZE_MIN             (IS_ENABLED(CONFIG_BASE_SMALL) ? 128 : 256)
#define UDP_HTABLE_SIZE_MAX             65536

static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
{
        return (num + net_hash_mix(net)) & mask;
}

enum {
        UDP_FLAGS_CORK,            /* Cork is required */
        UDP_FLAGS_NO_CHECK6_TX,    /* Send zero UDP6 checksums on TX? */
        UDP_FLAGS_NO_CHECK6_RX,    /* Allow zero UDP6 checksums on RX? */
        UDP_FLAGS_GRO_ENABLED,     /* Request GRO aggregation */
        UDP_FLAGS_ACCEPT_FRAGLIST,
        UDP_FLAGS_ACCEPT_L4,
        UDP_FLAGS_ENCAP_ENABLED,   /* This socket enabled encap */
        UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */
        UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
};

struct udp_sock {
        /* inet_sock has to be the first member */
        struct inet_sock inet;
#define udp_port_hash           inet.sk.__sk_common.skc_u16hashes[0]
#define udp_portaddr_hash       inet.sk.__sk_common.skc_u16hashes[1]
#define udp_portaddr_node       inet.sk.__sk_common.skc_portaddr_node

        unsigned long    udp_flags;

        int              pending;       /* Any pending frames ? */
        __u8             encap_type;    /* Is this an Encapsulation socket? */

#if !IS_ENABLED(CONFIG_BASE_SMALL)
        /* For UDP 4-tuple hash */
        __u16 udp_lrpa_hash;
        struct hlist_nulls_node udp_lrpa_node;
#endif

        /*
         * Following member retains the information to create a UDP header
         * when the socket is uncorked.
         */
        __u16            len;           /* total length of pending frames */
        __u16            gso_size;
        /*
         * Fields specific to UDP-Lite.
         */
        __u16            pcslen;
        __u16            pcrlen;
        /*
         * For encapsulation sockets.
         */
        int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
        void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err,
                              __be16 port, u32 info, u8 *payload);
        int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
        void (*encap_destroy)(struct sock *sk);

        /* GRO functions for UDP socket */
        struct sk_buff *        (*gro_receive)(struct sock *sk,
                                               struct list_head *head,
                                               struct sk_buff *skb);
        int                     (*gro_complete)(struct sock *sk,
                                                struct sk_buff *skb,
                                                int nhoff);

        /* udp_recvmsg try to use this before splicing sk_receive_queue */
        struct sk_buff_head     reader_queue ____cacheline_aligned_in_smp;

        /* This field is dirtied by udp_recvmsg() */
        int             forward_deficit;

        /* This fields follows rcvbuf value, and is touched by udp_recvmsg */
        int             forward_threshold;

        /* Cache friendly copy of sk->sk_peek_off >= 0 */
        bool            peeking_with_offset;

        /*
         * Accounting for the tunnel GRO fastpath.
         * Unprotected by compilers guard, as it uses space available in
         * the last UDP socket cacheline.
         */
        struct hlist_node       tunnel_list;
};

#define udp_test_bit(nr, sk)                    \
        test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
#define udp_set_bit(nr, sk)                     \
        set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
#define udp_test_and_set_bit(nr, sk)            \
        test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
#define udp_clear_bit(nr, sk)                   \
        clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
#define udp_assign_bit(nr, sk, val)             \
        assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val)

#define UDP_MAX_SEGMENTS        (1 << 7UL)

#define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)

static inline int udp_set_peek_off(struct sock *sk, int val)
{
        sk_set_peek_off(sk, val);
        WRITE_ONCE(udp_sk(sk)->peeking_with_offset, val >= 0);
        return 0;
}

static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
{
        udp_assign_bit(NO_CHECK6_TX, sk, val);
}

static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
{
        udp_assign_bit(NO_CHECK6_RX, sk, val);
}

static inline bool udp_get_no_check6_tx(const struct sock *sk)
{
        return udp_test_bit(NO_CHECK6_TX, sk);
}

static inline bool udp_get_no_check6_rx(const struct sock *sk)
{
        return udp_test_bit(NO_CHECK6_RX, sk);
}

static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
                                 struct sk_buff *skb)
{
        int gso_size;

        if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
                gso_size = skb_shinfo(skb)->gso_size;
                put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size);
        }
}

DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
#endif

static inline bool udp_encap_needed(void)
{
        if (static_branch_unlikely(&udp_encap_needed_key))
                return true;

#if IS_ENABLED(CONFIG_IPV6)
        if (static_branch_unlikely(&udpv6_encap_needed_key))
                return true;
#endif

        return false;
}

static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
{
        if (!skb_is_gso(skb))
                return false;

        if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
            !udp_test_bit(ACCEPT_L4, sk))
                return true;

        if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST &&
            !udp_test_bit(ACCEPT_FRAGLIST, sk))
                return true;

        /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still
         * land in a tunnel as the socket check in udp_gro_receive cannot be
         * foolproof.
         */
        if (udp_encap_needed() &&
            READ_ONCE(udp_sk(sk)->encap_rcv) &&
            !(skb_shinfo(skb)->gso_type &
              (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM)))
                return true;

        return false;
}

static inline void udp_allow_gso(struct sock *sk)
{
        udp_set_bit(ACCEPT_L4, sk);
        udp_set_bit(ACCEPT_FRAGLIST, sk);
}

#define udp_portaddr_for_each_entry(__sk, list) \
        hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)

#define udp_portaddr_for_each_entry_from(__sk) \
        hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node)

#define udp_portaddr_for_each_entry_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)

#if !IS_ENABLED(CONFIG_BASE_SMALL)
#define udp_lrpa_for_each_entry_rcu(__up, node, list) \
        hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node)
#endif

#define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE)

static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6)
{
#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
        return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk);
#else
        return NULL;
#endif
}

#endif  /* _LINUX_UDP_H */