From cdeb2674aabb2ace05d59516938933b36329ec86 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Mar 2019 16:08:05 -0700 Subject: [PATCH 01/22] Update kernel headers Update kernel headers to fa7e428c6b7e ("openvswitch: add seqadj extension when NAT is used.") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 188 ++++++++++++++++++++---------- include/uapi/linux/if_tun.h | 1 + include/uapi/linux/tcp.h | 27 +++++ include/uapi/linux/tipc_netlink.h | 2 + 4 files changed, 157 insertions(+), 61 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a857878f..882a97cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -502,16 +502,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. - * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1435,14 +1425,14 @@ union bpf_attr { * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * A 8-byte long non-decreasing number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. 
* @@ -2098,6 +2088,25 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to @@ -2124,26 +2133,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_repeat(void *ctx) - * Description - * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. This delays - * the generation of a key up event for previously generated - * key down event. - * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. - * - * The *ctx* should point to the lirc sample as passed into - * the program. - * - * This helper is only available is the kernel was compiled with - * the **CONFIG_BPF_LIRC_MODE2** configuration option set to - * "**y**". - * Return - * 0 - * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () @@ -2159,30 +2149,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. 
* - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * - * void* get_local_storage(void *map, u64 flags) + * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. * The type and the size of the local storage is defined @@ -2209,6 +2181,24 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). 
+ * Return + * The id is returned or 0 in case the id could not be retrieved. + * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child @@ -2289,6 +2279,16 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. @@ -2343,29 +2343,94 @@ union bpf_attr { * Return * 0 * + * int bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. 
+ * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such - * that all the fields in bpf_sock can be accessed. + * that all the fields in this **bpf_sock** can be accessed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. 
* * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. - * * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * * int bpf_skb_ecn_set_ce(struct sk_buf *skb) - * Description - * Sets ECN of IP header to ce (congestion encountered) if - * current value is ect (ECN capable). Works with IPv6 and IPv4. - * Return - * 1 if set, 0 if not set. + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2465,7 +2530,8 @@ union bpf_attr { FN(spin_unlock), \ FN(sk_fullsock), \ FN(tcp_sock), \ - FN(skb_ecn_set_ce), + FN(skb_ecn_set_ce), \ + FN(get_listener_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 2f011655..8489ae03 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ #define TUNSETSTEERINGEBPF _IOR('T', 224, int) #define TUNSETFILTEREBPF _IOR('T', 225, int) #define TUNSETCARRIER _IOW('T', 226, int) +#define TUNGETDEVNETNS _IO('T', 227) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 799b5c5f..2d562797 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -160,15 +160,42 @@ enum { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. + */ TCP_CA_Open = 0, #define TCPF_CA_Open (1< Date: Fri, 22 Mar 2019 15:47:33 +0700 Subject: [PATCH 02/22] tipc: add link broadcast set method and ratio The command added here makes it possible to forcibly configure the broadcast link to use either broadcast or replicast, in addition to the already existing auto selection algorithm. 
A sample usage is shown below: $tipc link set broadcast BROADCAST $tipc link set broadcast AUTOSELECT ratio 25 $tipc link set broadcast -h Usage: tipc link set broadcast PROPERTY PROPERTIES BROADCAST - Forces all multicast traffic to be transmitted via broadcast only, irrespective of cluster size and number of destinations REPLICAST - Forces all multicast traffic to be transmitted via replicast only, irrespective of cluster size and number of destinations AUTOSELECT - Auto switching to broadcast or replicast depending on cluster size and destination node number ratio SIZE - Set the AUTOSELECT criteria, percentage of destination nodes vs cluster size Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David Ahern --- tipc/link.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tipc/link.c b/tipc/link.c index 43e26da3..e3b10bb7 100644 --- a/tipc/link.c +++ b/tipc/link.c @@ -28,6 +28,9 @@ #define PRIORITY_STR "priority" #define TOLERANCE_STR "tolerance" #define WINDOW_STR "window" +#define BROADCAST_STR "broadcast" + +static const char tipc_bclink_name[] = "broadcast-link"; static int link_list_cb(const struct nlmsghdr *nlh, void *data) { @@ -521,7 +524,8 @@ static void cmd_link_set_help(struct cmdl *cmdl) "PROPERTIES\n" " tolerance TOLERANCE - Set link tolerance\n" " priority PRIORITY - Set link priority\n" - " window WINDOW - Set link window\n", + " window WINDOW - Set link window\n" + " broadcast BROADCAST - Set link broadcast\n", cmdl->argv[0]); } @@ -585,6 +589,95 @@ static int cmd_link_set_prop(struct nlmsghdr *nlh, const struct cmd *cmd, return msg_doit(nlh, link_get_cb, &prop); } +static void cmd_link_set_bcast_help(struct cmdl *cmdl) +{ + fprintf(stderr, "Usage: %s link set broadcast PROPERTY\n\n" + "PROPERTIES\n" + " BROADCAST - Forces all multicast traffic to be\n" + " transmitted via broadcast only,\n" + " irrespective of cluster size and number\n" + " of destinations\n\n" + " 
REPLICAST - Forces all multicast traffic to be\n" + " transmitted via replicast only,\n" + " irrespective of cluster size and number\n" + " of destinations\n\n" + " AUTOSELECT - Auto switching to broadcast or replicast\n" + " depending on cluster size and destination\n" + " node number\n\n" + " ratio SIZE - Set the AUTOSELECT criteria, percentage of\n" + " destination nodes vs cluster size\n\n", + cmdl->argv[0]); +} + +static int cmd_link_set_bcast(struct nlmsghdr *nlh, const struct cmd *cmd, + struct cmdl *cmdl, void *data) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlattr *props; + struct nlattr *attrs; + struct opt *opt; + struct opt opts[] = { + { "BROADCAST", OPT_KEY, NULL }, + { "REPLICAST", OPT_KEY, NULL }, + { "AUTOSELECT", OPT_KEY, NULL }, + { "ratio", OPT_KEYVAL, NULL }, + { NULL } + }; + int method = 0; + + if (help_flag) { + (cmd->help)(cmdl); + return -EINVAL; + } + + if (parse_opts(opts, cmdl) < 0) + return -EINVAL; + + for (opt = opts; opt->key; opt++) + if (opt->val) + break; + + if (!opt || !opt->key) { + (cmd->help)(cmdl); + return -EINVAL; + } + + nlh = msg_init(buf, TIPC_NL_LINK_SET); + if (!nlh) { + fprintf(stderr, "error, message initialisation failed\n"); + return -1; + } + + attrs = mnl_attr_nest_start(nlh, TIPC_NLA_LINK); + /* Direct to broadcast-link setting */ + mnl_attr_put_strz(nlh, TIPC_NLA_LINK_NAME, tipc_bclink_name); + props = mnl_attr_nest_start(nlh, TIPC_NLA_LINK_PROP); + + if (get_opt(opts, "BROADCAST")) + method = 0x1; + else if (get_opt(opts, "REPLICAST")) + method = 0x2; + else if (get_opt(opts, "AUTOSELECT")) + method = 0x4; + + opt = get_opt(opts, "ratio"); + if (!method && !opt) { + (cmd->help)(cmdl); + return -EINVAL; + } + + if (method) + mnl_attr_put_u32(nlh, TIPC_NLA_PROP_BROADCAST, method); + + if (opt) + mnl_attr_put_u32(nlh, TIPC_NLA_PROP_BROADCAST_RATIO, + atoi(opt->val)); + + mnl_attr_nest_end(nlh, props); + mnl_attr_nest_end(nlh, attrs); + return msg_doit(nlh, NULL, NULL); +} + static int cmd_link_set(struct 
nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, void *data) { @@ -592,6 +685,7 @@ static int cmd_link_set(struct nlmsghdr *nlh, const struct cmd *cmd, { PRIORITY_STR, cmd_link_set_prop, cmd_link_set_help }, { TOLERANCE_STR, cmd_link_set_prop, cmd_link_set_help }, { WINDOW_STR, cmd_link_set_prop, cmd_link_set_help }, + { BROADCAST_STR, cmd_link_set_bcast, cmd_link_set_bcast_help }, { NULL } }; From 5027f233e35b80f306bc06b63f2ad1243ba5f3a8 Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Fri, 22 Mar 2019 15:47:34 +0700 Subject: [PATCH 03/22] tipc: add link broadcast get The command prints the actual method that multicast is using in the system, and also the 'ratio' value for the AUTOSELECT method. A sample usage is shown below: $tipc link get broadcast BROADCAST $tipc link get broadcast AUTOSELECT ratio:30% $tipc link get broadcast -j -p [ { "method": "AUTOSELECT" },{ "ratio": 30 } ] Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David Ahern --- tipc/link.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/tipc/link.c b/tipc/link.c index e3b10bb7..e123c186 100644 --- a/tipc/link.c +++ b/tipc/link.c @@ -175,10 +175,92 @@ static void cmd_link_get_help(struct cmdl *cmdl) "PROPERTIES\n" " tolerance - Get link tolerance\n" " priority - Get link priority\n" - " window - Get link window\n", + " window - Get link window\n" + " broadcast - Get link broadcast\n", cmdl->argv[0]); } +static int cmd_link_get_bcast_cb(const struct nlmsghdr *nlh, void *data) +{ + int *prop = data; + int prop_ratio = TIPC_NLA_PROP_BROADCAST_RATIO; + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *info[TIPC_NLA_MAX + 1] = {}; + struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1] = {}; + struct nlattr *props[TIPC_NLA_PROP_MAX + 1] = {}; + int bc_mode; + + mnl_attr_parse(nlh, sizeof(*genl), parse_attrs, info); + if (!info[TIPC_NLA_LINK]) + return MNL_CB_ERROR; + + mnl_attr_parse_nested(info[TIPC_NLA_LINK], 
parse_attrs, attrs); + if (!attrs[TIPC_NLA_LINK_PROP]) + return MNL_CB_ERROR; + + mnl_attr_parse_nested(attrs[TIPC_NLA_LINK_PROP], parse_attrs, props); + if (!props[*prop]) + return MNL_CB_ERROR; + + bc_mode = mnl_attr_get_u32(props[*prop]); + + new_json_obj(json); + open_json_object(NULL); + switch (bc_mode) { + case 0x1: + print_string(PRINT_ANY, "method", "%s\n", "BROADCAST"); + break; + case 0x2: + print_string(PRINT_ANY, "method", "%s\n", "REPLICAST"); + break; + case 0x4: + print_string(PRINT_ANY, "method", "%s", "AUTOSELECT"); + close_json_object(); + open_json_object(NULL); + print_uint(PRINT_ANY, "ratio", " ratio:%u%\n", + mnl_attr_get_u32(props[prop_ratio])); + break; + default: + print_string(PRINT_ANY, NULL, "UNKNOWN\n", NULL); + break; + } + close_json_object(); + delete_json_obj(); + return MNL_CB_OK; +} + +static void cmd_link_get_bcast_help(struct cmdl *cmdl) +{ + fprintf(stderr, "Usage: %s link get PPROPERTY\n\n" + "PROPERTIES\n" + " broadcast - Get link broadcast\n", + cmdl->argv[0]); +} + +static int cmd_link_get_bcast(struct nlmsghdr *nlh, const struct cmd *cmd, + struct cmdl *cmdl, void *data) +{ + int prop = TIPC_NLA_PROP_BROADCAST; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlattr *attrs; + + if (help_flag) { + (cmd->help)(cmdl); + return -EINVAL; + } + + nlh = msg_init(buf, TIPC_NL_LINK_GET); + if (!nlh) { + fprintf(stderr, "error, message initialisation failed\n"); + return -1; + } + attrs = mnl_attr_nest_start(nlh, TIPC_NLA_LINK); + /* Direct to broadcast-link setting */ + mnl_attr_put_strz(nlh, TIPC_NLA_LINK_NAME, tipc_bclink_name); + mnl_attr_nest_end(nlh, attrs); + return msg_doit(nlh, cmd_link_get_bcast_cb, &prop); +} + static int cmd_link_get(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, void *data) { @@ -186,6 +268,7 @@ static int cmd_link_get(struct nlmsghdr *nlh, const struct cmd *cmd, { PRIORITY_STR, cmd_link_get_prop, cmd_link_get_help }, { TOLERANCE_STR, cmd_link_get_prop, cmd_link_get_help }, { WINDOW_STR, 
cmd_link_get_prop, cmd_link_get_help }, + { BROADCAST_STR, cmd_link_get_bcast, cmd_link_get_bcast_help }, { NULL } }; From 35114a4cfe2e68bb6f736d309be2786f90ec8a8f Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Fri, 22 Mar 2019 15:47:35 +0700 Subject: [PATCH 04/22] tipc: add link broadcast man page Add a man page describing tipc link broadcast command get and set Signed-off-by: Hoang Le Signed-off-by: David Ahern --- man/man8/tipc-link.8 | 53 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 index 01afa1c3..47dae25d 100644 --- a/man/man8/tipc-link.8 +++ b/man/man8/tipc-link.8 @@ -1,4 +1,4 @@ -.TH TIPC-LINK 8 "02 Jun 2015" "iproute2" "Linux" +.TH TIPC-LINK 8 "22 Mar 2019" "iproute2" "Linux" .\" For consistency, please keep padding right aligned. .\" For example '.B "foo " bar' and not '.B foo " bar"' @@ -14,18 +14,36 @@ tipc-link \- show links or modify link properties .ti -8 .B tipc link set -.RB "{ " "priority " +.br +.RB "[ " "{ " "priority " .IR PRIORITY .RB "| " tolerance .IR TOLERANCE .RB "| " window .IR "WINDOW " } -.BI "link " LINK +.BI "link " LINK " ]" +.RB "|" +.br +.RB "[ " +.RB "{ " broadcast " [ " +.IR BROADCAST +.RB " | " +.IR REPLICAST +.RB " | " +.IR AUTOSELECT +.RB "[ " ratio +.IR SIZE +.RB "] " ] " } " "]" .ti -8 .B tipc link get -.RB "{ " "priority" " | " tolerance " | " window " } " link -.I LINK +.br +.RB "[ " "{ " "priority" " | " tolerance " | " window " } " link +.IR LINK " ] " +.RB "|" +.br +.RB "[ " { " broadcast " } " ]" +.br .ti -8 .B tipc link statistics @@ -306,6 +324,31 @@ They are usually transient and occur during the cluster startup phase or network reconfiguration. Possible status are: U or D. The status U implies up and D down. +.SS Broadcast properties +.TP +.B BROADCAST +.br +Forces all multicast traffic to be transmitted via broadcast only, +irrespective of cluster size and number of destinations. 
+ +.TP +.B REPLICAST +.br +Forces all multicast traffic to be transmitted via replicast only, +irrespective of cluster size and number of destinations. + +.TP +.B AUTOSELECT +.br +Auto switching to broadcast or replicast depending on cluster size and +destination node number. + +.TP +.B ratio SIZE +.br +Set the AUTOSELECT criteria, percentage of destination nodes vs cluster +size. + .SH EXAMPLES .PP tipc link monitor list From 65147bbe8fac4a728118d08f6ae98f4dfc340ff2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 3 Apr 2019 12:10:29 -0500 Subject: [PATCH 05/22] Add .mailmap file .mailmap allows tracking multiple email addresses to the proper user name. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: David Ahern --- .mailmap | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .mailmap diff --git a/.mailmap b/.mailmap new file mode 100644 index 00000000..c012d3d0 --- /dev/null +++ b/.mailmap @@ -0,0 +1,8 @@ +# +# This list is used by git-shortlog to fix a few botched name translations +# in the git archive, either because the author's full name was messed up +# and/or not always written the same way, making contributions from the +# same person appearing not to be so or badly displayed. +# +Steve Wise +Steve Wise From 8f5cfd23cd827054f2706e204602fa1e35a68320 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 3 Apr 2019 12:10:30 -0500 Subject: [PATCH 06/22] rdma: add helper rd_sendrecv_msg() This function sends the constructed netlink message and then receives the response. Change rd_recv_msg() to display any error messages. Change 'rdma dev set' to use rd_sendrecv_msg(). 
Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: David Ahern --- rdma/dev.c | 2 +- rdma/rdma.h | 2 ++ rdma/res.h | 1 + rdma/utils.c | 18 ++++++++++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/rdma/dev.c b/rdma/dev.c index 954e0015..33962520 100644 --- a/rdma/dev.c +++ b/rdma/dev.c @@ -268,7 +268,7 @@ static int dev_set_name(struct rd *rd) mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx); mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd)); - return rd_send_msg(rd); + return rd_sendrecv_msg(rd, seq); } static int dev_one_set(struct rd *rd) diff --git a/rdma/rdma.h b/rdma/rdma.h index 1022e9a2..6c7f7d15 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -68,6 +68,7 @@ struct rd { json_writer_t *jw; bool json_output; bool pretty_output; + bool suppress_errors; struct list_head filter_list; }; @@ -119,6 +120,7 @@ bool rd_is_string_filtered_attr(struct rd *rd, const char *key, const char *val, */ int rd_send_msg(struct rd *rd); int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq); +int rd_sendrecv_msg(struct rd *rd, unsigned int seq); void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t flags); int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data); int rd_attr_cb(const struct nlattr *attr, void *data); diff --git a/rdma/res.h b/rdma/res.h index b4a7e552..525171fc 100644 --- a/rdma/res.h +++ b/rdma/res.h @@ -31,6 +31,7 @@ int res_qp_idx_parse_cb(const struct nlmsghdr *nlh, void *data); if (id) { \ ret = rd_doit_index(rd, &idx); \ if (ret) { \ + rd->suppress_errors = true; \ ret = _res_send_idx_msg(rd, command, \ name##_idx_parse_cb, \ idx, id); \ diff --git a/rdma/utils.c b/rdma/utils.c index 1f6bf330..11ed8a73 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, unsigned int seq) ret = mnl_cb_run(buf, ret, seq, portid, callback, data); } while (ret > 0); + if (ret < 0 && 
!rd->suppress_errors) + perror("error"); + mnl_socket_close(rd->nl); return ret; } +static int null_cb(const struct nlmsghdr *nlh, void *data) +{ + return MNL_CB_OK; +} + +int rd_sendrecv_msg(struct rd *rd, unsigned int seq) +{ + int ret; + + ret = rd_send_msg(rd); + if (!ret) + ret = rd_recv_msg(rd, null_cb, rd, seq); + return ret; +} + static struct dev_map *_dev_map_lookup(struct rd *rd, const char *dev_name) { struct dev_map *dev_map; From 4336c5821a7befa11298afaaa730045e14d1b4d9 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 3 Apr 2019 12:10:31 -0500 Subject: [PATCH 07/22] rdma: add 'link add/delete' commands Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma device to a netdev interface. EG: rdma link add rxe_eth0 type rxe netdev eth0 rdma link delete rxe_eth0 Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: David Ahern --- rdma/link.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ rdma/rdma.h | 2 ++ 2 files changed, 80 insertions(+) diff --git a/rdma/link.c b/rdma/link.c index 89e81b84..10b2e513 100644 --- a/rdma/link.c +++ b/rdma/link.c @@ -9,6 +9,9 @@ static int link_help(struct rd *rd) { pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename); + pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n", + rd->filename); + pr_out("Usage: %s link delete NAME\n", rd->filename); return 0; } @@ -336,10 +339,85 @@ static int link_show(struct rd *rd) return rd_exec_link(rd, link_one_show, true); } +static int link_add_netdev(struct rd *rd) +{ + char *link_netdev; + uint32_t seq; + + if (rd_no_arg(rd)) { + pr_err("Please provide a net device name.\n"); + return -EINVAL; + } + + link_netdev = rd_argv(rd); + rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq, + (NLM_F_REQUEST | NLM_F_ACK)); + mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd->link_name); + mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, rd->link_type); + mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, link_netdev); 
+ return rd_sendrecv_msg(rd, seq); +} + +static int link_add_type(struct rd *rd) +{ + const struct rd_cmd cmds[] = { + { NULL, link_help}, + { "netdev", link_add_netdev}, + { 0 } + }; + + if (rd_no_arg(rd)) { + pr_err("Please provide a link type name.\n"); + return -EINVAL; + } + rd->link_type = rd_argv(rd); + rd_arg_inc(rd); + return rd_exec_cmd(rd, cmds, "parameter"); +} + +static int link_add(struct rd *rd) +{ + const struct rd_cmd cmds[] = { + { NULL, link_help}, + { "type", link_add_type}, + { 0 } + }; + + if (rd_no_arg(rd)) { + pr_err("Please provide a link name to add.\n"); + return -EINVAL; + } + rd->link_name = rd_argv(rd); + rd_arg_inc(rd); + + return rd_exec_cmd(rd, cmds, "parameter"); +} + +static int _link_del(struct rd *rd) +{ + uint32_t seq; + + if (!rd_no_arg(rd)) { + pr_err("Unknown parameter %s\n", rd_argv(rd)); + return -EINVAL; + } + rd_prepare_msg(rd, RDMA_NLDEV_CMD_DELLINK, &seq, + (NLM_F_REQUEST | NLM_F_ACK)); + mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx); + return rd_sendrecv_msg(rd, seq); +} + +static int link_del(struct rd *rd) +{ + return rd_exec_require_dev(rd, _link_del); +} + int cmd_link(struct rd *rd) { const struct rd_cmd cmds[] = { { NULL, link_show }, + { "add", link_add }, + { "delete", link_del }, { "show", link_show }, { "list", link_show }, { "help", link_help }, diff --git a/rdma/rdma.h b/rdma/rdma.h index 6c7f7d15..9ed9e045 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -70,6 +70,8 @@ struct rd { bool pretty_output; bool suppress_errors; struct list_head filter_list; + char *link_name; + char *link_type; }; struct rd_cmd { From 1d45bf724eb5e8e470e38ca7068145bb1fa98eea Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 3 Apr 2019 12:10:32 -0500 Subject: [PATCH 08/22] rdma: man page update for link add/delete Update the 'rdma link' man page with 'link add/delete' info. 
Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: David Ahern --- man/man8/rdma-link.8 | 47 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8 index bddf3474..b3b40de7 100644 --- a/man/man8/rdma-link.8 +++ b/man/man8/rdma-link.8 @@ -22,6 +22,18 @@ rdma-link \- rdma link configuration .B rdma link show .RI "[ " DEV/PORT_INDEX " ]" +.ti -8 +.B rdma link add +.BR NAME +.BR type +.BR TYPE +.BR netdev +.BR NETDEV + +.ti -8 +.B rdma link delete +.RI NAME + .ti -8 .B rdma link help @@ -33,6 +45,31 @@ rdma-link \- rdma link configuration - specifies the RDMA link to show. If this argument is omitted all links are listed. +.SS rdma link add NAME type TYPE netdev NETDEV - add an rdma link for the specified type to the network device +.sp +.BR NAME +- specifies the new name of the rdma link to add + +.BR TYPE +- specifies which rdma type to use. Link types: +.sp +.in +8 +.B rxe +- Soft RoCE driver +.sp +.B siw +- Soft iWARP driver +.in -8 + +.BR NETDEV +- specifies the network device to which the link is bound + +.SS rdma link delete NAME - delete an rdma link +.PP +.BR NAME +- specifies the name of the rdma link to delete +.PP + .SH "EXAMPLES" .PP rdma link show @@ -45,6 +82,16 @@ rdma link show mlx5_2/1 Shows the state of specified rdma link. 
.RE .PP +rdma link add rxe_eth0 type rxe netdev eth0 +.RS 4 +Adds a RXE link named rxe_eth0 to network device eth0 +.RE +.PP +rdma link del rxe_eth0 +.RS 4 +Removes RXE link rxe_eth0 +.RE +.PP .SH SEE ALSO .BR rdma (8), From 188c7fe6eaba56a6ddfd8f142e9abbfcd8840318 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 17 Apr 2019 14:07:48 -0700 Subject: [PATCH 09/22] Update kernel headers Update kernel headers to commit 6b0a7f84ea1f ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 102 ++++++++++++++++++++++++++++++++++++--- include/uapi/linux/btf.h | 32 ++++++++++-- include/uapi/linux/fou.h | 6 +++ 3 files changed, 130 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 882a97cc..79f729c6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -105,6 +105,7 @@ enum bpf_cmd { BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, }; enum bpf_map_type { @@ -255,8 +256,19 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * two extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd map fd + * insn[1].imm: 0 offset into value + * insn[0].off: 0 0 + * insn[1].off: 0 0 + * ldimm64 rewrite: address of map address of map[0]+offset + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_VALUE 2 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -283,7 +295,7 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U -/* Flags for accessing BPF object */ +/* Flags for accessing BPF object from syscall side. 
*/ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -293,6 +305,10 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* Flags for accessing BPF object from program side. */ +#define BPF_F_RDONLY_PROG (1U << 7) +#define BPF_F_WRONLY_PROG (1U << 8) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -396,6 +412,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ @@ -1478,13 +1501,31 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. 
+ * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2431,6 +2472,38 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_socket to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. + * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2531,7 +2604,9 @@ union bpf_attr { FN(sk_fullsock), \ FN(tcp_sock), \ FN(skb_ecn_set_ce), \ - FN(get_listener_sock), + FN(get_listener_sock), \ + FN(skc_lookup_tcp), \ + FN(tcp_check_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2590,9 +2665,24 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index cb4cf8cc..73eba2e5 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -39,11 +39,11 @@ struct btf_type { * struct, union and fwd */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. + * FUNC, FUNC_PROTO and VAR. * "type" is a type_id referring to another type. 
*/ union { @@ -70,8 +70,10 @@ struct btf_type { #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#define BTF_KIND_MAX BTF_KIND_DATASEC +#define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -138,4 +140,26 @@ struct btf_param { __u32 type; }; +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + #endif /* __LINUX_BTF_H__ */ diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index bf022c63..9f915118 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -16,6 +16,12 @@ enum { FOU_ATTR_IPPROTO, /* u8 */ FOU_ATTR_TYPE, /* u8 */ FOU_ATTR_REMCSUM_NOPARTIAL, /* flag */ + FOU_ATTR_LOCAL_V4, /* u32 */ + FOU_ATTR_LOCAL_V6, /* in6_addr */ + FOU_ATTR_PEER_V4, /* u32 */ + FOU_ATTR_PEER_V6, /* in6_addr */ + FOU_ATTR_PEER_PORT, /* u16 */ + FOU_ATTR_IFINDEX, /* s32 */ __FOU_ATTR_MAX, }; From 185ba5e2d4cc5e033ce0bdfb45596783a9610ba0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 18 Apr 2019 06:44:05 +0000 Subject: [PATCH 10/22] ipneigh: Print neighbour offload indication Print the offload indication in case it is set on the neighbour. 
Signed-off-by: Ido Schimmel Signed-off-by: David Ahern --- ip/ipneigh.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ip/ipneigh.c b/ip/ipneigh.c index 88596245..27986ff7 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -384,6 +384,9 @@ int print_neigh(struct nlmsghdr *n, void *arg) if (r->ndm_flags & NTF_EXT_LEARNED) print_null(PRINT_ANY, "extern_learn", " %s ", "extern_learn"); + if (r->ndm_flags & NTF_OFFLOADED) + print_null(PRINT_ANY, "offload", " %s", "offload"); + if (show_stats) { if (tb[NDA_CACHEINFO]) print_cacheinfo(RTA_DATA(tb[NDA_CACHEINFO])); From 90306a14402b6a082ab0aac0f5016b2258201220 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 16 Apr 2019 16:19:10 +0300 Subject: [PATCH 11/22] iplink: bridge: add support for vlan_stats_per_port Add support for manipulating and showing the vlan_stats_per_port bridge option which can be toggled only when there are no port VLANs configured. Also update the man page with the new option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David Ahern --- ip/iplink_bridge.c | 15 +++++++++++++++ man/man8/ip-link.8.in | 9 +++++++++ 2 files changed, 24 insertions(+) diff --git a/ip/iplink_bridge.c b/ip/iplink_bridge.c index e9b77fdf..10ba85f6 100644 --- a/ip/iplink_bridge.c +++ b/ip/iplink_bridge.c @@ -41,6 +41,7 @@ static void print_explain(FILE *f) " [ vlan_protocol VLAN_PROTOCOL ]\n" " [ vlan_default_pvid VLAN_DEFAULT_PVID ]\n" " [ vlan_stats_enabled VLAN_STATS_ENABLED ]\n" + " [ vlan_stats_per_port VLAN_STATS_PER_PORT ]\n" " [ mcast_snooping MULTICAST_SNOOPING ]\n" " [ mcast_router MULTICAST_ROUTER ]\n" " [ mcast_query_use_ifaddr MCAST_QUERY_USE_IFADDR ]\n" @@ -175,6 +176,14 @@ static int bridge_parse_opt(struct link_util *lu, int argc, char **argv, invarg("invalid vlan_stats_enabled", *argv); addattr8(n, 1024, IFLA_BR_VLAN_STATS_ENABLED, vlan_stats_enabled); + } else if (matches(*argv, "vlan_stats_per_port") == 0) { + __u8 vlan_stats_per_port; + + NEXT_ARG(); + if (get_u8(&vlan_stats_per_port, 
*argv, 0)) + invarg("invalid vlan_stats_per_port", *argv); + addattr8(n, 1024, IFLA_BR_VLAN_STATS_PER_PORT, + vlan_stats_per_port); } else if (matches(*argv, "mcast_router") == 0) { __u8 mcast_router; @@ -521,6 +530,12 @@ static void bridge_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) "vlan_stats_enabled %u ", rta_getattr_u8(tb[IFLA_BR_VLAN_STATS_ENABLED])); + if (tb[IFLA_BR_VLAN_STATS_PER_PORT]) + print_uint(PRINT_ANY, + "vlan_stats_per_port", + "vlan_stats_per_port %u ", + rta_getattr_u8(tb[IFLA_BR_VLAN_STATS_PER_PORT])); + if (tb[IFLA_BR_GROUP_FWD_MASK]) print_0xhex(PRINT_ANY, "group_fwd_mask", diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 2411d43e..628a3651 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -1392,6 +1392,8 @@ the following additional arguments are supported: ] [ .BI vlan_stats_enabled " VLAN_STATS_ENABLED " ] [ +.BI vlan_stats_per_port " VLAN_STATS_PER_PORT " +] [ .BI mcast_snooping " MULTICAST_SNOOPING " ] [ .BI mcast_router " MULTICAST_ROUTER " @@ -1503,6 +1505,13 @@ or disable .RI ( VLAN_STATS_ENABLED " == 0) " per-VLAN stats accounting. +.BI vlan_stats_per_port " VLAN_STATS_PER_PORT " +- enable +.RI ( VLAN_STATS_PER_PORT " == 1) " +or disable +.RI ( VLAN_STATS_PER_PORT " == 0) " +per-VLAN per-port stats accounting. Can be changed only when there are no port VLANs configured. + .BI mcast_snooping " MULTICAST_SNOOPING " - turn multicast snooping on .RI ( MULTICAST_SNOOPING " > 0) " From 112112b8eb11d2bb041aca270f5f4456d40e24fb Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Mon, 22 Apr 2019 17:27:41 +0200 Subject: [PATCH 12/22] ip fou: Support binding FOU ports This patch adds support for binding FOU ports using iproute2. Kernel-support was added in 1713cb37bf67 ("fou: Support binding FoU socket"). The parse function now handles new arguments for setting the binding-related attributes, while the print function writes the new attributes if they are set. 
Also, the man page has been updated. v2->v3: * Remove redundant ll_init_map()-calls (thanks David Ahern). v1->v2 (all changes suggested by David Ahern): * Fix reverse Christmas tree ordering. * Remove redundant peer_port_set-variable, it is enough to check peer_port. * Add proper error handling of invalid local/peer addresses. * Use interface name and not index. * Remove updating fou-header file, it is already done. Signed-off-by: Kristian Evensen Signed-off-by: David Ahern --- ip/ipfou.c | 135 +++++++++++++++++++++++++++++++++++++++++++--- man/man8/ip-fou.8 | 49 ++++++++++++++++- 2 files changed, 175 insertions(+), 9 deletions(-) diff --git a/ip/ipfou.c b/ip/ipfou.c index 346522dd..ea126b08 100644 --- a/ip/ipfou.c +++ b/ip/ipfou.c @@ -28,11 +28,16 @@ static void usage(void) { fprintf(stderr, "Usage: ip fou add port PORT { ipproto PROTO | gue } [ -6 ]\n" - " ip fou del port PORT [ -6 ]\n" + " [ local IFADDR ] [ peer IFADDR ]\n" + " [ peer_port PORT ] [ dev IFNAME ]\n" + " ip fou del port PORT [ -6 ] [ local IFADDR ]\n" + " [ peer IFADDR ] [ peer_port PORT ]\n" + " [ dev IFNAME ]\n" " ip fou show\n" "\n" "Where: PROTO { ipproto-name | 1..255 }\n" - " PORT { 1..65535 }\n"); + " PORT { 1..65535 }\n" + " IFADDR { addr }\n"); exit(-1); } @@ -48,12 +53,14 @@ static int genl_family = -1; static int fou_parse_opt(int argc, char **argv, struct nlmsghdr *n, bool adding) { - __u16 port; - int port_set = 0; - __u8 ipproto, type; + const char *local = NULL, *peer = NULL; + __u16 port, peer_port = 0; + __u8 family = AF_INET; bool gue_set = false; int ipproto_set = 0; - __u8 family = AF_INET; + __u8 ipproto, type; + int port_set = 0; + int index = 0; while (argc > 0) { if (!matches(*argv, "port")) { @@ -77,6 +84,37 @@ static int fou_parse_opt(int argc, char **argv, struct nlmsghdr *n, gue_set = true; } else if (!matches(*argv, "-6")) { family = AF_INET6; + } else if (!matches(*argv, "local")) { + NEXT_ARG(); + + local = *argv; + } else if (!matches(*argv, "peer")) { + NEXT_ARG(); 
+ + peer = *argv; + } else if (!matches(*argv, "peer_port")) { + NEXT_ARG(); + + if (get_be16(&peer_port, *argv, 0) || peer_port == 0) + invarg("invalid peer port", *argv); + } else if (!matches(*argv, "dev")) { + const char *ifname; + + NEXT_ARG(); + + ifname = *argv; + + if (check_ifname(ifname)) { + fprintf(stderr, "fou: invalid device name\n"); + exit(EXIT_FAILURE); + } + + index = ll_name_to_index(ifname); + + if (!index) { + fprintf(stderr, "fou: unknown device name\n"); + exit(EXIT_FAILURE); + } } else { fprintf(stderr , "fou: unknown command \"%s\"?\n", *argv); @@ -101,6 +139,11 @@ static int fou_parse_opt(int argc, char **argv, struct nlmsghdr *n, return -1; } + if ((peer_port && !peer) || (peer && !peer_port)) { + fprintf(stderr, "fou: both peer and peer port must be set\n"); + return -1; + } + type = gue_set ? FOU_ENCAP_GUE : FOU_ENCAP_DIRECT; addattr16(n, 1024, FOU_ATTR_PORT, port); @@ -110,6 +153,38 @@ static int fou_parse_opt(int argc, char **argv, struct nlmsghdr *n, if (ipproto_set) addattr8(n, 1024, FOU_ATTR_IPPROTO, ipproto); + if (local) { + inet_prefix local_addr; + __u8 attr_type = family == AF_INET ? FOU_ATTR_LOCAL_V4 : + FOU_ATTR_LOCAL_V6; + + if (get_addr(&local_addr, local, family)) { + fprintf(stderr, "fou: parsing local address failed\n"); + exit(EXIT_FAILURE); + } + addattr_l(n, 1024, attr_type, &local_addr.data, + local_addr.bytelen); + } + + if (peer) { + inet_prefix peer_addr; + __u8 attr_type = family == AF_INET ? 
FOU_ATTR_PEER_V4 : + FOU_ATTR_PEER_V6; + + if (get_addr(&peer_addr, peer, family)) { + fprintf(stderr, "fou: parsing peer address failed\n"); + exit(EXIT_FAILURE); + } + addattr_l(n, 1024, attr_type, &peer_addr.data, + peer_addr.bytelen); + + if (peer_port) + addattr16(n, 1024, FOU_ATTR_PEER_PORT, peer_port); + } + + if (index) + addattr32(n, 1024, FOU_ATTR_IFINDEX, index); + return 0; } @@ -139,8 +214,10 @@ static int do_del(int argc, char **argv) static int print_fou_mapping(struct nlmsghdr *n, void *arg) { - struct genlmsghdr *ghdr; + __u8 family = AF_INET, local_attr_type, peer_attr_type, byte_len; struct rtattr *tb[FOU_ATTR_MAX + 1]; + __u8 empty_buf[16] = {0}; + struct genlmsghdr *ghdr; int len = n->nlmsg_len; if (n->nlmsg_type != genl_family) @@ -166,7 +243,7 @@ static int print_fou_mapping(struct nlmsghdr *n, void *arg) " ipproto %u", rta_getattr_u8(tb[FOU_ATTR_IPPROTO])); if (tb[FOU_ATTR_AF]) { - __u8 family = rta_getattr_u8(tb[FOU_ATTR_AF]); + family = rta_getattr_u8(tb[FOU_ATTR_AF]); print_string(PRINT_JSON, "family", NULL, family_name(family)); @@ -175,6 +252,48 @@ static int print_fou_mapping(struct nlmsghdr *n, void *arg) print_string(PRINT_FP, NULL, " -6", NULL); } + + local_attr_type = family == AF_INET ? FOU_ATTR_LOCAL_V4 : + FOU_ATTR_LOCAL_V6; + peer_attr_type = family == AF_INET ? 
FOU_ATTR_PEER_V4 : + FOU_ATTR_PEER_V6; + byte_len = af_bit_len(family) / 8; + + if (tb[local_attr_type] && memcmp(RTA_DATA(tb[local_attr_type]), + empty_buf, byte_len)) { + print_string(PRINT_ANY, "local", " local %s", + format_host_rta(family, tb[local_attr_type])); + } + + if (tb[peer_attr_type] && memcmp(RTA_DATA(tb[peer_attr_type]), + empty_buf, byte_len)) { + print_string(PRINT_ANY, "peer", " peer %s", + format_host_rta(family, tb[peer_attr_type])); + } + + if (tb[FOU_ATTR_PEER_PORT]) { + __u16 p_port = ntohs(rta_getattr_u16(tb[FOU_ATTR_PEER_PORT])); + + if (p_port) + print_uint(PRINT_ANY, "peer_port", " peer_port %u", + p_port); + + } + + if (tb[FOU_ATTR_IFINDEX]) { + int index = rta_getattr_s32(tb[FOU_ATTR_IFINDEX]); + + if (index) { + const char *ifname; + + ifname = ll_index_to_name(index); + + if (ifname) + print_string(PRINT_ANY, "dev", " dev %s", + ifname); + } + } + print_string(PRINT_FP, NULL, "\n", NULL); close_json_object(); diff --git a/man/man8/ip-fou.8 b/man/man8/ip-fou.8 index 81cab928..f4e08f16 100644 --- a/man/man8/ip-fou.8 +++ b/man/man8/ip-fou.8 @@ -24,11 +24,43 @@ ip-gue \- Generic UDP Encapsulation receive port configuration .B ipproto .IR PROTO .RB " }" +.RB "[ " +.B local +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer_port +.IR PORT +.RB " ]" +.RB "[ " +.B dev +.IR IFNAME +.RB " ]" .br .ti -8 .BR "ip fou del" .B port .IR PORT +.RB "[ " +.B local +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer +.IR IFADDR +.RB " ]" +.RB "[ " +.B peer_port +.IR PORT +.RB " ]" +.RB "[ " +.B dev +.IR IFNAME +.RB " ]" .br .ti -8 .B ip fou show @@ -50,11 +82,22 @@ When creating a FOU or GUE receive port, the port number is specified in .I PORT argument. If FOU is used, the IP protocol number associated with the port is specified in .I PROTO +argument. You can bind a port to a local address/interface, by specifying the +address in the local +.I IFADDR +argument or the device in the +.I IFNAME +argument. 
If you would like to connect the port, you can specify the peer +address in the peer +.I IFADDR +argument and peer port in the peer_port +.I PORT argument. .PP A FOU or GUE receive port is deleted by specifying .I PORT -in the delete command. +in the delete command, as well as local address/interface or peer address/port +(if set). .SH EXAMPLES .PP .SS Configure a FOU receive port for GRE bound to 7777 @@ -72,6 +115,10 @@ in the delete command. .SS Delete the GUE receive port bound to 9999 .nf # ip fou del port 9999 +.SS Configure a FOU receive port for GRE bound to 1.2.3.4:7777 +.nf +# ip fou add port 7777 ipproto 47 local 1.2.3.4 +.PP .SH SEE ALSO .br .BR ip (8) From 70de8a7fa702ce29f8819c1e4ecf90177cc5deea Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 26 Apr 2019 11:02:36 -0700 Subject: [PATCH 13/22] Update kernel headers Update kernel headers to commit 148f025d41a8 ("Merge branch 'hns3-next'") Note, these warnings: ../include/uapi/linux/sockios.h:42:0: warning: "SIOCGSTAMP" redefined ../include/uapi/linux/sockios.h:43:0: warning: "SIOCGSTAMPNS" redefined are due to kernel commit 0768e17073dc5 ("net: socket: implement 64-bit timestamps") which moved the definitions from include/asm-generic/sockios.h to include/uapi/linux/sockios.h Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 154 +++++++++++++++++++++++++++++++++-- include/uapi/linux/icmpv6.h | 4 + include/uapi/linux/if_vlan.h | 9 +- include/uapi/linux/sockios.h | 21 +++++ include/uapi/linux/tipc.h | 1 + 5 files changed, 180 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79f729c6..fcfd7e3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -167,6 +167,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, }; enum bpf_attach_type { @@ -188,6 +189,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, BPF_FLOW_DISSECTOR, + 
BPF_CGROUP_SYSCTL, __MAX_BPF_ATTACH_TYPE }; @@ -1735,12 +1737,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * * Here are some examples of where one could call such eBPF * program: * @@ -2504,6 +2513,122 @@ union bpf_attr { * Return * 0 if iph and th are a valid SYN cookie ACK, or a negative error * otherwise. + * + * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. 
+ * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + * + * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. 
+ * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)) followed by a single optional '-' + * sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtol(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtoul(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2606,7 +2731,13 @@ union bpf_attr { FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ FN(skc_lookup_tcp), \ - FN(tcp_check_syncookie), + FN(tcp_check_syncookie), \ + FN(sysctl_get_name), \ + FN(sysctl_get_current_value), \ + FN(sysctl_get_new_value), \ + FN(sysctl_set_new_value), \ + FN(strtol), \ + FN(strtoul), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2668,17 +2799,20 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. 
*/ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) -#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) +/* BPF_FUNC_sysctl_get_name flags. */ +#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -3308,4 +3442,14 @@ struct bpf_line_info { struct bpf_spin_lock { __u32 val; }; + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. 
+ */ +}; + #endif /* __LINUX_BPF_H__ */ diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index cf8d5d47..1dc7cc67 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -90,6 +90,8 @@ struct icmp6hdr { #define ICMPV6_TIME_EXCEED 3 #define ICMPV6_PARAMPROB 4 +#define ICMPV6_ERRMSG_MAX 127 + #define ICMPV6_INFOMSG_MASK 0x80 #define ICMPV6_ECHO_REQUEST 128 @@ -110,6 +112,8 @@ struct icmp6hdr { #define ICMPV6_MRDISC_ADV 151 +#define ICMPV6_MSG_MAX 255 + /* * Codes for Destination Unreachable */ diff --git a/include/uapi/linux/if_vlan.h b/include/uapi/linux/if_vlan.h index 18a15dad..04bca79d 100644 --- a/include/uapi/linux/if_vlan.h +++ b/include/uapi/linux/if_vlan.h @@ -32,10 +32,11 @@ enum vlan_ioctl_cmds { }; enum vlan_flags { - VLAN_FLAG_REORDER_HDR = 0x1, - VLAN_FLAG_GVRP = 0x2, - VLAN_FLAG_LOOSE_BINDING = 0x4, - VLAN_FLAG_MVRP = 0x8, + VLAN_FLAG_REORDER_HDR = 0x1, + VLAN_FLAG_GVRP = 0x2, + VLAN_FLAG_LOOSE_BINDING = 0x4, + VLAN_FLAG_MVRP = 0x8, + VLAN_FLAG_BRIDGE_BINDING = 0x10, }; enum vlan_name_types { diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index d393e9ed..7d1bccbb 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -19,6 +19,7 @@ #ifndef _LINUX_SOCKIOS_H #define _LINUX_SOCKIOS_H +#include #include /* Linux-specific socket ioctls */ @@ -27,6 +28,26 @@ #define SOCK_IOC_TYPE 0x89 +/* + * the timeval/timespec data structure layout is defined by libc, + * so we need to cover both possible versions on 32-bit. + */ +/* Get stamp (timeval) */ +#define SIOCGSTAMP_NEW _IOR(SOCK_IOC_TYPE, 0x06, long long[2]) +/* Get stamp (timespec) */ +#define SIOCGSTAMPNS_NEW _IOR(SOCK_IOC_TYPE, 0x07, long long[2]) + +#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) +/* on 64-bit and x32, avoid the ?: operator */ +#define SIOCGSTAMP SIOCGSTAMP_OLD +#define SIOCGSTAMPNS SIOCGSTAMPNS_OLD +#else +#define SIOCGSTAMP ((sizeof(struct timeval)) == 8 ? 
\ + SIOCGSTAMP_OLD : SIOCGSTAMP_NEW) +#define SIOCGSTAMPNS ((sizeof(struct timespec)) == 8 ? \ + SIOCGSTAMPNS_OLD : SIOCGSTAMPNS_NEW) +#endif + /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 7a166a0f..e16cb4e2 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -190,6 +190,7 @@ struct sockaddr_tipc { #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ +#define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ /* * Flag values From 3f2e457ae40cdf22ff5c197388409387be7d1332 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Sat, 20 Apr 2019 11:45:37 +0100 Subject: [PATCH 14/22] iplink_vlan: add support for VLAN bridge binding flag This patch adds support for the VLAN bridge binding flag that is provided in net-next kernel by the series merged by 1ab839281cf7 ("net-support-binding-vlan-dev-link-state-to-vlan-member-bridge-ports") Signed-off-by: Mike Manning Signed-off-by: David Ahern --- ip/iplink_vlan.c | 11 +++++++++++ man/man8/ip-link.8.in | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/ip/iplink_vlan.c b/ip/iplink_vlan.c index 08e49956..26f6ee83 100644 --- a/ip/iplink_vlan.c +++ b/ip/iplink_vlan.c @@ -27,6 +27,7 @@ static void print_explain(FILE *f) " [ gvrp { on | off } ]\n" " [ mvrp { on | off } ]\n" " [ loose_binding { on | off } ]\n" + " [ bridge_binding { on | off } ]\n" " [ ingress-qos-map QOS-MAP ]\n" " [ egress-qos-map QOS-MAP ]\n" "\n" @@ -134,6 +135,15 @@ static int vlan_parse_opt(struct link_util *lu, int argc, char **argv, flags.flags &= ~VLAN_FLAG_LOOSE_BINDING; else return on_off("loose_binding", *argv); + } else if (matches(*argv, "bridge_binding") == 0) { + NEXT_ARG(); + flags.mask |= VLAN_FLAG_BRIDGE_BINDING; + if 
(strcmp(*argv, "on") == 0) + flags.flags |= VLAN_FLAG_BRIDGE_BINDING; + else if (strcmp(*argv, "off") == 0) + flags.flags &= ~VLAN_FLAG_BRIDGE_BINDING; + else + return on_off("bridge_binding", *argv); } else if (matches(*argv, "ingress-qos-map") == 0) { NEXT_ARG(); if (vlan_parse_qos_map(&argc, &argv, n, @@ -204,6 +214,7 @@ static void vlan_print_flags(FILE *fp, __u32 flags) _PF(GVRP); _PF(MVRP); _PF(LOOSE_BINDING); + _PF(BRIDGE_BINDING); #undef _PF if (flags) print_hex(PRINT_ANY, NULL, "%x", flags); diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 628a3651..d035a5c9 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -406,6 +406,9 @@ the following additional arguments are supported: .BR loose_binding " { " on " | " off " } " ] [ +.BR bridge_binding " { " on " | " off " } " +] +[ .BI ingress-qos-map " QOS-MAP " ] [ @@ -459,6 +462,10 @@ where is the physical device to which VLAN device is bound. .BR loose_binding " { " on " | " off " } " - specifies whether the VLAN device state is bound to the physical device state. +.BR bridge_binding " { " on " | " off " } " +- specifies whether the VLAN device link state tracks the state of bridge ports +that are members of the VLAN. + .BI ingress-qos-map " QOS-MAP " - defines a mapping of VLAN header prio field to the Linux internal packet priority on incoming frames. The format is FROM:TO with multiple mappings From 4d9e90f36b3f92644ed5c915758b403953727e40 Mon Sep 17 00:00:00 2001 From: "Lucas Siba 2019-04-20 11:40 UTC" <@> Date: Sat, 20 Apr 2019 12:06:18 -0700 Subject: [PATCH 15/22] Update tc-bpf.8 man page examples This patch updates the tc-bpf.8 example application for changes to the struct bpf_elf_map definition. In its current form, things compile, but the resulting object file is rejected by the verifier when attempting to load it through tc.
Signed-off-by: Lucas Siba Signed-off-by: David Ahern [ dropped the unnecessary flags initialization on commit ] --- man/man8/tc-bpf.8 | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/man/man8/tc-bpf.8 b/man/man8/tc-bpf.8 index b2f9344f..e4f68aaa 100644 --- a/man/man8/tc-bpf.8 +++ b/man/man8/tc-bpf.8 @@ -597,6 +597,7 @@ struct bpf_elf_map __section("maps") map_stats = { .size_key = sizeof(uint32_t), .size_value = sizeof(struct tuple), .max_elem = BPF_MAX_MARK, + .pinning = PIN_GLOBAL_NS, }; static inline void cls_update_stats(const struct __sk_buff *skb, @@ -709,13 +710,22 @@ in both examples was: #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -/* Used map structure */ +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + +/* ELF map definition */ struct bpf_elf_map { __u32 type; __u32 size_key; __u32 size_value; __u32 max_elem; + __u32 flags; __u32 id; + __u32 pinning; + __u32 inner_id; + __u32 inner_idx; }; /* Some used BPF function calls. */ From 517ea57c6dfc44093df8ca8e1852e9417643185d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 30 Apr 2019 11:42:08 +0300 Subject: [PATCH 16/22] devlink: Increase column size for larger shared buffers With current number of spaces the output is mangled if the shared buffer is congested. 
Before: # devlink sb occupancy show swp25 swp25: pool: 0: 33384960/39344256 1: 0/0 2: 0/0 3: 0/0 4: 0/720 5: 0/0 6: 0/0 7: 0/0 8: 0/288 9: 0/0 10: 0/0 itc: 0(0): 33272064/39344256 1(0): 0/0 2(0): 0/0 3(0): 0/0 4(0): 0/0 5(0): 0/0 6(0): 0/0 7(0): 0/0 etc: 0(4): 0/720 1(4): 0/0 2(4): 0/0 3(4): 0/0 4(4): 0/0 5(4): 0/0 6(4): 0/0 7(4): 0/0 8(8): 0/288 9(8): 0/0 10(8): 0/0 11(8): 0/0 12(8): 0/0 13(8): 0/0 14(8): 0/0 15(8): 0/0 After: # devlink sb occupancy show swp25 swp25: pool: 0: 39070080/39344256 1: 0/0 2: 0/0 3: 0/0 4: 0/720 5: 0/0 6: 0/0 7: 0/0 8: 0/288 9: 0/0 10: 0/0 itc: 0(0): 39062016/39344256 1(0): 0/0 2(0): 0/0 3(0): 0/0 4(0): 0/0 5(0): 0/0 6(0): 0/0 7(0): 0/0 etc: 0(4): 0/720 1(4): 0/0 2(4): 0/0 3(4): 0/0 4(4): 0/0 5(4): 0/0 6(4): 0/0 7(4): 0/0 8(8): 0/288 9(8): 0/0 10(8): 0/0 11(8): 0/0 12(8): 0/0 13(8): 0/0 14(8): 0/0 15(8): 0/0 v2: * Increase number of spaces to make the change more future-proof Signed-off-by: Ido Schimmel Reported-by: Alex Kushnarov Signed-off-by: David Ahern --- devlink/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index dc6e73fe..5bf81f55 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -3422,7 +3422,7 @@ static void pr_out_occ_show_item_list(const char *label, struct list_head *list, occ_item->bound_pool_index); else pr_out_sp(7, "%2u:", occ_item->index); - pr_out_sp(15, "%7u/%u", occ_item->cur, occ_item->max); + pr_out_sp(21, "%10u/%u", occ_item->cur, occ_item->max); if (i++ % 4 == 0) pr_out("\n"); } From 296b5de72423ea93bc62dcbc51c54fa096bd5ec7 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Tue, 30 Apr 2019 21:38:38 -0400 Subject: [PATCH 17/22] ss: add option to print socket information on one line Multi-line output in ss makes it difficult to search for things with grep. This new option will make it easier to find sockets matching certain criteria with simple grep commands. 
Example without option: $ ss -emoitn State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.1:13265 127.0.0.1:36743 uid:1974 ino:48271 sk:1 <-> skmem:(r0,rb2227595,t0,tb2626560,f0,w0,o0,bl0,d0) ts sack reno wscale:7,7 rto:211 rtt:10.245/16.616 ato:40 mss:65483 cwnd:10 bytes_acked:41865496 bytes_received:21580440 segs_out:242496 segs_in:351446 data_segs_out:242495 data_segs_in:242495 send 511.3Mbps lastsnd:2383 lastrcv:2383 lastack:2342 pacing_rate 1022.6Mbps rcv_rtt:92427.6 rcv_space:43725 minrtt:0.007 Example with new option: $ ss -emoitnO State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.1:13265 127.0.0.1:36743 uid:1974 ino:48271 sk:1 <-> skmem:(r0,rb2227595,t0,tb2626560,f0,w0,o0,bl0,d0) ts sack reno wscale:7,7 rto:211 rtt:10.067/16.429 ato:40 mss:65483 pmtu:65535 rcvmss:536 advmss:65483 cwnd:10 bytes_sent:41868244 bytes_acked:41868244 bytes_received:21581866 segs_out:242512 segs_in:351469 data_segs_out:242511 data_segs_in:242511 send 520.4Mbps lastsnd:14355 lastrcv:14355 lastack:14314 pacing_rate 1040.7Mbps delivery_rate 74837.7Mbps delivered:242512 app_limited busy:1861946ms rcv_rtt:92427.6 rcv_space:43725 rcv_ssthresh:43690 minrtt:0.007 Signed-off-by: Josh Hunt Signed-off-by: David Ahern --- man/man8/ss.8 | 3 +++ misc/ss.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index 03a3dcc6..9054fab9 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -24,6 +24,9 @@ Output version information. .B \-H, \-\-no-header Suppress header line. .TP +.B \-O, \-\-oneline +Print each socket's data on a single line. +.TP .B \-n, \-\-numeric Do not try to resolve service names. 
.TP diff --git a/misc/ss.c b/misc/ss.c index 9cb3ee19..99c06d31 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -121,6 +121,7 @@ static int follow_events; static int sctp_ino; static int show_tipcinfo; static int show_tos; +int oneline; enum col_id { COL_NETID, @@ -3053,7 +3054,8 @@ static int inet_show_sock(struct nlmsghdr *nlh, } if (show_mem || (show_tcpinfo && s->type != IPPROTO_UDP)) { - out("\n\t"); + if (!oneline) + out("\n\t"); if (s->type == IPPROTO_SCTP) sctp_show_info(nlh, r, tb); else @@ -3973,7 +3975,10 @@ static int packet_show_sock(struct nlmsghdr *nlh, void *arg) if (show_details) { if (pinfo) { - out("\n\tver:%d", pinfo->pdi_version); + if (oneline) + out(" ver:%d", pinfo->pdi_version); + else + out("\n\tver:%d", pinfo->pdi_version); out(" cpy_thresh:%d", pinfo->pdi_copy_thresh); out(" flags( "); if (pinfo->pdi_flags & PDI_RUNNING) @@ -3991,19 +3996,28 @@ static int packet_show_sock(struct nlmsghdr *nlh, void *arg) out(" )"); } if (ring_rx) { - out("\n\tring_rx("); + if (oneline) + out(" ring_rx("); + else + out("\n\tring_rx("); packet_show_ring(ring_rx); out(")"); } if (ring_tx) { - out("\n\tring_tx("); + if (oneline) + out(" ring_tx("); + else + out("\n\tring_tx("); packet_show_ring(ring_tx); out(")"); } if (has_fanout) { uint16_t type = (fanout >> 16) & 0xffff; - out("\n\tfanout("); + if (oneline) + out(" fanout("); + else + out("\n\tfanout("); out("id:%d,", fanout & 0xffff); out("type:"); @@ -4032,7 +4046,10 @@ static int packet_show_sock(struct nlmsghdr *nlh, void *arg) int num = RTA_PAYLOAD(tb[PACKET_DIAG_FILTER]) / sizeof(struct sock_filter); - out("\n\tbpf filter (%d): ", num); + if (oneline) + out(" bpf filter (%d): ", num); + else + out("\n\tbpf filter (%d): ", num); while (num) { out(" 0x%02x %u %u %u,", fil->code, fil->jt, fil->jf, fil->k); @@ -4144,7 +4161,10 @@ static int xdp_stats_print(struct sockstat *s, const struct filter *f) static void xdp_show_ring(const char *name, struct xdp_diag_ring *ring) { - out("\n\t%s(", name); + if 
(oneline) + out(" %s(", name); + else + out("\n\t%s(", name); out("entries:%u", ring->entries); out(")"); } @@ -4152,7 +4172,10 @@ static void xdp_show_ring(const char *name, struct xdp_diag_ring *ring) static void xdp_show_umem(struct xdp_diag_umem *umem, struct xdp_diag_ring *fr, struct xdp_diag_ring *cr) { - out("\n\tumem("); + if (oneline) + out(" tumem("); + else + out("\n\tumem("); out("id:%u", umem->id); out(",size:%llu", umem->size); out(",num_pages:%u", umem->num_pages); @@ -4574,7 +4597,10 @@ static int tipc_show_sock(struct nlmsghdr *nlh, void *arg) proc_ctx_print(&ss); if (show_tipcinfo) { - out("\n type:%s", stype_nameg[ss.type]); + if (oneline) + out(" type:%s", stype_nameg[ss.type]); + else + out("\n type:%s", stype_nameg[ss.type]); out(" cong:%s ", stat[TIPC_NLA_SOCK_STAT_LINK_CONG] ? "link" : stat[TIPC_NLA_SOCK_STAT_CONN_CONG] ? "conn" : "none"); @@ -4877,6 +4903,7 @@ static void _usage(FILE *dest) "\n" " -K, --kill forcibly close sockets, display what was closed\n" " -H, --no-header Suppress header line\n" +" -O, --oneline socket's data printed on a single line\n" "\n" " -A, --query=QUERY, --socket=QUERY\n" " QUERY := {all|inet|tcp|udp|raw|unix|unix_dgram|unix_stream|unix_seqpacket|packet|netlink|vsock_stream|vsock_dgram|tipc}[,QUERY]\n" @@ -5003,6 +5030,7 @@ static const struct option long_opts[] = { { "kill", 0, 0, 'K' }, { "no-header", 0, 0, 'H' }, { "xdp", 0, 0, OPT_XDPSOCK}, + { "oneline", 0, 0, 'O' }, { 0 } }; @@ -5018,7 +5046,7 @@ int main(int argc, char *argv[]) int state_filter = 0; while ((ch = getopt_long(argc, argv, - "dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHS", + "dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHSO", long_opts, NULL)) != EOF) { switch (ch) { case 'n': @@ -5192,6 +5220,9 @@ int main(int argc, char *argv[]) case 'H': show_header = 0; break; + case 'O': + oneline = 1; + break; case 'h': help(); case '?': From 420b36a874b72bbe62feb2de903a705e7d177bd0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 2 May 2019 16:13:21 -0700 
Subject: [PATCH 18/22] uapi: wrap SIOCGSTAMP and SIOCGSTAMPNS in ifndef These warnings: ../include/uapi/linux/sockios.h:42:0: warning: "SIOCGSTAMP" redefined ../include/uapi/linux/sockios.h:43:0: warning: "SIOCGSTAMPNS" redefined are from kernel commit 0768e17073dc5 ("net: socket: implement 64-bit timestamps"). This commit moved the definitions of SIOCGSTAMP and SIOCGSTAMPNS from include/asm-generic/sockios.h to include/uapi/linux/sockios.h. Older OS'es already define them in /usr/include/asm-generic/sockios.h resulting in ugly compile errors now: In file included from ll_types.c:24:0: ../include/uapi/linux/sockios.h:42:0: warning: "SIOCGSTAMP" redefined #define SIOCGSTAMP SIOCGSTAMP_OLD In file included from /usr/include/x86_64-linux-gnu/asm/sockios.h:1:0, from /usr/include/asm-generic/socket.h:5, from /usr/include/x86_64-linux-gnu/asm/socket.h:1, from /usr/include/x86_64-linux-gnu/bits/socket.h:368, from /usr/include/x86_64-linux-gnu/sys/socket.h:38, from ll_types.c:17: /usr/include/asm-generic/sockios.h:11:0: note: this is the location of the previous definition #define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ so wrap them in #ifndef. Signed-off-by: David Ahern --- include/uapi/linux/sockios.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index 7d1bccbb..35d7a60f 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -39,8 +39,12 @@ #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) /* on 64-bit and x32, avoid the ?: operator */ +#ifndef SIOCGSTAMP #define SIOCGSTAMP SIOCGSTAMP_OLD +#endif +#ifndef SIOCGSTAMPNS #define SIOCGSTAMPNS SIOCGSTAMPNS_OLD +#endif #else #define SIOCGSTAMP ((sizeof(struct timeval)) == 8 ? 
\ SIOCGSTAMP_OLD : SIOCGSTAMP_NEW) From fd6580972bcf6c24b3465684def17ac7b4a52c5f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 4 May 2019 09:13:26 -0700 Subject: [PATCH 19/22] Update kernel headers Update kernel headers to commit a734d1f4c2fc ("net: openvswitch: return an error instead of doing BUG_ON()") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 45 +++++++++++++++++++++++++++++++++- include/uapi/linux/if_ether.h | 1 + include/uapi/linux/pkt_sched.h | 13 ++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fcfd7e3c..4ba5a992 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, }; /* Note that tracing related programs such as @@ -168,6 +169,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { @@ -2629,6 +2631,42 @@ union bpf_attr { * was provided. * * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a sk. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem(map, &sk)** except this + * helper enforces the key must be a **bpf_fullsock()** + * and the map must be a BPF_MAP_TYPE_SK_STORAGE also. + * + * Underneath, the value is stored locally at *sk* instead of + * the map. The *map* is used as the bpf-local-storage **type**. + * The bpf-local-storage **type** (i.e. the *map*) is searched + * against all bpf-local-storages residing at sk. 
+ * + * An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with BPF_SK_STORAGE_GET_F_CREATE to specify + * the initial value of a bpf-local-storage. If *value* is + * NULL, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * Description + * Delete a bpf-local-storage from a sk. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2737,7 +2775,9 @@ union bpf_attr { FN(sysctl_get_new_value), \ FN(sysctl_set_new_value), \ FN(strtol), \ - FN(strtoul), + FN(strtoul), \ + FN(sk_storage_get), \ + FN(sk_storage_delete), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2813,6 +2853,9 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +/* BPF_FUNC_sk_storage_get flags */ +#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 8c36f63e..18c5b68b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -109,6 +109,7 @@ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 7ee74c34..8b2f993c 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1148,6 +1148,16 @@ enum { #define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) +/* The format for the admin sched (dump only): + * [TCA_TAPRIO_SCHED_ADMIN_SCHED] + * [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] + */ + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1156,6 +1166,9 @@ enum { TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ __TCA_TAPRIO_ATTR_MAX, }; From c865c52365f477eeb7286b3387d18184bfbf08c6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Apr 2019 18:53:57 +0200 Subject: [PATCH 20/22] tc: add support for plug qdisc sch_plug can be used to perform functional qdisc unit 
tests controlling explicitly the queuing behaviour from user-space. Plug support has been lacking since its introduction in 2012. This change introduces basic support, to control the tc status. v1 -> v2: - use the SPDX identifier Signed-off-by: Paolo Abeni Signed-off-by: David Ahern --- tc/Makefile | 1 + tc/q_plug.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 tc/q_plug.c diff --git a/tc/Makefile b/tc/Makefile index 2edaf2c8..1a305cf4 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -75,6 +75,7 @@ TCMODULES += f_matchall.o TCMODULES += q_cbs.o TCMODULES += q_etf.o TCMODULES += q_taprio.o +TCMODULES += q_plug.o TCSO := ifeq ($(TC_CONFIG_ATM),y) diff --git a/tc/q_plug.c b/tc/q_plug.c new file mode 100644 index 00000000..2c1c1a0b --- /dev/null +++ b/tc/q_plug.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * q_plug.c plug scheduler + * + * Copyright (C) 2019 Paolo Abeni + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
plug [block | release | release_indefinite | limit NUMBER]\n"); +} + +static int plug_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n, const char *dev) +{ + struct tc_plug_qopt opt = {}; + int ok = 0; + + while (argc > 0) { + if (strcmp(*argv, "block") == 0) { + opt.action = TCQ_PLUG_BUFFER; + ok++; + } else if (strcmp(*argv, "release") == 0) { + opt.action = TCQ_PLUG_RELEASE_ONE; + ok++; + } else if (strcmp(*argv, "release_indefinite") == 0) { + opt.action = TCQ_PLUG_RELEASE_INDEFINITE; + ok++; + } else if (strcmp(*argv, "limit") == 0) { + opt.action = TCQ_PLUG_LIMIT; + NEXT_ARG(); + if (get_size(&opt.limit, *argv)) { + fprintf(stderr, "Illegal value for \"limit\": \"%s\"\n", *argv); + return -1; + } + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "%s: unknown parameter \"%s\"\n", qu->id, *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (ok) + addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); + return 0; +} + +static int plug_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + /* dummy implementation as sch_plug does not implement a dump op */ + return 0; +} + + +struct qdisc_util plug_qdisc_util = { + .id = "plug", + .parse_qopt = plug_parse_opt, + .print_qopt = plug_print_opt, +}; From 602fae856d80bbaa365fd0421e3f2c2417ea804f Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:52:18 -0700 Subject: [PATCH 21/22] taprio: Add support for changing schedules This allows for a new schedule to be specified during runtime, without removing the current one. For that, the semantics of the 'tc qdisc change' operation in the context of taprio is that if "change" is called and there is a running schedule, a new schedule is created and the base-time (let's call it X) of this new schedule is used so at instant X, it becomes the "current" schedule. 
So, in short, "change" doesn't change the current schedule, it creates a new one and sets it up so that it becomes the current one at some point. In IEEE 802.1Q terms, it means that we have support for the "Oper" (current and read-only) and "Admin" (future and mutable) schedules. Example of creating the first schedule, then adding a new one: (1) tc qdisc add dev IFACE parent root handle 100 taprio \ num_tc 1 \ map 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ queues 1@0 \ sched-entry S 0x1 1000000 \ sched-entry S 0x0 2000000 \ sched-entry S 0x1 3000000 \ sched-entry S 0x0 4000000 \ base-time 100000000 \ clockid CLOCK_TAI (2) tc qdisc change dev IFACE parent root handle 100 taprio \ base-time 7500000000000 \ sched-entry S 0x0 5000000 \ sched-entry S 0x1 5000000 \ It was necessary to fix a bug, so the clockid doesn't need to be specified when changing the schedule. Most of the changes are related to making it easier to reuse the same function for printing the "admin" and "oper" schedules. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David Ahern --- tc/q_taprio.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/tc/q_taprio.c b/tc/q_taprio.c index 8f6b263a..336bb245 100644 --- a/tc/q_taprio.c +++ b/tc/q_taprio.c @@ -268,14 +268,15 @@ static int taprio_parse_opt(struct qdisc_util *qu, int argc, tail = NLMSG_TAIL(n); addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + if (clockid != CLOCKID_INVALID) + addattr_l(n, 1024, TCA_TAPRIO_ATTR_SCHED_CLOCKID, &clockid, sizeof(clockid)); + if (opt.num_tc > 0) addattr_l(n, 1024, TCA_TAPRIO_ATTR_PRIOMAP, &opt, sizeof(opt)); if (base_time) addattr_l(n, 1024, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, &base_time, sizeof(base_time)); - addattr_l(n, 1024, TCA_TAPRIO_ATTR_SCHED_CLOCKID, &clockid, sizeof(clockid)); - if (!list_empty(&sched_entries)) { struct rtattr *entry_list; entry_list = addattr_nest(n, 1024, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST | NLA_F_NESTED); @@ -306,6 +307,8 @@ static int 
print_sched_list(FILE *f, struct rtattr *list) open_json_array(PRINT_JSON, "schedule"); + print_string(PRINT_FP, NULL, "%s", _SL_); + for (item = RTA_DATA(list); RTA_OK(item, rem); item = RTA_NEXT(item, rem)) { struct rtattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1]; __u32 index = 0, gatemask = 0, interval = 0; @@ -340,12 +343,25 @@ static int print_sched_list(FILE *f, struct rtattr *list) return 0; } +static int print_schedule(FILE *f, struct rtattr **tb) +{ + int64_t base_time = 0; + + if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) + base_time = rta_getattr_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + + print_lluint(PRINT_ANY, "base_time", "\tbase-time %lld", base_time); + + print_sched_list(f, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]); + + return 0; +} + static int taprio_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) { struct rtattr *tb[TCA_TAPRIO_ATTR_MAX + 1]; struct tc_mqprio_qopt *qopt = 0; __s32 clockid = CLOCKID_INVALID; - __s64 base_time = 0; int i; if (opt == NULL) @@ -378,19 +394,27 @@ static int taprio_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) print_string(PRINT_FP, NULL, "%s", _SL_); - if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) - base_time = rta_getattr_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); - if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) clockid = rta_getattr_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); print_string(PRINT_ANY, "clockid", "clockid %s", get_clock_name(clockid)); - print_lluint(PRINT_ANY, "base_time", " base-time %lld", base_time); + print_schedule(f, tb); - print_string(PRINT_FP, NULL, "%s", _SL_); + if (tb[TCA_TAPRIO_ATTR_ADMIN_SCHED]) { + struct rtattr *t[TCA_TAPRIO_ATTR_MAX + 1]; - return print_sched_list(f, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]); + parse_rtattr_nested(t, TCA_TAPRIO_ATTR_MAX, + tb[TCA_TAPRIO_ATTR_ADMIN_SCHED]); + + open_json_object(NULL); + + print_schedule(f, t); + + close_json_object(); + } + + return 0; } struct qdisc_util taprio_qdisc_util = { From 92f4b6032e7971d9b0247d7370c08cae2f1c58f9 Mon Sep 17 
00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:52:19 -0700 Subject: [PATCH 22/22] taprio: Add support for cycle_time and cycle_time_extension This allows a cycle-time and a cycle-time-extension to be specified. Specifying a cycle-time will truncate that cycle, so when that instant is reached, the cycle will start from its beginning. A cycle-time-extension may cause the last entry of a cycle, just before the start of a new schedule (the base-time of the "admin" schedule) to be extended by at most "cycle-time-extension" nanoseconds. The idea of this feature, as described by the IEEE 802.1Q, is to avoid too narrow gate states. Example: tc qdisc change dev IFACE parent root handle 100 taprio \ sched-entry S 0x1 1000000 \ sched-entry S 0x0 2000000 \ sched-entry S 0x1 3000000 \ sched-entry S 0x0 4000000 \ cycle-time-extension 100000 \ cycle-time 9000000 \ base-time 12345678900000000 Signed-off-by: Vinicius Costa Gomes Signed-off-by: David Ahern --- tc/q_taprio.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/tc/q_taprio.c b/tc/q_taprio.c index 336bb245..aad055d8 100644 --- a/tc/q_taprio.c +++ b/tc/q_taprio.c @@ -155,8 +155,10 @@ static int taprio_parse_opt(struct qdisc_util *qu, int argc, { __s32 clockid = CLOCKID_INVALID; struct tc_mqprio_qopt opt = { }; + __s64 cycle_time_extension = 0; struct list_head sched_entries; - struct rtattr *tail; + struct rtattr *tail, *l; + __s64 cycle_time = 0; __s64 base_time = 0; int err, idx; @@ -245,6 +247,29 @@ static int taprio_parse_opt(struct qdisc_util *qu, int argc, PREV_ARG(); break; } + } else if (strcmp(*argv, "cycle-time") == 0) { + NEXT_ARG(); + if (cycle_time) { + fprintf(stderr, "taprio: duplicate \"cycle-time\" specification\n"); + return -1; + } + + if (get_s64(&cycle_time, *argv, 10)) { + PREV_ARG(); + break; + } + + } else if (strcmp(*argv, "cycle-time-extension") == 0) { + NEXT_ARG(); + if (cycle_time_extension) { + 
fprintf(stderr, "taprio: duplicate \"cycle-time-extension\" specification\n"); + return -1; + } + + if (get_s64(&cycle_time_extension, *argv, 10)) { + PREV_ARG(); + break; + } } else if (strcmp(*argv, "clockid") == 0) { NEXT_ARG(); if (clockid != CLOCKID_INVALID) { @@ -277,19 +302,24 @@ static int taprio_parse_opt(struct qdisc_util *qu, int argc, if (base_time) addattr_l(n, 1024, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, &base_time, sizeof(base_time)); - if (!list_empty(&sched_entries)) { - struct rtattr *entry_list; - entry_list = addattr_nest(n, 1024, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST | NLA_F_NESTED); + if (cycle_time) + addattr_l(n, 1024, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, + &cycle_time, sizeof(cycle_time)); - err = add_sched_list(&sched_entries, n); - if (err < 0) { - fprintf(stderr, "Could not add schedule to netlink message\n"); - return -1; - } + if (cycle_time_extension) + addattr_l(n, 1024, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, + &cycle_time_extension, sizeof(cycle_time_extension)); - addattr_nest_end(n, entry_list); + l = addattr_nest(n, 1024, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST | NLA_F_NESTED); + + err = add_sched_list(&sched_entries, n); + if (err < 0) { + fprintf(stderr, "Could not add schedule to netlink message\n"); + return -1; } + addattr_nest_end(n, l); + tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail; return 0; @@ -345,13 +375,25 @@ static int print_sched_list(FILE *f, struct rtattr *list) static int print_schedule(FILE *f, struct rtattr **tb) { - int64_t base_time = 0; + int64_t base_time = 0, cycle_time = 0, cycle_time_extension = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) base_time = rta_getattr_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) + cycle_time = rta_getattr_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); + + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) + cycle_time_extension = rta_getattr_s64( + tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); + print_lluint(PRINT_ANY, "base_time", 
"\tbase-time %lld", base_time); + print_lluint(PRINT_ANY, "cycle_time", " cycle-time %lld", cycle_time); + + print_lluint(PRINT_ANY, "cycle_time_extension", + " cycle-time-extension %lld", cycle_time_extension); + print_sched_list(f, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]); return 0;