aboutsummaryrefslogtreecommitdiffstats
path: root/samples
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-07-04 19:48:21 (GMT)
committerDavid S. Miller <davem@davemloft.net>2019-07-04 19:48:21 (GMT)
commitc4cde5804d512a2f8934017dbf7df642dfbdf2ad (patch)
tree1ad14ebc7b8fe872b6a216c7e498bd5e464ef342 /samples
parente2c746944e26609f63661cedb7c7c31f0578c58f (diff)
parente5a3e259ef239f443951d401db10db7d426c9497 (diff)
downloadkernel_replicant_linux-c4cde5804d512a2f8934017dbf7df642dfbdf2ad.zip
kernel_replicant_linux-c4cde5804d512a2f8934017dbf7df642dfbdf2ad.tar.gz
kernel_replicant_linux-c4cde5804d512a2f8934017dbf7df642dfbdf2ad.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says: ==================== pull-request: bpf-next 2019-07-03 The following pull-request contains BPF updates for your *net-next* tree. There is a minor merge conflict in mlx5 due to 8960b38932be ("linux/dim: Rename externally used net_dim members") which has been pulled into your tree in the meantime, but resolution seems not that bad ... getting current bpf-next out now before there's coming more on mlx5. ;) I'm Cc'ing Saeed just so he's aware of the resolution below: ** First conflict in drivers/net/ethernet/mellanox/mlx5/core/en_main.c: <<<<<<< HEAD static int mlx5e_open_cq(struct mlx5e_channel *c, struct dim_cq_moder moder, struct mlx5e_cq_param *param, struct mlx5e_cq *cq) ======= int mlx5e_open_cq(struct mlx5e_channel *c, struct net_dim_cq_moder moder, struct mlx5e_cq_param *param, struct mlx5e_cq *cq) >>>>>>> e5a3e259ef239f443951d401db10db7d426c9497 Resolution is to take the second chunk and rename net_dim_cq_moder into dim_cq_moder. Also the signature for mlx5e_open_cq() in ... drivers/net/ethernet/mellanox/mlx5/core/en.h +977 ... and in mlx5e_open_xsk() ... drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +64 ... needs the same rename from net_dim_cq_moder into dim_cq_moder. ** Second conflict in drivers/net/ethernet/mellanox/mlx5/core/en_main.c: <<<<<<< HEAD int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, ix)); struct dim_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; struct mlx5e_channel *c; unsigned int irq; ======= struct net_dim_cq_moder icocq_moder = {0, 0}; >>>>>>> e5a3e259ef239f443951d401db10db7d426c9497 Take the second chunk and rename net_dim_cq_moder into dim_cq_moder as well. Let me know if you run into any issues. Anyway, the main changes are: 1) Long-awaited AF_XDP support for mlx5e driver, from Maxim. 2) Addition of two new per-cgroup BPF hooks for getsockopt and setsockopt along with a new sockopt program type which allows more fine-grained pass/reject settings for containers. Also add a sock_ops callback that can be selectively enabled on a per-socket basis and is executed for every RTT to help tracking TCP statistics, both features from Stanislav. 3) Follow-up fix from loops in precision tracking which was not propagating precision marks and as a result verifier assumed that some branches were not taken and therefore wrongly removed as dead code, from Alexei. 4) Fix BPF cgroup release synchronization race which could lead to a double-free if a leaf's cgroup_bpf object is released and a new BPF program is attached to the one of ancestor cgroups in parallel, from Roman. 5) Support for bulking XDP_TX on veth devices which improves performance in some cases by around 9%, from Toshiaki. 6) Allow for lookups into BPF devmap and improve feedback when calling into bpf_redirect_map() as lookup is now performed right away in the helper itself, from Toke. 7) Add support for fq's Earliest Departure Time to the Host Bandwidth Manager (HBM) sample BPF program, from Lawrence. 8) Various cleanups and minor fixes all over the place from many others. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'samples')
-rw-r--r--samples/bpf/Makefile3
-rwxr-xr-xsamples/bpf/do_hbm_test.sh22
-rw-r--r--samples/bpf/hbm.c18
-rw-r--r--samples/bpf/hbm_edt_kern.c168
-rw-r--r--samples/bpf/hbm_kern.h40
-rw-r--r--samples/bpf/ibumad_kern.c18
-rw-r--r--samples/bpf/tcp_bpf.readme2
-rw-r--r--samples/bpf/tcp_dumpstats_kern.c68
-rw-r--r--samples/bpf/xdp_adjust_tail_user.c12
-rw-r--r--samples/bpf/xdp_redirect_map_user.c15
-rw-r--r--samples/bpf/xdp_redirect_user.c15
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c12
-rw-r--r--samples/bpf/xdpsock_user.c44
13 files changed, 377 insertions, 60 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0917f8c..f90daad 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -154,6 +154,7 @@ always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
+always += tcp_dumpstats_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
@@ -168,6 +169,7 @@ always += task_fd_query_kern.o
always += xdp_sample_pkts_kern.o
always += ibumad_kern.o
always += hbm_out_kern.o
+always += hbm_edt_kern.o
KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
@@ -272,6 +274,7 @@ $(src)/*.c: verify_target_bpf $(LIBBPF)
$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
$(obj)/hbm.o: $(src)/hbm.h
+$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
index e48b047..ffe4c06 100755
--- a/samples/bpf/do_hbm_test.sh
+++ b/samples/bpf/do_hbm_test.sh
@@ -14,7 +14,7 @@ Usage() {
echo "loads. The output is the goodput in Mbps (unless -D was used)."
echo ""
echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
- echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+ echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]"
echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
@@ -30,6 +30,7 @@ Usage() {
echo " other detailed information. This information is"
echo " test dependent (i.e. iperf3 or netperf)."
echo " -E enable ECN (not required for dctcp)"
+ echo " --edt use fq's Earliest Departure Time (requires fq)"
echo " -f or --flows number of concurrent flows (default=1)"
echo " -i or --id cgroup id (an integer, default is 1)"
echo " -N use netperf instead of iperf3"
@@ -130,13 +131,12 @@ processArgs () {
details=1
;;
-E)
- ecn=1
+ ecn=1
+ ;;
+ --edt)
+ flags="$flags --edt"
+ qdisc="fq"
;;
- # Support for upcomming fq Early Departure Time egress rate limiting
- #--edt)
- # prog="hbm_out_edt_kern.o"
- # qdisc="fq"
- # ;;
-f=*|--flows=*)
flows="${i#*=}"
;;
@@ -228,8 +228,8 @@ if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
- tc qdisc del dev lo root > /dev/null 2>&1
- tc qdisc add dev lo root $qdisc > /dev/null 2>&1
+ tc qdisc del dev eth0 root > /dev/null 2>&1
+ tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1
fi
n=0
@@ -399,7 +399,9 @@ fi
if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
fi
-
+if [ "$qdisc" != "" ] ; then
+ tc qdisc del dev eth0 root > /dev/null 2>&1
+fi
sleep 2
hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index b905b32..e0fbab9 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -62,6 +62,7 @@ bool loopback_flag;
bool debugFlag;
bool work_conserving_flag;
bool no_cn_flag;
+bool edt_flag;
static void Usage(void);
static void read_trace_pipe2(void);
@@ -372,9 +373,14 @@ static int run_bpf_prog(char *prog, int cg_id)
fprintf(fout, "avg rtt:%d\n",
(int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
// Average credit
- fprintf(fout, "avg credit:%d\n",
- (int)(qstats.sum_credit /
- (1500 * ((int)qstats.pkts_total) + 1)));
+ if (edt_flag)
+ fprintf(fout, "avg credit_ms:%.03f\n",
+ (qstats.sum_credit /
+ (qstats.pkts_total + 1.0)) / 1000000.0);
+ else
+ fprintf(fout, "avg credit:%d\n",
+ (int)(qstats.sum_credit /
+ (1500 * ((int)qstats.pkts_total ) + 1)));
// Return values stats
for (k = 0; k < RET_VAL_COUNT; k++) {
@@ -408,6 +414,7 @@ static void Usage(void)
" Where:\n"
" -o indicates egress direction (default)\n"
" -d print BPF trace debug buffer\n"
+ " --edt use fq's Earliest Departure Time\n"
" -l also limit flows using loopback\n"
" -n <#> to create cgroup \"/hbm#\" and attach prog\n"
" Default is /hbm1\n"
@@ -433,6 +440,7 @@ int main(int argc, char **argv)
char *optstring = "iodln:r:st:wh";
struct option loptions[] = {
{"no_cn", 0, NULL, 1},
+ {"edt", 0, NULL, 2},
{NULL, 0, NULL, 0}
};
@@ -441,6 +449,10 @@ int main(int argc, char **argv)
case 1:
no_cn_flag = true;
break;
+ case 2:
+ prog = "hbm_edt_kern.o";
+ edt_flag = true;
+ break;
case'o':
break;
case 'd':
diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c
new file mode 100644
index 0000000..a65b677
--- /dev/null
+++ b/samples/bpf/hbm_edt_kern.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it. However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * by a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * a user program, such as hbm, can access.
+ */
+
+#include "hbm_kern.h"
+
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+ long long delta = 0, delta_send;
+ unsigned long long curtime, sendtime;
+ struct hbm_queue_stats *qsp = NULL;
+ unsigned int queue_index = 0;
+ bool congestion_flag = false;
+ bool ecn_ce_flag = false;
+ struct hbm_pkt_info pkti = {};
+ struct hbm_vqueue *qdp;
+ bool drop_flag = false;
+ bool cwr_flag = false;
+ int len = skb->len;
+ int rv = ALLOW_PKT;
+
+ qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+
+ // Check if we should ignore loopback traffic
+ if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+ return ALLOW_PKT;
+
+ hbm_get_pkt_info(skb, &pkti);
+
+ // We may want to account for the length of headers in len
+ // calculation, like ETH header + overhead, specially if it
+ // is a gso packet. But I am not doing it right now.
+
+ qdp = bpf_get_local_storage(&queue_state, 0);
+ if (!qdp)
+ return ALLOW_PKT;
+ if (qdp->lasttime == 0)
+ hbm_init_edt_vqueue(qdp, 1024);
+
+ curtime = bpf_ktime_get_ns();
+
+ // Begin critical section
+ bpf_spin_lock(&qdp->lock);
+ delta = qdp->lasttime - curtime;
+ // bound bursts to 100us
+ if (delta < -BURST_SIZE_NS) {
+ // negative delta is a credit that allows bursts
+ qdp->lasttime = curtime - BURST_SIZE_NS;
+ delta = -BURST_SIZE_NS;
+ }
+ sendtime = qdp->lasttime;
+ delta_send = BYTES_TO_NS(len, qdp->rate);
+ __sync_add_and_fetch(&(qdp->lasttime), delta_send);
+ bpf_spin_unlock(&qdp->lock);
+ // End critical section
+
+ // Set EDT of packet
+ skb->tstamp = sendtime;
+
+ // Check if we should update rate
+ if (qsp != NULL && (qsp->rate * 128) != qdp->rate)
+ qdp->rate = qsp->rate * 128;
+
+ // Set flags (drop, congestion, cwr)
+ // last packet will be sent in the future, bound latency
+ if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS &&
+ len > LARGE_PKT_THRESH)) {
+ drop_flag = true;
+ if (pkti.is_tcp && pkti.ecn == 0)
+ cwr_flag = true;
+ } else if (delta > MARK_THRESH_NS) {
+ if (pkti.is_tcp)
+ congestion_flag = true;
+ else
+ drop_flag = true;
+ }
+
+ if (congestion_flag) {
+ if (bpf_skb_ecn_set_ce(skb)) {
+ ecn_ce_flag = true;
+ } else {
+ if (pkti.is_tcp) {
+ unsigned int rand = bpf_get_prandom_u32();
+
+ if (delta >= MARK_THRESH_NS +
+ (rand % MARK_REGION_SIZE_NS)) {
+ // Do congestion control
+ cwr_flag = true;
+ }
+ } else if (len > LARGE_PKT_THRESH) {
+ // Problem if too many small packets?
+ drop_flag = true;
+ congestion_flag = false;
+ }
+ }
+ }
+
+ if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) {
+ drop_flag = false;
+ cwr_flag = true;
+ congestion_flag = false;
+ }
+
+ if (qsp != NULL && qsp->no_cn)
+ cwr_flag = false;
+
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+ cwr_flag, ecn_ce_flag, &pkti, (int) delta);
+
+ if (drop_flag) {
+ __sync_add_and_fetch(&(qdp->lasttime), -delta_send);
+ rv = DROP_PKT;
+ }
+
+ if (cwr_flag)
+ rv |= CWR;
+ return rv;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
index be19cf1..aa207a2 100644
--- a/samples/bpf/hbm_kern.h
+++ b/samples/bpf/hbm_kern.h
@@ -29,6 +29,7 @@
#define DROP_PKT 0
#define ALLOW_PKT 1
#define TCP_ECN_OK 1
+#define CWR 2
#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging
#undef bpf_printk
@@ -45,8 +46,18 @@
#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET)
#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
+// Time base accounting for fq's EDT
+#define BURST_SIZE_NS 100000 // 100us
+#define MARK_THRESH_NS 50000 // 50us
+#define DROP_THRESH_NS 500000 // 500us
+// Reserve 20us of queuing for small packets (less than 120 bytes)
+#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000)
+#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS)
+
// rate in bytes per ns << 20
#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate))
struct bpf_map_def SEC("maps") queue_state = {
.type = BPF_MAP_TYPE_CGROUP_STORAGE,
@@ -67,6 +78,7 @@ BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
struct hbm_pkt_info {
int cwnd;
int rtt;
+ int packets_out;
bool is_ip;
bool is_tcp;
short ecn;
@@ -86,16 +98,20 @@ static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
if (tp) {
pkti->cwnd = tp->snd_cwnd;
pkti->rtt = tp->srtt_us >> 3;
+ pkti->packets_out = tp->packets_out;
return 0;
}
}
}
}
+ pkti->cwnd = 0;
+ pkti->rtt = 0;
+ pkti->packets_out = 0;
return 1;
}
-static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
- struct hbm_pkt_info *pkti)
+static void hbm_get_pkt_info(struct __sk_buff *skb,
+ struct hbm_pkt_info *pkti)
{
struct iphdr iph;
struct ipv6hdr *ip6h;
@@ -123,10 +139,22 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
{
- bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
- qdp->lasttime = bpf_ktime_get_ns();
- qdp->credit = INIT_CREDIT;
- qdp->rate = rate * 128;
+ bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
+ qdp->lasttime = bpf_ktime_get_ns();
+ qdp->credit = INIT_CREDIT;
+ qdp->rate = rate * 128;
+}
+
+static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp,
+ int rate)
+{
+ unsigned long long curtime;
+
+ curtime = bpf_ktime_get_ns();
+ bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
+ qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst
+ qdp->credit = 0; // not used
+ qdp->rate = rate * 128;
}
static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
index 38b2b3f..f281df7 100644
--- a/samples/bpf/ibumad_kern.c
+++ b/samples/bpf/ibumad_kern.c
@@ -31,15 +31,9 @@ struct bpf_map_def SEC("maps") write_count = {
};
#undef DEBUG
-#ifdef DEBUG
-#define bpf_debug(fmt, ...) \
-({ \
- char ____fmt[] = fmt; \
- bpf_trace_printk(____fmt, sizeof(____fmt), \
- ##__VA_ARGS__); \
-})
-#else
-#define bpf_debug(fmt, ...)
+#ifndef DEBUG
+#undef bpf_printk
+#define bpf_printk(fmt, ...)
#endif
/* Taken from the current format defined in
@@ -86,7 +80,7 @@ int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx)
u64 zero = 0, *val;
u8 class = ctx->mgmt_class;
- bpf_debug("ib_umad read recv : class 0x%x\n", class);
+ bpf_printk("ib_umad read recv : class 0x%x\n", class);
val = bpf_map_lookup_elem(&read_count, &class);
if (!val) {
@@ -106,7 +100,7 @@ int on_ib_umad_read_send(struct ib_umad_rw_args *ctx)
u64 zero = 0, *val;
u8 class = ctx->mgmt_class;
- bpf_debug("ib_umad read send : class 0x%x\n", class);
+ bpf_printk("ib_umad read send : class 0x%x\n", class);
val = bpf_map_lookup_elem(&read_count, &class);
if (!val) {
@@ -126,7 +120,7 @@ int on_ib_umad_write(struct ib_umad_rw_args *ctx)
u64 zero = 0, *val;
u8 class = ctx->mgmt_class;
- bpf_debug("ib_umad write : class 0x%x\n", class);
+ bpf_printk("ib_umad write : class 0x%x\n", class);
val = bpf_map_lookup_elem(&write_count, &class);
if (!val) {
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
index fee7466..78e247f 100644
--- a/samples/bpf/tcp_bpf.readme
+++ b/samples/bpf/tcp_bpf.readme
@@ -25,4 +25,4 @@ attached to the cgroupv2).
To remove (unattach) a socket_ops BPF program from a cgroupv2:
- bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
+ bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c
new file mode 100644
index 0000000..8557913
--- /dev/null
+++ b/samples/bpf/tcp_dumpstats_kern.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Refer to samples/bpf/tcp_bpf.readme for the instructions on
+ * how to run this sample program.
+ */
+#include <linux/bpf.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define INTERVAL 1000000000ULL
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+struct {
+ __u32 type;
+ __u32 map_flags;
+ int *key;
+ __u64 *value;
+} bpf_next_dump SEC(".maps") = {
+ .type = BPF_MAP_TYPE_SK_STORAGE,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+SEC("sockops")
+int _sockops(struct bpf_sock_ops *ctx)
+{
+ struct bpf_tcp_sock *tcp_sk;
+ struct bpf_sock *sk;
+ __u64 *next_dump;
+ __u64 now;
+
+ switch (ctx->op) {
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
+ return 1;
+ case BPF_SOCK_OPS_RTT_CB:
+ break;
+ default:
+ return 1;
+ }
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!next_dump)
+ return 1;
+
+ now = bpf_ktime_get_ns();
+ if (now < *next_dump)
+ return 1;
+
+ tcp_sk = bpf_tcp_sock(sk);
+ if (!tcp_sk)
+ return 1;
+
+ *next_dump = now + INTERVAL;
+
+ bpf_printk("dsack_dups=%u delivered=%u\n",
+ tcp_sk->dsack_dups, tcp_sk->delivered);
+ bpf_printk("delivered_ce=%u icsk_retransmits=%u\n",
+ tcp_sk->delivered_ce, tcp_sk->icsk_retransmits);
+
+ return 1;
+}
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
index 586ff75..a3596b6 100644
--- a/samples/bpf/xdp_adjust_tail_user.c
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <net/if.h>
#include <sys/resource.h>
#include <arpa/inet.h>
#include <netinet/ether.h>
@@ -69,7 +70,7 @@ static void usage(const char *cmd)
printf("Start a XDP prog which send ICMP \"packet too big\" \n"
"messages if ingress packet is bigger then MAX_SIZE bytes\n");
printf("Usage: %s [...]\n", cmd);
- printf(" -i <ifindex> Interface Index\n");
+ printf(" -i <ifname|ifindex> Interface\n");
printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
printf(" -S use skb-mode\n");
printf(" -N enforce native mode\n");
@@ -102,7 +103,9 @@ int main(int argc, char **argv)
switch (opt) {
case 'i':
- ifindex = atoi(optarg);
+ ifindex = if_nametoindex(optarg);
+ if (!ifindex)
+ ifindex = atoi(optarg);
break;
case 'T':
kill_after_s = atoi(optarg);
@@ -136,6 +139,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (!ifindex) {
+ fprintf(stderr, "Invalid ifname\n");
+ return 1;
+ }
+
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
prog_load_attr.file = filename;
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 15bb6f6..f70ee33 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
+#include <net/if.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/resource.h>
@@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex)
static void usage(const char *prog)
{
fprintf(stderr,
- "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n"
+ "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
"OPTS:\n"
" -S use skb-mode\n"
" -N enforce native mode\n"
@@ -127,7 +128,7 @@ int main(int argc, char **argv)
}
if (optind == argc) {
- printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]);
+ printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
return 1;
}
@@ -136,8 +137,14 @@ int main(int argc, char **argv)
return 1;
}
- ifindex_in = strtoul(argv[optind], NULL, 0);
- ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+ ifindex_in = if_nametoindex(argv[optind]);
+ if (!ifindex_in)
+ ifindex_in = strtoul(argv[optind], NULL, 0);
+
+ ifindex_out = if_nametoindex(argv[optind + 1]);
+ if (!ifindex_out)
+ ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index ce71be1..39de06f 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
+#include <net/if.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/resource.h>
@@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex)
static void usage(const char *prog)
{
fprintf(stderr,
- "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n"
+ "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
"OPTS:\n"
" -S use skb-mode\n"
" -N enforce native mode\n"
@@ -128,7 +129,7 @@ int main(int argc, char **argv)
}
if (optind == argc) {
- printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]);
+ printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
return 1;
}
@@ -137,8 +138,14 @@ int main(int argc, char **argv)
return 1;
}
- ifindex_in = strtoul(argv[optind], NULL, 0);
- ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+ ifindex_in = if_nametoindex(argv[optind]);
+ if (!ifindex_in)
+ ifindex_in = strtoul(argv[optind], NULL, 0);
+
+ ifindex_out = if_nametoindex(argv[optind + 1]);
+ if (!ifindex_out)
+ ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 3948964..dfb6858 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -9,6 +9,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <net/if.h>
#include <sys/resource.h>
#include <arpa/inet.h>
#include <netinet/ether.h>
@@ -83,7 +84,7 @@ static void usage(const char *cmd)
"in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
"is used to select packets to encapsulate\n\n");
printf("Usage: %s [...]\n", cmd);
- printf(" -i <ifindex> Interface Index\n");
+ printf(" -i <ifname|ifindex> Interface\n");
printf(" -a <vip-service-address> IPv4 or IPv6\n");
printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n");
printf(" -s <source-ip> Used in the IPTunnel header\n");
@@ -181,7 +182,9 @@ int main(int argc, char **argv)
switch (opt) {
case 'i':
- ifindex = atoi(optarg);
+ ifindex = if_nametoindex(optarg);
+ if (!ifindex)
+ ifindex = atoi(optarg);
break;
case 'a':
vip.family = parse_ipstr(optarg, vip.daddr.v6);
@@ -253,6 +256,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (!ifindex) {
+ fprintf(stderr, "Invalid ifname\n");
+ return 1;
+ }
+
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
prog_load_attr.file = filename;
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 0f5eb0d..93eaaf7 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -68,6 +68,7 @@ static int opt_queue;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags;
+static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static __u32 prog_id;
struct xsk_umem_info {
@@ -276,6 +277,12 @@ static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
{
struct xsk_umem_info *umem;
+ struct xsk_umem_config cfg = {
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .frame_size = opt_xsk_frame_size,
+ .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
+ };
int ret;
umem = calloc(1, sizeof(*umem));
@@ -283,7 +290,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
exit_with_error(errno);
ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
- NULL);
+ &cfg);
if (ret)
exit_with_error(-ret);
@@ -323,11 +330,9 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem)
&idx);
if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
exit_with_error(-ret);
- for (i = 0;
- i < XSK_RING_PROD__DEFAULT_NUM_DESCS *
- XSK_UMEM__DEFAULT_FRAME_SIZE;
- i += XSK_UMEM__DEFAULT_FRAME_SIZE)
- *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i;
+ for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) =
+ i * opt_xsk_frame_size;
xsk_ring_prod__submit(&xsk->umem->fq,
XSK_RING_PROD__DEFAULT_NUM_DESCS);
@@ -346,6 +351,7 @@ static struct option long_options[] = {
{"interval", required_argument, 0, 'n'},
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
+ {"frame-size", required_argument, 0, 'f'},
{0, 0, 0, 0}
};
@@ -365,8 +371,9 @@ static void usage(const char *prog)
" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
+ " -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n"
"\n";
- fprintf(stderr, str, prog);
+ fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
exit(EXIT_FAILURE);
}
@@ -377,7 +384,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
- c = getopt_long(argc, argv, "Frtli:q:psSNn:cz", long_options,
+ c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
&option_index);
if (c == -1)
break;
@@ -420,6 +427,9 @@ static void parse_command_line(int argc, char **argv)
case 'F':
opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
+ case 'f':
+ opt_xsk_frame_size = atoi(optarg);
+ break;
default:
usage(basename(argv[0]));
}
@@ -432,6 +442,11 @@ static void parse_command_line(int argc, char **argv)
usage(basename(argv[0]));
}
+ if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
+ fprintf(stderr, "--frame-size=%d is not a power of two\n",
+ opt_xsk_frame_size);
+ usage(basename(argv[0]));
+ }
}
static void kick_tx(struct xsk_socket_info *xsk)
@@ -583,8 +598,7 @@ static void tx_only(struct xsk_socket_info *xsk)
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
- = (frame_nb + i) <<
- XSK_UMEM__DEFAULT_FRAME_SHIFT;
+ = (frame_nb + i) * opt_xsk_frame_size;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
@@ -661,21 +675,19 @@ int main(int argc, char **argv)
}
ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
- NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ NUM_FRAMES * opt_xsk_frame_size);
if (ret)
exit_with_error(ret);
/* Create sockets... */
- umem = xsk_configure_umem(bufs,
- NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
xsks[num_socks++] = xsk_configure_socket(umem);
if (opt_bench == BENCH_TXONLY) {
int i;
- for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
- i += XSK_UMEM__DEFAULT_FRAME_SIZE)
- (void)gen_eth_frame(umem, i);
+ for (i = 0; i < NUM_FRAMES; i++)
+ (void)gen_eth_frame(umem, i * opt_xsk_frame_size);
}
signal(SIGINT, int_exit);