From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:14 -0700 Subject: tun: enable NAPI for TUN/TAP driver Changes TUN driver to use napi_gro_receive() upon receiving packets rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these changes and operation is not affected if the flag is disabled. SKBs are constructed upon packet arrival and are queued to be processed later. The new path was evaluated with a benchmark with the following setup: Open two tap devices and a receiver thread that reads in a loop for each device. Start one sender thread and pin all threads to different CPUs. Send 1M minimum UDP packets to each device and measure sending time for each of the sending methods: napi_gro_receive(): 4.90s netif_rx_ni(): 4.90s netif_receive_skb(): 7.20s Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/tun.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 118 insertions(+), 15 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3c9985f29950..f16407242b18 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -121,7 +121,7 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE) + IFF_MULTI_QUEUE | IFF_NAPI) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -172,6 +172,7 @@ struct tun_file { u16 queue_index; unsigned int ifindex; }; + struct napi_struct napi; struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -229,6 +230,68 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; }; +static int tun_napi_receive(struct napi_struct *napi, int budget) +{ + struct tun_file *tfile = container_of(napi, struct tun_file, napi); + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; + struct sk_buff_head process_queue; + struct sk_buff *skb; + int received = 0; + + __skb_queue_head_init(&process_queue); + + spin_lock(&queue->lock); + skb_queue_splice_tail_init(queue, &process_queue); + spin_unlock(&queue->lock); + + while (received < budget && (skb = __skb_dequeue(&process_queue))) { + napi_gro_receive(napi, skb); + ++received; + } + + if (!skb_queue_empty(&process_queue)) { + spin_lock(&queue->lock); + skb_queue_splice(&process_queue, queue); + spin_unlock(&queue->lock); + } + + return received; +} + +static int tun_napi_poll(struct napi_struct *napi, int budget) +{ + unsigned int received; + + received = tun_napi_receive(napi, budget); + + if (received < budget) + napi_complete_done(napi, received); + + return received; +} + +static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, + bool napi_en) +{ + if (napi_en) { + netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(&tfile->napi); + } +} + +static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) +{ + if (tun->flags & IFF_NAPI) + napi_disable(&tfile->napi); +} + +static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) +{ + if (tun->flags & IFF_NAPI) + netif_napi_del(&tfile->napi); +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -541,6 +604,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); + if (tun && clean) { + tun_napi_disable(tun, tfile); + tun_napi_del(tun, tfile); + } + if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); @@ -598,6 +666,7 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); + tun_napi_disable(tun, tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); @@ -613,6 +682,7 @@ static void tun_detach_all(struct net_device *dev) synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); + tun_napi_del(tun, tfile); /* Drop read queue */ tun_queue_purge(tfile); sock_put(&tfile->sk); @@ -631,7 +701,8 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) +static int tun_attach(struct tun_struct *tun, struct file *file, + bool skip_filter, bool napi) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; @@ -677,10 +748,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; - if (tfile->detached) + if (tfile->detached) { tun_enable_queue(tfile); - else + } else { sock_hold(&tfile->sk); + tun_napi_init(tun, tfile, napi); + } tun_set_real_num_queues(tun); @@ -956,13 +1029,28 @@ static void tun_poll_controller(struct net_device *dev) * Tun only receives frames when: * 1) the char device endpoint gets data from user space * 2) the tun socket gets a sendmsg call from user space - * Since both of those are synchronous operations, we are guaranteed - * never to have pending data when we poll for it - * so there is nothing to do here but return. + * If NAPI is not enabled, since both of those are synchronous + * operations, we are guaranteed never to have pending data when we poll + * for it so there is nothing to do here but return. * We need this though so netpoll recognizes us as an interface that * supports polling, which enables bridge devices in virt setups to * still use netconsole + * If NAPI is enabled, however, we need to schedule polling for all + * queues. */ + struct tun_struct *tun = netdev_priv(dev); + + if (tun->flags & IFF_NAPI) { + struct tun_file *tfile; + int i; + + rcu_read_lock(); + for (i = 0; i < tun->numqueues; i++) { + tfile = rcu_dereference(tun->tfiles[i]); + napi_schedule(&tfile->napi); + } + rcu_read_unlock(); + } return; } #endif @@ -1549,11 +1637,25 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } rxhash = __skb_get_hash_symmetric(skb); -#ifndef CONFIG_4KSTACKS - tun_rx_batched(tun, tfile, skb, more); -#else - netif_rx_ni(skb); -#endif + + if (tun->flags & IFF_NAPI) { + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; + int queue_len; + + spin_lock_bh(&queue->lock); + __skb_queue_tail(queue, skb); + queue_len = skb_queue_len(queue); + spin_unlock(&queue->lock); + + if (!more || queue_len > NAPI_POLL_WEIGHT) + napi_schedule(&tfile->napi); + + local_bh_enable(); + } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { + tun_rx_batched(tun, tfile, skb, more); + } else { + netif_rx_ni(skb); + } stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); @@ -1980,7 +2082,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, + ifr->ifr_flags & IFF_NAPI); if (err < 0) return err; @@ -2066,7 +2169,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) NETIF_F_HW_VLAN_STAG_TX); INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false); + err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); if (err < 0) goto err_free_flow; @@ -2216,7 +2319,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file, false); + ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) -- cgit v1.2.3 From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:15 -0700 Subject: tun: enable napi_gro_frags() for TUN/TAP driver Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. This flag is accepted only if the device is in TAP mode and has the IFF_NAPI flag set as well. This is done because both of these are explicit requirements for correct operation in this mode. Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/tun.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 6 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f16407242b18..9880b3bc8fa5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -121,7 +122,8 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE | IFF_NAPI) + IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) + #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -173,6 +175,7 @@ struct tun_file { unsigned int ifindex; }; struct napi_struct napi; + struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); napi_enable(&tfile->napi); + mutex_init(&tfile->napi_mutex); } } @@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) netif_napi_del(&tfile->napi); } +static bool tun_napi_frags_enabled(const struct tun_struct *tun) +{ + return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev) * supports polling, which enables bridge devices in virt setups to * still use netconsole * If NAPI is enabled, however, we need to schedule polling for all - * queues. + * queues unless we are using napi_gro_frags(), which we call in + * process context and not in NAPI context. */ struct tun_struct *tun = netdev_priv(dev); @@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev) struct tun_file *tfile; int i; + if (tun_napi_frags_enabled(tun)) + return; + rcu_read_lock(); for (i = 0; i < tun->numqueues; i++) { tfile = rcu_dereference(tun->tfiles[i]); @@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) return mask; } +static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, + size_t len, + const struct iov_iter *it) +{ + struct sk_buff *skb; + size_t linear; + int err; + int i; + + if (it->nr_segs > MAX_SKB_FRAGS + 1) + return ERR_PTR(-ENOMEM); + + local_bh_disable(); + skb = napi_get_frags(&tfile->napi); + local_bh_enable(); + if (!skb) + return ERR_PTR(-ENOMEM); + + linear = iov_iter_single_seg_count(it); + err = __skb_grow(skb, linear); + if (err) + goto free; + + skb->len = len; + skb->data_len = len - linear; + skb->truesize += skb->data_len; + + for (i = 1; i < it->nr_segs; i++) { + size_t fragsz = it->iov[i].iov_len; + unsigned long offset; + struct page *page; + void *data; + + if (fragsz == 0 || fragsz > PAGE_SIZE) { + err = -EINVAL; + goto free; + } + + local_bh_disable(); + data = napi_alloc_frag(fragsz); + local_bh_enable(); + if (!data) { + err = -ENOMEM; + goto free; + } + + page = virt_to_head_page(data); + offset = data - page_address(page); + skb_fill_page_desc(skb, i - 1, page, offset, fragsz); + } + + return skb; +free: + /* frees skb and all frags allocated with napi_alloc_frag() */ + napi_free_frags(&tfile->napi); + return ERR_PTR(err); +} + /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, @@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int err; u32 rxhash; int skb_xdp = 1; + bool frags = tun_napi_frags_enabled(tun); if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, zerocopy = true; } - if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { + if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. @@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, linear = tun16_to_cpu(tun, gso.hdr_len); } - skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (frags) { + mutex_lock(&tfile->napi_mutex); + skb = tun_napi_alloc_frags(tfile, copylen, from); + /* tun_napi_alloc_frags() enforces a layout for the skb. + * If zerocopy is enabled, then this layout will be + * overwritten by zerocopy_sg_from_iter(). + */ + zerocopy = false; + } else { + skb = tun_alloc_skb(tfile, align, copylen, linear, + noblock); + } + if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) this_cpu_inc(tun->pcpu_stats->rx_dropped); + if (frags) + mutex_unlock(&tfile->napi_mutex); return PTR_ERR(skb); } @@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (err) { this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EFAULT; } } @@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { this_cpu_inc(tun->pcpu_stats->rx_frame_errors); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EINVAL; } @@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb->dev = tun->dev; break; case IFF_TAP: - skb->protocol = eth_type_trans(skb, tun->dev); + if (!frags) + skb->protocol = eth_type_trans(skb, tun->dev); break; } @@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rxhash = __skb_get_hash_symmetric(skb); - if (tun->flags & IFF_NAPI) { + if (frags) { + /* Exercise flow dissector code path. */ + u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); + + if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) { + this_cpu_inc(tun->pcpu_stats->rx_dropped); + napi_free_frags(&tfile->napi); + mutex_unlock(&tfile->napi_mutex); + WARN_ON(1); + return -ENOMEM; + } + + local_bh_disable(); + napi_gro_frags(&tfile->napi); + local_bh_enable(); + mutex_unlock(&tfile->napi_mutex); + } else if (tun->flags & IFF_NAPI) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; @@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (tfile->detached) return -EINVAL; + if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!(ifr->ifr_flags & IFF_NAPI) || + (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) + return -EINVAL; + } + dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) -- cgit v1.2.3 From 9484dc74fcf0750cd6726c9aa27edf97223916a8 Mon Sep 17 00:00:00 2001 From: yuan linyu Date: Sat, 23 Sep 2017 22:36:52 +0800 Subject: tun: delete original tun_get() and rename __tun_get() to tun_get() it seems no need to keep tun_get() and __tun_get() at same time. Signed-off-by: yuan linyu Signed-off-by: David S. Miller --- drivers/net/tun.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9880b3bc8fa5..2c36f6ebad79 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -774,7 +774,7 @@ out: return err; } -static struct tun_struct *__tun_get(struct tun_file *tfile) +static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; @@ -787,11 +787,6 @@ static struct tun_struct *__tun_get(struct tun_file *tfile) return tun; } -static struct tun_struct *tun_get(struct file *file) -{ - return __tun_get(file->private_data); -} - static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); @@ -1250,7 +1245,7 @@ static void tun_net_init(struct net_device *dev) static unsigned int tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); struct sock *sk; unsigned int mask = 0; @@ -1784,8 +1779,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct tun_struct *tun = tun_get(file); struct tun_file *tfile = file->private_data; + struct tun_struct *tun = tun_get(tfile); ssize_t result; if (!tun) @@ -1969,7 +1964,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; if (!tun) @@ -2046,7 +2041,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; struct tun_file *tfile = container_of(sock, struct tun_file, socket); - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); if (!tun) return -EBADFD; @@ -2062,7 +2057,7 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); int ret; if (!tun) @@ -2094,7 +2089,7 @@ static int tun_peek_len(struct socket *sock) struct tun_struct *tun; int ret = 0; - tun = __tun_get(tfile); + tun = tun_get(tfile); if (!tun) return 0; @@ -2490,7 +2485,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = 0; rtnl_lock(); - tun = __tun_get(tfile); + tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) @@ -2837,15 +2832,16 @@ static int tun_chr_close(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS -static void tun_chr_show_fdinfo(struct seq_file *m, struct file *f) +static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { + struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); - tun = tun_get(f); + tun = tun_get(tfile); if (tun) tun_get_iff(current->nsproxy->net_ns, tun, &ifr); rtnl_unlock(); -- cgit v1.2.3 From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/tun.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c36f6ebad79..a6e0bffe3d29 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); -- cgit v1.2.3 From 010f245b9dd734adda6386c494a4ace953ea8dc4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Oct 2017 10:07:44 -0700 Subject: tun: relax check on eth_get_headlen() return value syzkaller hit the WARN() in tun_get_user(), providing skb with payload in fragments only, and nothing in skb->head GRO layer is fine with this, so relax the check. Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 57e4c31fa84a..c64ec19af9b7 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1737,7 +1737,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, /* Exercise flow dissector code path. */ u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); - if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) { + if (unlikely(headlen > skb_headlen(skb))) { this_cpu_inc(tun->pcpu_stats->rx_dropped); napi_free_frags(&tfile->napi); mutex_unlock(&tfile->napi_mutex); -- cgit v1.2.3 From aec72f3392b1d598a979e89c4fdb131965ae0ab3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Oct 2017 12:12:09 -0700 Subject: net-tun: fix panics at dismantle time syzkaller got crashes at dismantle time [1] It is not correct to test (tun->flags & IFF_NAPI) in tun_napi_disable() and tun_napi_del() : Each tun_file can have different mode, depending on how they were created. Similarly I have changed tun_get_user() and tun_poll_controller() to use the new tfile->napi_enabled boolean. [ 154.331360] BUG: unable to handle kernel NULL pointer dereference at (null) [ 154.339220] IP: [] hrtimer_active+0x26/0x60 [ 154.344983] PGD 0 [ 154.347009] Oops: 0000 [#1] SMP [ 154.350680] gsmi: Log Shutdown Reason 0x03 [ 154.379572] task: ffff994719150dc0 ti: ffff99475c0ae000 task.ti: ffff99475c0ae000 [ 154.387043] RIP: 0010:[] [] hrtimer_active+0x26/0x60 [ 154.395232] RSP: 0018:ffff99475c0afce8 EFLAGS: 00010246 [ 154.400542] RAX: ffff994754850ac0 RBX: ffff994753e65408 RCX: ffff994753e65388 [ 154.407666] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff994753e65408 [ 154.414790] RBP: ffff99475c0afce8 R08: 0000000000000000 R09: 0000000000000000 [ 154.421921] R10: ffff99475f6f5910 R11: 0000000000000001 R12: 0000000000000000 [ 154.429044] R13: ffff99417deab668 R14: ffff99417deaa780 R15: ffff99475f45dde0 [ 154.436174] FS: 0000000000000000(0000) GS:ffff994767a00000(0000) knlGS:0000000000000000 [ 154.444249] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 154.449986] CR2: 0000000000000000 CR3: 00000005a8a0e000 CR4: 0000000000022670 [ 154.457110] Stack: [ 154.459120] ffff99475c0afd28 ffffffff9634d614 1000000000000000 0000000000000000 [ 154.466598] ffffe54240000000 ffff994753e65408 ffff994753e653a8 ffff99417deab668 [ 154.474067] ffff99475c0afd48 ffffffff9634d6fd ffff99474c2be678 ffff994753e65398 [ 154.481537] Call Trace: [ 154.483985] [] hrtimer_try_to_cancel+0x24/0xf0 [ 154.490074] [] hrtimer_cancel+0x1d/0x30 [ 154.495563] [] napi_disable+0x3c/0x70 [ 154.500875] [] __tun_detach+0xd2/0x360 [ 154.506272] [] tun_chr_close+0x27/0x40 [ 154.511669] [] __fput+0xd6/0x1e0 [ 154.516548] [] ____fput+0xe/0x10 [ 154.521429] [] task_work_run+0x72/0x90 [ 154.526827] [] do_exit+0x317/0xb60 [ 154.531879] [] do_group_exit+0x3f/0xa0 [ 154.537275] [] SyS_exit_group+0x17/0x20 [ 154.542769] [] entry_SYSCALL_64_fastpath+0x12/0x17 Fixes: 943170998b20 ("net-tun: enable NAPI for TUN/TAP driver") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c64ec19af9b7..3b41c36eae90 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -175,6 +175,7 @@ struct tun_file { unsigned int ifindex; }; struct napi_struct napi; + bool napi_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; @@ -276,6 +277,7 @@ static int tun_napi_poll(struct napi_struct *napi, int budget) static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en) { + tfile->napi_enabled = napi_en; if (napi_en) { netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); @@ -286,13 +288,13 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) { - if (tun->flags & IFF_NAPI) + if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) { - if (tun->flags & IFF_NAPI) + if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } @@ -1055,7 +1057,8 @@ static void tun_poll_controller(struct net_device *dev) rcu_read_lock(); for (i = 0; i < tun->numqueues; i++) { tfile = rcu_dereference(tun->tfiles[i]); - napi_schedule(&tfile->napi); + if (tfile->napi_enabled) + napi_schedule(&tfile->napi); } rcu_read_unlock(); } @@ -1749,7 +1752,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, napi_gro_frags(&tfile->napi); local_bh_enable(); mutex_unlock(&tfile->napi_mutex); - } else if (tun->flags & IFF_NAPI) { + } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; -- cgit v1.2.3 From 7dbfb4ef77db5666f0f3a425e7db93ca30ff4285 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2017 11:29:55 -0700 Subject: tun: do not block BH again in tun_flow_cleanup() tun_flow_cleanup() being a timer callback, it is already running in BH context. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3b41c36eae90..f9541f7f9fb7 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -454,7 +454,7 @@ static void tun_flow_cleanup(unsigned long data) tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); - spin_lock_bh(&tun->lock); + spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; @@ -472,7 +472,7 @@ static void tun_flow_cleanup(unsigned long data) if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); - spin_unlock_bh(&tun->lock); + spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, -- cgit v1.2.3 From 81d98fa4df3d1683b3ef21e8a7a0ccac7874f0de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2017 11:29:56 -0700 Subject: tun: avoid extra timer schedule in tun_flow_cleanup() If tun_flow_cleanup() deleted all flows, no need to arm the timer again. It will be armed next time tun_flow_update() is called. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f9541f7f9fb7..995887de5a98 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -461,11 +461,14 @@ static void tun_flow_cleanup(unsigned long data) hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; - count++; + this_timer = e->updated + delay; - if (time_before_eq(this_timer, jiffies)) + if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); - else if (time_before(this_timer, next_timer)) + continue; + } + count++; + if (time_before(this_timer, next_timer)) next_timer = this_timer; } } -- cgit v1.2.3 From ee74d9967b829232723939cb7c9b100b29f6ec98 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2017 11:29:57 -0700 Subject: tun: do not arm flow_gc_timer in tun_flow_init() Timer is properly armed on demand from tun_flow_update(), so there is no need to arm it at tun init. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 995887de5a98..2a2d058cdd40 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1197,8 +1197,6 @@ static void tun_flow_init(struct tun_struct *tun) tun->ageing_time = TUN_FLOW_EXPIRE; setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); - mod_timer(&tun->flow_gc_timer, - round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) -- cgit v1.2.3 From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:16 -0700 Subject: net: bpf: rename ndo_xdp to ndo_bpf ndo_xdp is a control path callback for setting up XDP in the driver. We can reuse it for other forms of communication between the eBPF stack and the drivers. Rename the callback and associated structures and definitions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- drivers/net/tun.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8125956f62a1..1a326b697221 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1141,7 +1141,7 @@ static u32 tun_xdp_query(struct net_device *dev) return 0; } -static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -1185,7 +1185,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, - .ndo_xdp = tun_xdp, + .ndo_bpf = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) -- cgit v1.2.3