From 38cb52455c2c3e8b5751350a3fb32e43e82e129a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Oct 2015 11:43:26 -0700
Subject: tcp: call sk_mark_napi_id() on the child, not the listener

This fixes a typo : We want to store the NAPI id on child socket.
Presumably nobody really uses busy polling, on short lived flows.

Fixes: 3d97379a67486 ("tcp: move sk_mark_napi_id() at the right place")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2ae95e1d03e1..e463583c39ee 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1265,7 +1265,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (nsk != sk) {
 			sock_rps_save_rxhash(nsk, skb);
-			sk_mark_napi_id(sk, skb);
+			sk_mark_napi_id(nsk, skb);
 			if (tcp_child_process(sk, nsk, skb))
 				goto reset;
 			if (opt_skb)
-- 
cgit v1.2.3


From ba8e275a457397ab06f3567cf7bef0d78a43ae7e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Oct 2015 11:43:28 -0700
Subject: tcp: cleanup tcp_v[46]_inbound_md5_hash()

We'll soon have to call tcp_v[46]_inbound_md5_hash() twice.
Also add const attribute to the socket, as it might be the
unlocked listener for SYN packets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e463583c39ee..65e797dba504 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -622,8 +622,12 @@ clear_hash_noput:
 	return 1;
 }
 
-static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+#endif
+
+static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
+				    const struct sk_buff *skb)
 {
+#ifdef CONFIG_TCP_MD5SIG
 	const __u8 *hash_location = NULL;
 	struct tcp_md5sig_key *hash_expected;
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -660,9 +664,9 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
 				     &ip6h->daddr, ntohs(th->dest));
 		return true;
 	}
+#endif
 	return false;
 }
-#endif
 
 static void tcp_v6_init_req(struct request_sock *req,
 			    const struct sock *sk_listener,
@@ -1408,10 +1412,8 @@ process:
 
 	tcp_v6_fill_cb(skb, hdr, th);
 
-#ifdef CONFIG_TCP_MD5SIG
 	if (tcp_v6_inbound_md5_hash(sk, skb))
 		goto discard_and_relse;
-#endif
 
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
-- 
cgit v1.2.3


From aa3a0c8ce651b5e16124866b0a10d1b90b9ef022 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Oct 2015 11:43:30 -0700
Subject: tcp: get_openreq[46]() changes

When request sockets are no longer in a per listener hash table
but on regular TCP ehash, we need to access listener uid
through req->rsk_listener

get_openreq6() also gets a const for its request socket argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 65e797dba504..cadb44a2d34e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1635,7 +1635,7 @@ static void tcp_v6_destroy_sock(struct sock *sk)
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCPv6 sock list dumping. */
 static void get_openreq6(struct seq_file *seq,
-			 struct request_sock *req, int i, kuid_t uid)
+			 const struct request_sock *req, int i)
 {
 	long ttd = req->rsk_timer.expires - jiffies;
 	const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
@@ -1659,7 +1659,8 @@ static void get_openreq6(struct seq_file *seq,
 		   1,   /* timers active (only the expire timer) */
 		   jiffies_to_clock_t(ttd),
 		   req->num_timeout,
-		   from_kuid_munged(seq_user_ns(seq), uid),
+		   from_kuid_munged(seq_user_ns(seq),
+				    sock_i_uid(req->rsk_listener)),
 		   0,  /* non standard timer */
 		   0, /* open_requests have no inode */
 		   0, req);
@@ -1773,7 +1774,7 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
 			get_tcp6_sock(seq, v, st->num);
 		break;
 	case TCP_SEQ_STATE_OPENREQ:
-		get_openreq6(seq, v, st->num, st->uid);
+		get_openreq6(seq, v, st->num);
 		break;
 	}
 out:
-- 
cgit v1.2.3


From 079096f103faca2dd87342cca6f23d4b34da8871 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Oct 2015 11:43:32 -0700
Subject: tcp/dccp: install syn_recv requests into ehash table

In this patch, we insert request sockets into TCP/DCCP
regular ehash table (where ESTABLISHED and TIMEWAIT sockets
are) instead of using the per listener hash table.

ACK packets find SYN_RECV pseudo sockets without having
to find and lock the listener.

In nominal conditions, this halves pressure on listener lock.

Note that this will allow for SO_REUSEPORT refinements,
so that we can select a listener using cpu/numa affinities instead
of the prior 'consistent hash', since only SYN packets will
apply this selection logic.

We will shrink listen_sock in the following patch to ease
code review.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ying Cai <ycai@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 82 +++++++++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 46 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cadb44a2d34e..a215614cfb2b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -727,7 +727,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.route_req	=	tcp_v6_route_req,
 	.init_seq	=	tcp_v6_init_sequence,
 	.send_synack	=	tcp_v6_send_synack,
-	.queue_hash_add =	inet6_csk_reqsk_queue_hash_add,
 };
 
 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
@@ -938,37 +937,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 }
 
 
-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
 {
+#ifdef CONFIG_SYN_COOKIES
 	const struct tcphdr *th = tcp_hdr(skb);
-	struct request_sock *req;
-	struct sock *nsk;
-
-	/* Find possible connection requests. */
-	req = inet6_csk_search_req(sk, th->source,
-				   &ipv6_hdr(skb)->saddr,
-				   &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
-	if (req) {
-		nsk = tcp_check_req(sk, skb, req, false);
-		if (!nsk || nsk == sk)
-			reqsk_put(req);
-		return nsk;
-	}
-	nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
-					 &ipv6_hdr(skb)->saddr, th->source,
-					 &ipv6_hdr(skb)->daddr, ntohs(th->dest),
-					 tcp_v6_iif(skb));
-
-	if (nsk) {
-		if (nsk->sk_state != TCP_TIME_WAIT) {
-			bh_lock_sock(nsk);
-			return nsk;
-		}
-		inet_twsk_put(inet_twsk(nsk));
-		return NULL;
-	}
 
-#ifdef CONFIG_SYN_COOKIES
 	if (!th->syn)
 		sk = cookie_v6_check(sk, skb);
 #endif
@@ -1258,15 +1231,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		goto csum_err;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		struct sock *nsk = tcp_v6_hnd_req(sk, skb);
+		struct sock *nsk = tcp_v6_cookie_check(sk, skb);
+
 		if (!nsk)
 			goto discard;
 
-		/*
-		 * Queue it on the new socket if the new socket is active,
-		 * otherwise we just shortcircuit this and continue with
-		 * the new socket..
-		 */
 		if (nsk != sk) {
 			sock_rps_save_rxhash(nsk, skb);
 			sk_mark_napi_id(nsk, skb);
@@ -1402,6 +1371,33 @@ process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;
 
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		struct request_sock *req = inet_reqsk(sk);
+		struct sock *nsk = NULL;
+
+		sk = req->rsk_listener;
+		tcp_v6_fill_cb(skb, hdr, th);
+		if (tcp_v6_inbound_md5_hash(sk, skb)) {
+			reqsk_put(req);
+			goto discard_it;
+		}
+		if (sk->sk_state == TCP_LISTEN)
+			nsk = tcp_check_req(sk, skb, req, false);
+		if (!nsk) {
+			reqsk_put(req);
+			goto discard_it;
+		}
+		if (nsk == sk) {
+			sock_hold(sk);
+			reqsk_put(req);
+			tcp_v6_restore_cb(skb);
+		} else if (tcp_child_process(sk, nsk, skb)) {
+			tcp_v6_send_reset(nsk, skb);
+			goto discard_it;
+		} else {
+			return 0;
+		}
+	}
 	if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 		goto discard_and_relse;
@@ -1765,18 +1761,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
 	}
 	st = seq->private;
 
-	switch (st->state) {
-	case TCP_SEQ_STATE_LISTENING:
-	case TCP_SEQ_STATE_ESTABLISHED:
-		if (sk->sk_state == TCP_TIME_WAIT)
-			get_timewait6_sock(seq, v, st->num);
-		else
-			get_tcp6_sock(seq, v, st->num);
-		break;
-	case TCP_SEQ_STATE_OPENREQ:
+	if (sk->sk_state == TCP_TIME_WAIT)
+		get_timewait6_sock(seq, v, st->num);
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
 		get_openreq6(seq, v, st->num);
-		break;
-	}
+	else
+		get_tcp6_sock(seq, v, st->num);
 out:
 	return 0;
 }
-- 
cgit v1.2.3


From ca6fb06518836ef9b65dc0aac02ff97704d52a05 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Oct 2015 11:43:35 -0700
Subject: tcp: attach SYNACK messages to request sockets instead of listener

If a listen backlog is very big (to avoid syncookies), then
the listener sk->sk_wmem_alloc is the main source of false
sharing, as we need to touch it twice per SYNACK re-transmit
and TX completion.

(One SYN packet takes listener lock once, but up to 6 SYNACK
are generated)

By attaching the skb to the request socket, we remove this
source of contention.

Tested:

 listen(fd, 10485760); // single listener (no SO_REUSEPORT)
 16 RX/TX queue NIC
 Sustain a SYNFLOOD attack of ~320,000 SYN per second,
 Sending ~1,400,000 SYNACK per second.
 Perf profiles now show listener spinlock being next bottleneck.

    20.29%  [kernel]  [k] queued_spin_lock_slowpath
    10.06%  [kernel]  [k] __inet_lookup_established
     5.12%  [kernel]  [k] reqsk_timer_handler
     3.22%  [kernel]  [k] get_next_timer_interrupt
     3.00%  [kernel]  [k] tcp_make_synack
     2.77%  [kernel]  [k] ipt_do_table
     2.70%  [kernel]  [k] run_timer_softirq
     2.50%  [kernel]  [k] ip_finish_output
     2.04%  [kernel]  [k] cascade

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a215614cfb2b..3d18571811c5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 			      struct flowi *fl,
 			      struct request_sock *req,
 			      u16 queue_mapping,
-			      struct tcp_fastopen_cookie *foc)
+			      struct tcp_fastopen_cookie *foc,
+			      bool attach_req)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 					       IPPROTO_TCP)) == NULL)
 		goto done;
 
-	skb = tcp_make_synack(sk, dst, req, foc);
+	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
 
 	if (skb) {
 		__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
-- 
cgit v1.2.3


From e994b2f0fb9229aeff5eea9541320bd7b2ca8714 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Oct 2015 11:43:39 -0700
Subject: tcp: do not lock listener to process SYN packets

Everything should now be ready to finally allow SYN
packets processing without holding listener lock.

Tested:

3.5 Mpps SYNFLOOD. Plenty of cpu cycles available.

Next bottleneck is the refcount taken on listener,
that could be avoided if we remove SLAB_DESTROY_BY_RCU
strict semantic for listeners, and use regular RCU.

    13.18%  [kernel]  [k] __inet_lookup_listener
     9.61%  [kernel]  [k] tcp_conn_request
     8.16%  [kernel]  [k] sha_transform
     5.30%  [kernel]  [k] inet_reqsk_alloc
     4.22%  [kernel]  [k] sock_put
     3.74%  [kernel]  [k] tcp_make_synack
     2.88%  [kernel]  [k] ipt_do_table
     2.56%  [kernel]  [k] memcpy_erms
     2.53%  [kernel]  [k] sock_wfree
     2.40%  [kernel]  [k] tcp_v4_rcv
     2.08%  [kernel]  [k] fib_table_lookup
     1.84%  [kernel]  [k] tcp_openreq_init_rwin

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3d18571811c5..33334f0c217d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1161,7 +1161,7 @@ out:
 }
 
 /* The socket must have it's spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
  *
  * We have a potential double-lock case here, so even when
  * doing backlog processing we use the BH locking scheme.
@@ -1415,9 +1415,15 @@ process:
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
-	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 
+	if (sk->sk_state == TCP_LISTEN) {
+		ret = tcp_v6_do_rcv(sk, skb);
+		goto put_and_return;
+	}
+
+	sk_incoming_cpu_update(sk);
+
 	bh_lock_sock_nested(sk);
 	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
 	ret = 0;
@@ -1432,6 +1438,7 @@ process:
 	}
 	bh_unlock_sock(sk);
 
+put_and_return:
 	sock_put(sk);
 	return ret ? -1 : 0;
 
-- 
cgit v1.2.3