An overall throughput gain of 22 % for heavy TCP use over a single TX queue. Original patchset comment: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.13&id=3f4888adae7c1619b990d98a9b967536f71822b8

Signed-off-by: Pavel Kubelun <be.dissent@gmail.com>
parent 3a69ad3b2a, commit b2ea46fe23
@@ -0,0 +1,90 @@
From 40fc3423b983b864bf70b03199191260ae9b2ea6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:50 -0800
Subject: [PATCH 01/10] tcp: tsq: add tsq_flags / tsq_enum

This is a cleanup, to ease code review of following patches.

Old 'enum tsq_flags' is renamed, and a new enumeration is added
with the flags used in cmpxchg() operations as opposed to
single bit operations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/tcp.h | 11 ++++++++++-
net/ipv4/tcp_output.c | 16 ++++++++--------
2 files changed, 18 insertions(+), 9 deletions(-)

--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -367,7 +367,7 @@ struct tcp_sock {
u32 *saved_syn;
};

-enum tsq_flags {
+enum tsq_enum {
TSQ_THROTTLED,
TSQ_QUEUED,
TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */
@@ -378,6 +378,15 @@ enum tsq_flags {
*/
};

+enum tsq_flags {
+ TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
+ TSQF_QUEUED = (1UL << TSQ_QUEUED),
+ TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
+ TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
+ TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
+ TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
+};
+
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
return (struct tcp_sock *)sk;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -784,10 +784,10 @@ static void tcp_tasklet_func(unsigned lo
}
}

-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -808,7 +808,7 @@ void tcp_release_cb(struct sock *sk)
nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);

- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);

/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);

- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
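
The reason for keeping both enumerations is that the kernel's set_bit()/clear_bit()/test_bit() helpers take a bit number, while cmpxchg() operates on a whole word and therefore wants masks. A minimal userspace sketch of the same pattern, with GCC/Clang __atomic builtins standing in for the kernel helpers (values and the main() driver are illustrative only):

/* Userspace sketch of the bit-number / bit-mask split introduced above. */
#include <stdio.h>

enum tsq_enum {            /* bit numbers, for per-bit operations */
	TSQ_THROTTLED,
	TSQ_QUEUED,
	TCP_TSQ_DEFERRED,
};

enum tsq_flags {           /* masks, for whole-word tests and cmpxchg() */
	TSQF_THROTTLED    = 1UL << TSQ_THROTTLED,
	TSQF_QUEUED       = 1UL << TSQ_QUEUED,
	TCPF_TSQ_DEFERRED = 1UL << TCP_TSQ_DEFERRED,
};

int main(void)
{
	unsigned long flags = 0;

	/* per-bit operation: uses the bit number */
	__atomic_fetch_or(&flags, 1UL << TSQ_THROTTLED, __ATOMIC_SEQ_CST);

	/* whole-word test: uses the mask */
	if (flags & TSQF_THROTTLED)
		printf("throttled\n");
	return 0;
}
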
@@ -0,0 +1,48 @@
From 408f0a6c21e124cc4f6c7aa370b38aa47e55428d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:51 -0800
Subject: [PATCH 02/10] tcp: tsq: remove one locked operation in tcp_wfree()

Instead of atomically clear TSQ_THROTTLED and atomically set TSQ_QUEUED
bits, use one cmpxchg() to perform a single locked operation.

Since the following patch will also set TCP_TSQ_DEFERRED here,
this cmpxchg() will make this addition free.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;

/* Keep one reference on sk_wmem_alloc.
@@ -877,11 +878,17 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;

- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;

+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+ nval = cmpxchg(&tp->tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
+
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
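
The rewrite folds test_and_clear_bit(TSQ_THROTTLED) plus test_and_set_bit(TSQ_QUEUED) into a single compare-and-swap retry loop. A small userspace sketch of that loop shape, assuming __atomic_compare_exchange_n() as a stand-in for the kernel's cmpxchg() (the helper name below is made up for illustration):

#include <stdbool.h>

#define TSQF_THROTTLED	(1UL << 0)
#define TSQF_QUEUED	(1UL << 1)

/* Clear THROTTLED and set QUEUED with one atomic operation instead of two. */
static bool throttled_to_queued(unsigned long *tsq_flags)
{
	unsigned long oval = __atomic_load_n(tsq_flags, __ATOMIC_RELAXED);
	unsigned long nval;

	for (;;) {
		/* nothing to do if not throttled, or already queued */
		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
			return false;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
		/* on failure, oval is reloaded with the current value */
		if (__atomic_compare_exchange_n(tsq_flags, &oval, nval, false,
						__ATOMIC_SEQ_CST, __ATOMIC_RELAXED))
			return true;	/* we own the QUEUED transition */
	}
}
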
@@ -0,0 +1,71 @@
From b223feb9de2a65c533ff95c08e834fa732906ea5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:52 -0800
Subject: [PATCH 03/10] tcp: tsq: add shortcut in tcp_tasklet_func()

Under high stress, I've seen tcp_tasklet_func() consuming
~700 usec, handling ~150 tcp sockets.

By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance
for other cpus/threads entering tcp_write_xmit() to grab it,
allowing tcp_tasklet_func() to skip sockets that already did
an xmit cycle.

In the future, we might give to ACK processing an increased
budget to reduce even more tcp_tasklet_func() amount of work.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,19 +767,19 @@ static void tcp_tasklet_func(unsigned lo
list_for_each_safe(q, n, &list) {
tp = list_entry(q, struct tcp_sock, tsq_node);
list_del(&tp->tsq_node);
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);

sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);

- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
@@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb)
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;

- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
nval = cmpxchg(&tp->tsq_flags, oval, nval);
if (nval != oval)
continue;
@@ -2179,6 +2179,8 @@ static bool tcp_write_xmit(struct sock *
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;

+ if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
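
The shortcut is a lock-avoidance pattern: look at cheap, lock-free state first (sk_lock.owned and the TCP_TSQ_DEFERRED bit), and only take bh_lock_sock() and re-check when there is probably work to do. A rough userspace sketch of that shape, with pthreads and C11 atomics standing in for the kernel primitives (all names below are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct fake_sock {
	pthread_mutex_t lock;
	atomic_bool owned;		/* "socket owned by user" */
	atomic_bool tsq_deferred;	/* work deferred by the free path */
};

static void tasklet_visit(struct fake_sock *sk,
			  void (*handler)(struct fake_sock *))
{
	/* cheap, lock-free pre-check: skip sockets that are owned right
	 * now or that already did an xmit cycle (flag cleared) */
	if (atomic_load(&sk->owned) || !atomic_load(&sk->tsq_deferred))
		return;

	pthread_mutex_lock(&sk->lock);
	if (!atomic_load(&sk->owned)) {		/* re-check under the lock */
		atomic_store(&sk->tsq_deferred, false);
		handler(sk);
	}
	pthread_mutex_unlock(&sk->lock);
}
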
@@ -0,0 +1,38 @@
From a9b204d1564702b704ad6fe74f10a102c7b87ba3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:53 -0800
Subject: [PATCH 04/10] tcp: tsq: avoid one atomic in tcp_wfree()

Under high load, tcp_wfree() has an atomic operation trying
to schedule a tasklet over and over.

We can schedule it only if our per cpu list was empty.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -880,6 +880,7 @@ void tcp_wfree(struct sk_buff *skb)

for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;

if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
@@ -892,8 +893,10 @@ void tcp_wfree(struct sk_buff *skb)
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
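
The saving comes from the fact that a wakeup is only needed on the empty to non-empty transition of the per-CPU list; while the list is non-empty a tasklet run is already pending. A small sketch of that idea, assuming the caller already serializes access the way local_irq_save() does in tcp_wfree() (the types and the schedule_work callback are illustrative):

#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

struct workqueue {
	struct node *head;
	void (*schedule_work)(struct workqueue *wq);	/* stand-in for tasklet_schedule() */
};

/* caller is assumed to hold whatever serializes this list */
static void enqueue(struct workqueue *wq, struct node *n)
{
	bool empty = (wq->head == NULL);

	n->next = wq->head;
	wq->head = n;

	if (empty)			/* first entry: one wakeup is enough */
		wq->schedule_work(wq);
}
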
@@ -0,0 +1,37 @@
From 75eefc6c59fd2c5f1ab95a3a113c217237d12a31 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:54 -0800
Subject: [PATCH 05/10] tcp: tsq: add a shortcut in tcp_small_queue_check()

Always allow the two first skbs in write queue to be sent,
regardless of sk_wmem_alloc/sk_pacing_rate values.

This helps a lot in situations where TX completions are delayed either
because of driver latencies or softirq latencies.

Test is done with no cache line misses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 9 +++++++++
1 file changed, 9 insertions(+)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2084,6 +2084,15 @@ static bool tcp_small_queue_check(struct
limit <<= factor;

if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back,
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
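
The new test relies on sk_write_queue being a circular doubly linked list: sk_write_queue.next is the first skb, and an skb whose prev is that first skb is the second. A tiny sketch of the same test on an illustrative list type:

#include <stdbool.h>

struct entry { struct entry *next, *prev; };
struct queue { struct entry head; };	/* head is a sentinel node */

/* true for the first or second element of a non-empty circular list */
static bool is_first_or_second(const struct queue *q, const struct entry *e)
{
	return e == q->head.next || e->prev == q->head.next;
}
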
@@ -0,0 +1,55 @@
From 12a59abc22d6664f7d3944f625ceefee92de8820 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:55 -0800
Subject: [PATCH 06/10] tcp: tcp_mtu_probe() is likely to exit early

Adding a likely() in tcp_mtu_probe() moves its code which used to
be inlined in front of tcp_write_xmit()

We still have a cache line miss to access icsk->icsk_mtup.enabled,
we will probably have to reorganize fields to help data locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1925,26 +1925,26 @@ static inline void tcp_mtu_check_reprobe
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;

/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;

/* Use binary search for probe_size between tcp_mss_base,
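
likely() is a branch prediction hint built on __builtin_expect(); annotating the early-exit condition lets the compiler place the rarely executed probing code out of the hot tcp_write_xmit() path. A minimal sketch of the macros and their use (the condition shown is abbreviated, not the full test from the patch):

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

/* illustrative only: early-exit checks expected to be true most of the time */
static int mtu_probe_sketch(int enabled, int probe_size, int snd_cwnd)
{
	if (likely(!enabled || probe_size || snd_cwnd < 11))
		return -1;	/* common case: no probe this round */

	/* rarely reached probing work would go here */
	return 0;
}
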
@@ -0,0 +1,157 @@
From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:56 -0800
Subject: [PATCH 07/10] net: reorganize struct sock for better data locality

Group fields used in TX path, and keep some cache lines mostly read
to permit sharing among cpus.

Gained two 4 bytes holes on 64bit arches.

Added a place holder for tcp tsq_flags, next to sk_wmem_alloc
to speed up tcp_wfree() in the following patch.

I have not added ____cacheline_aligned_in_smp, this might be done later.
I prefer doing this once inet and tcp/udp sockets reorg is also done.

Tested with both TCP and UDP.

UDP receiver performance under flood increased by ~20 % :
Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops
was moved away from a critical cache line, now mostly read and shared.

/* --- cacheline 4 boundary (256 bytes) --- */
unsigned int sk_napi_id; /* 0x100 0x4 */
int sk_rcvbuf; /* 0x104 0x4 */
struct sk_filter * sk_filter; /* 0x108 0x8 */
union {
struct socket_wq * sk_wq; /* 0x8 */
struct socket_wq * sk_wq_raw; /* 0x8 */
}; /* 0x110 0x8 */
struct xfrm_policy * sk_policy[2]; /* 0x118 0x10 */
struct dst_entry * sk_rx_dst; /* 0x128 0x8 */
struct dst_entry * sk_dst_cache; /* 0x130 0x8 */
atomic_t sk_omem_alloc; /* 0x138 0x4 */
int sk_sndbuf; /* 0x13c 0x4 */
/* --- cacheline 5 boundary (320 bytes) --- */
int sk_wmem_queued; /* 0x140 0x4 */
atomic_t sk_wmem_alloc; /* 0x144 0x4 */
long unsigned int sk_tsq_flags; /* 0x148 0x8 */
struct sk_buff * sk_send_head; /* 0x150 0x8 */
struct sk_buff_head sk_write_queue; /* 0x158 0x18 */
__s32 sk_peek_off; /* 0x170 0x4 */
int sk_write_pending; /* 0x174 0x4 */
long int sk_sndtimeo; /* 0x178 0x8 */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------
1 file changed, 27 insertions(+), 24 deletions(-)

--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -343,6 +343,9 @@ struct sock {
#define sk_rxhash __sk_common.skc_rxhash

socket_lock_t sk_lock;
+ atomic_t sk_drops;
+ int sk_rcvlowat;
+ struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
- int sk_forward_alloc;

- __u32 sk_txhash;
+ int sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
- unsigned int sk_napi_id;
unsigned int sk_ll_usec;
+ /* ===== mostly read cache line ===== */
+ unsigned int sk_napi_id;
#endif
- atomic_t sk_drops;
int sk_rcvbuf;

struct sk_filter __rcu *sk_filter;
@@ -379,11 +381,30 @@ struct sock {
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
- /* Note: 32bit hole on 64bit arches */
- atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
+
+ /* ===== cache line for TX ===== */
+ int sk_wmem_queued;
+ atomic_t sk_wmem_alloc;
+ unsigned long sk_tsq_flags;
+ struct sk_buff *sk_send_head;
struct sk_buff_head sk_write_queue;
+ __s32 sk_peek_off;
+ int sk_write_pending;
+ long sk_sndtimeo;
+ struct timer_list sk_timer;
+ __u32 sk_priority;
+ __u32 sk_mark;
+ u32 sk_pacing_rate; /* bytes per second */
+ u32 sk_max_pacing_rate;
+ struct page_frag sk_frag;
+ netdev_features_t sk_route_caps;
+ netdev_features_t sk_route_nocaps;
+ int sk_gso_type;
+ unsigned int sk_gso_max_size;
+ gfp_t sk_allocation;
+ __u32 sk_txhash;

/*
* Because of non atomicity rules, all
@@ -399,41 +420,23 @@ struct sock {
#define SK_PROTOCOL_MAX U8_MAX
kmemcheck_bitfield_end(flags);

- int sk_wmem_queued;
- gfp_t sk_allocation;
- u32 sk_pacing_rate; /* bytes per second */
- u32 sk_max_pacing_rate;
- netdev_features_t sk_route_caps;
- netdev_features_t sk_route_nocaps;
- int sk_gso_type;
- unsigned int sk_gso_max_size;
u16 sk_gso_max_segs;
- int sk_rcvlowat;
unsigned long sk_lingertime;
- struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- __u32 sk_priority;
- __u32 sk_mark;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
- long sk_sndtimeo;
- struct timer_list sk_timer;
ktime_t sk_stamp;
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
struct socket *sk_socket;
void *sk_user_data;
- struct page_frag sk_frag;
- struct sk_buff *sk_send_head;
- __s32 sk_peek_off;
- int sk_write_pending;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif
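
The offset and size annotations in the commit message are pahole output. The same kind of layout property can also be checked at build time with offsetof(); the sketch below uses an invented structure rather than struct sock and assumes 64-byte cache lines:

#include <stddef.h>

struct fake_sock {
	/* ===== mostly-read fields ===== */
	int		rcvbuf;
	void		*filter;

	/* ===== cache line for TX ===== */
	int		wmem_queued;
	int		wmem_alloc;
	unsigned long	tsq_flags;
	void		*send_head;
};

#define SAME_CACHELINE(type, a, b) \
	(offsetof(type, a) / 64 == offsetof(type, b) / 64)

/* compile-time check that the hot TX pair shares one 64-byte line */
_Static_assert(SAME_CACHELINE(struct fake_sock, wmem_alloc, tsq_flags),
	       "wmem_alloc and tsq_flags should share a cache line");
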
@@ -0,0 +1,176 @@
From 7aa5470c2c09265902b5e4289afa82e4e7c2987e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:57 -0800
Subject: [PATCH 08/10] tcp: tsq: move tsq_flags close to sk_wmem_alloc

tsq_flags being in the same cache line than sk_wmem_alloc
makes a lot of sense. Both fields are changed from tcp_wfree()
and more generally by various TSQ related functions.

Prior patch made room in struct sock and added sk_tsq_flags,
this patch deletes tsq_flags from struct tcp_sock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/tcp.h | 1 -
net/ipv4/tcp.c | 4 ++--
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv4/tcp_output.c | 24 +++++++++++-------------
net/ipv4/tcp_timer.c | 4 ++--
net/ipv6/tcp_ipv6.c | 2 +-
6 files changed, 17 insertions(+), 20 deletions(-)

--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -192,7 +192,6 @@ struct tcp_sock {
u32 tsoffset; /* timestamp offset */

struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
- unsigned long tsq_flags;

/* Data for direct copy to user */
struct {
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -665,9 +665,9 @@ static void tcp_push(struct sock *sk, in
if (tcp_should_autocork(sk, skb, size_goal)) {

/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -446,7 +446,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,14 +767,15 @@ static void tcp_tasklet_func(unsigned lo
list_for_each_safe(q, n, &list) {
tp = list_entry(q, struct tcp_sock, tsq_node);
list_del(&tp->tsq_node);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);

sk = (struct sock *)tp;
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
if (!sk->sk_lock.owned &&
- test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
- clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
tcp_tsq_handler(sk);
}
bh_unlock_sock(sk);
@@ -797,16 +798,15 @@ static void tcp_tasklet_func(unsigned lo
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;

/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
@@ -878,7 +878,7 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;

- for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
bool empty;

@@ -886,7 +886,7 @@ void tcp_wfree(struct sk_buff *skb)
goto out;

nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
- nval = cmpxchg(&tp->tsq_flags, oval, nval);
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;

@@ -2093,7 +2093,7 @@ static bool tcp_small_queue_check(struct
skb->prev == sk->sk_write_queue.next)
return false;

- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2191,8 +2191,8 @@ static bool tcp_write_xmit(struct sock *
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;

- if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
- clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;

@@ -3495,8 +3495,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);

--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -311,7 +311,7 @@ static void tcp_delack_timer(unsigned lo
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -594,7 +594,7 @@ static void tcp_write_timer(unsigned lon
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -404,7 +404,7 @@ static void tcp_v6_err(struct sk_buff *s
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
- &tp->tsq_flags))
+ &sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}
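
The mechanical part of the change, turning &tp->tsq_flags into &sk->sk_tsq_flags, is safe because tcp_sock embeds the generic sock (through a chain of embedded structs) as its first member, so both expressions point into the same allocation; only the field's offset, and therefore its cache line, moves. An illustrative sketch of that embedding pattern (the types here are made up, not the kernel's):

struct fake_sock {
	unsigned int	wmem_alloc;
	unsigned long	sk_tsq_flags;	/* new home, next to wmem_alloc */
};

struct fake_tcp_sock {
	struct fake_sock sk;		/* must stay the first member */
	/* ... protocol-specific fields; the flag word used to live here ... */
};

static inline struct fake_tcp_sock *tcp_sk_sketch(struct fake_sock *sk)
{
	return (struct fake_tcp_sock *)sk;	/* valid: sk is the first member */
}

static unsigned long *flags_of(struct fake_tcp_sock *tp)
{
	return &tp->sk.sk_tsq_flags;	/* same word as &sk->sk_tsq_flags */
}
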
@@ -0,0 +1,40 @@
From 0a9648f1293966c838dc570da73c15a76f4c89d6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 21 Dec 2016 05:42:43 -0800
Subject: [PATCH 09/10] tcp: add a missing barrier in tcp_tasklet_func()

Madalin reported crashes happening in tcp_tasklet_func() on powerpc64

Before TSQ_QUEUED bit is cleared, we must ensure the changes done
by list_del(&tp->tsq_node); are committed to memory, otherwise
corruption might happen, as an other cpu could catch TSQ_QUEUED
clearance too soon.

We can notice that old kernels were immune to this bug, because
TSQ_QUEUED was cleared after a bh_lock_sock(sk)/bh_unlock_sock(sk)
section, but they could have missed a kick to write additional bytes,
when NIC interrupts for a given flow are spread to multiple cpus.

Affected TCP flows would need an incoming ACK or RTO timer to add more
packets to the pipe. So overall situation should be better now.

Fixes: b223feb9de2a ("tcp: tsq: add shortcut in tcp_tasklet_func()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Madalin Bucur <madalin.bucur@nxp.com>
Tested-by: Madalin Bucur <madalin.bucur@nxp.com>
Tested-by: Xing Lei <xing.lei@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 1 +
1 file changed, 1 insertion(+)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -769,6 +769,7 @@ static void tcp_tasklet_func(unsigned lo
list_del(&tp->tsq_node);

sk = (struct sock *)tp;
+ smp_mb__before_atomic();
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

if (!sk->sk_lock.owned &&
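
The barrier guarantees that the stores done by list_del() are visible before the clear_bit() store, so another cpu that sees TSQ_QUEUED cleared also sees the node unlinked. A loose userspace analogy using C11 release/acquire ordering (an illustration of the ordering requirement, not the kernel's memory model or API):

#include <stdatomic.h>
#include <stdbool.h>

#define TSQF_QUEUED (1UL << 1)

static _Atomic unsigned long tsq_flags;

static void writer_side(void (*list_del_node)(void))
{
	list_del_node();		/* plain stores: unlink the node */

	/* release: the stores above are visible before the flag clear */
	atomic_fetch_and_explicit(&tsq_flags, ~TSQF_QUEUED,
				  memory_order_release);
}

static bool reader_side(void)
{
	/* acquire: if QUEUED is seen clear, the unlink above is visible too */
	return !(atomic_load_explicit(&tsq_flags, memory_order_acquire)
		 & TSQF_QUEUED);
}
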