generic: net: tcp: backport tcp tx performance patches

An overall throughput gain of 22 % for heavy TCP use over a single TX queue.

Original patchset comment
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.13&id=3f4888adae7c1619b990d98a9b967536f71822b8

Signed-off-by: Pavel Kubelun <be.dissent@gmail.com>
master
Pavel Kubelun 7 years ago committed by Stijn Tintel
parent 3a69ad3b2a
commit b2ea46fe23
  1. 90
      target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch
  2. 48
      target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch
  3. 71
      target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch
  4. 38
      target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch
  5. 37
      target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch
  6. 55
      target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch
  7. 157
      target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch
  8. 176
      target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch
  9. 40
      target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch

@ -0,0 +1,90 @@
From 40fc3423b983b864bf70b03199191260ae9b2ea6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:50 -0800
Subject: [PATCH 01/10] tcp: tsq: add tsq_flags / tsq_enum
This is a cleanup, to ease code review of following patches.
Old 'enum tsq_flags' is renamed, and a new enumeration is added
with the flags used in cmpxchg() operations as opposed to
single bit operations.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/tcp.h | 11 ++++++++++-
net/ipv4/tcp_output.c | 16 ++++++++--------
2 files changed, 18 insertions(+), 9 deletions(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -367,7 +367,7 @@ struct tcp_sock {
u32 *saved_syn;
};
-enum tsq_flags {
+enum tsq_enum {
TSQ_THROTTLED,
TSQ_QUEUED,
TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */
@@ -378,6 +378,15 @@ enum tsq_flags {
*/
};
+enum tsq_flags {
+ TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
+ TSQF_QUEUED = (1UL << TSQ_QUEUED),
+ TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
+ TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
+ TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
+ TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
+};
+
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
return (struct tcp_sock *)sk;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -784,10 +784,10 @@ static void tcp_tasklet_func(unsigned lo
}
}
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -808,7 +808,7 @@ void tcp_release_cb(struct sock *sk)
nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}

@ -0,0 +1,48 @@
From 408f0a6c21e124cc4f6c7aa370b38aa47e55428d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:51 -0800
Subject: [PATCH 02/10] tcp: tsq: remove one locked operation in tcp_wfree()
Instead of atomically clear TSQ_THROTTLED and atomically set TSQ_QUEUED
bits, use one cmpxchg() to perform a single locked operation.
Since the following patch will also set TCP_TSQ_DEFERRED here,
this cmpxchg() will make this addition free.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -877,11 +878,17 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+ nval = cmpxchg(&tp->tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
+
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);

@ -0,0 +1,71 @@
From b223feb9de2a65c533ff95c08e834fa732906ea5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:52 -0800
Subject: [PATCH 03/10] tcp: tsq: add shortcut in tcp_tasklet_func()
Under high stress, I've seen tcp_tasklet_func() consuming
~700 usec, handling ~150 tcp sockets.
By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance
for other cpus/threads entering tcp_write_xmit() to grab it,
allowing tcp_tasklet_func() to skip sockets that already did
an xmit cycle.
In the future, we might give to ACK processing an increased
budget to reduce even more tcp_tasklet_func() amount of work.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,19 +767,19 @@ static void tcp_tasklet_func(unsigned lo
list_for_each_safe(q, n, &list) {
tp = list_entry(q, struct tcp_sock, tsq_node);
list_del(&tp->tsq_node);
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
@@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb)
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
nval = cmpxchg(&tp->tsq_flags, oval, nval);
if (nval != oval)
continue;
@@ -2179,6 +2179,8 @@ static bool tcp_write_xmit(struct sock *
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
+ if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;

@ -0,0 +1,38 @@
From a9b204d1564702b704ad6fe74f10a102c7b87ba3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:53 -0800
Subject: [PATCH 04/10] tcp: tsq: avoid one atomic in tcp_wfree()
Under high load, tcp_wfree() has an atomic operation trying
to schedule a tasklet over and over.
We can schedule it only if our per cpu list was empty.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -880,6 +880,7 @@ void tcp_wfree(struct sk_buff *skb)
for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
@@ -892,8 +893,10 @@ void tcp_wfree(struct sk_buff *skb)
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}

@ -0,0 +1,37 @@
From 75eefc6c59fd2c5f1ab95a3a113c217237d12a31 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:54 -0800
Subject: [PATCH 05/10] tcp: tsq: add a shortcut in tcp_small_queue_check()
Always allow the two first skbs in write queue to be sent,
regardless of sk_wmem_alloc/sk_pacing_rate values.
This helps a lot in situations where TX completions are delayed either
because of driver latencies or softirq latencies.
Test is done with no cache line misses.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2084,6 +2084,15 @@ static bool tcp_small_queue_check(struct
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back,
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must

@ -0,0 +1,55 @@
From 12a59abc22d6664f7d3944f625ceefee92de8820 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:55 -0800
Subject: [PATCH 06/10] tcp: tcp_mtu_probe() is likely to exit early
Adding a likely() in tcp_mtu_probe() moves its code which used to
be inlined in front of tcp_write_xmit()
We still have a cache line miss to access icsk->icsk_mtup.enabled,
we will probably have to reorganize fields to help data locality.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1925,26 +1925,26 @@ static inline void tcp_mtu_check_reprobe
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,

@ -0,0 +1,157 @@
From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:56 -0800
Subject: [PATCH 07/10] net: reorganize struct sock for better data locality
Group fields used in TX path, and keep some cache lines mostly read
to permit sharing among cpus.
Gained two 4 bytes holes on 64bit arches.
Added a place holder for tcp tsq_flags, next to sk_wmem_alloc
to speed up tcp_wfree() in the following patch.
I have not added ____cacheline_aligned_in_smp, this might be done later.
I prefer doing this once inet and tcp/udp sockets reorg is also done.
Tested with both TCP and UDP.
UDP receiver performance under flood increased by ~20 % :
Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops
was moved away from a critical cache line, now mostly read and shared.
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned int sk_napi_id; /* 0x100 0x4 */
int sk_rcvbuf; /* 0x104 0x4 */
struct sk_filter * sk_filter; /* 0x108 0x8 */
union {
struct socket_wq * sk_wq; /* 0x8 */
struct socket_wq * sk_wq_raw; /* 0x8 */
}; /* 0x110 0x8 */
struct xfrm_policy * sk_policy[2]; /* 0x118 0x10 */
struct dst_entry * sk_rx_dst; /* 0x128 0x8 */
struct dst_entry * sk_dst_cache; /* 0x130 0x8 */
atomic_t sk_omem_alloc; /* 0x138 0x4 */
int sk_sndbuf; /* 0x13c 0x4 */
/* --- cacheline 5 boundary (320 bytes) --- */
int sk_wmem_queued; /* 0x140 0x4 */
atomic_t sk_wmem_alloc; /* 0x144 0x4 */
long unsigned int sk_tsq_flags; /* 0x148 0x8 */
struct sk_buff * sk_send_head; /* 0x150 0x8 */
struct sk_buff_head sk_write_queue; /* 0x158 0x18 */
__s32 sk_peek_off; /* 0x170 0x4 */
int sk_write_pending; /* 0x174 0x4 */
long int sk_sndtimeo; /* 0x178 0x8 */
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------
1 file changed, 27 insertions(+), 24 deletions(-)
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -343,6 +343,9 @@ struct sock {
#define sk_rxhash __sk_common.skc_rxhash
socket_lock_t sk_lock;
+ atomic_t sk_drops;
+ int sk_rcvlowat;
+ struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
- int sk_forward_alloc;
- __u32 sk_txhash;
+ int sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
- unsigned int sk_napi_id;
unsigned int sk_ll_usec;
+ /* ===== mostly read cache line ===== */
+ unsigned int sk_napi_id;
#endif
- atomic_t sk_drops;
int sk_rcvbuf;
struct sk_filter __rcu *sk_filter;
@@ -379,11 +381,30 @@ struct sock {
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
- /* Note: 32bit hole on 64bit arches */
- atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
+
+ /* ===== cache line for TX ===== */
+ int sk_wmem_queued;
+ atomic_t sk_wmem_alloc;
+ unsigned long sk_tsq_flags;
+ struct sk_buff *sk_send_head;
struct sk_buff_head sk_write_queue;
+ __s32 sk_peek_off;
+ int sk_write_pending;
+ long sk_sndtimeo;
+ struct timer_list sk_timer;
+ __u32 sk_priority;
+ __u32 sk_mark;
+ u32 sk_pacing_rate; /* bytes per second */
+ u32 sk_max_pacing_rate;
+ struct page_frag sk_frag;
+ netdev_features_t sk_route_caps;
+ netdev_features_t sk_route_nocaps;
+ int sk_gso_type;
+ unsigned int sk_gso_max_size;
+ gfp_t sk_allocation;
+ __u32 sk_txhash;
/*
* Because of non atomicity rules, all
@@ -399,41 +420,23 @@ struct sock {
#define SK_PROTOCOL_MAX U8_MAX
kmemcheck_bitfield_end(flags);
- int sk_wmem_queued;
- gfp_t sk_allocation;
- u32 sk_pacing_rate; /* bytes per second */
- u32 sk_max_pacing_rate;
- netdev_features_t sk_route_caps;
- netdev_features_t sk_route_nocaps;
- int sk_gso_type;
- unsigned int sk_gso_max_size;
u16 sk_gso_max_segs;
- int sk_rcvlowat;
unsigned long sk_lingertime;
- struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- __u32 sk_priority;
- __u32 sk_mark;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
- long sk_sndtimeo;
- struct timer_list sk_timer;
ktime_t sk_stamp;
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
struct socket *sk_socket;
void *sk_user_data;
- struct page_frag sk_frag;
- struct sk_buff *sk_send_head;
- __s32 sk_peek_off;
- int sk_write_pending;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif

@ -0,0 +1,176 @@
From 7aa5470c2c09265902b5e4289afa82e4e7c2987e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:57 -0800
Subject: [PATCH 08/10] tcp: tsq: move tsq_flags close to sk_wmem_alloc
tsq_flags being in the same cache line than sk_wmem_alloc
makes a lot of sense. Both fields are changed from tcp_wfree()
and more generally by various TSQ related functions.
Prior patch made room in struct sock and added sk_tsq_flags,
this patch deletes tsq_flags from struct tcp_sock.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/tcp.h | 1 -
net/ipv4/tcp.c | 4 ++--
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv4/tcp_output.c | 24 +++++++++++-------------
net/ipv4/tcp_timer.c | 4 ++--
net/ipv6/tcp_ipv6.c | 2 +-
6 files changed, 17 insertions(+), 20 deletions(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -192,7 +192,6 @@ struct tcp_sock {
u32 tsoffset; /* timestamp offset */
struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
- unsigned long tsq_flags;
/* Data for direct copy to user */
struct {
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -665,9 +665,9 @@ static void tcp_push(struct sock *sk, in
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -446,7 +446,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,14 +767,15 @@ static void tcp_tasklet_func(unsigned lo
list_for_each_safe(q, n, &list) {
tp = list_entry(q, struct tcp_sock, tsq_node);
list_del(&tp->tsq_node);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk = (struct sock *)tp;
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
if (!sk->sk_lock.owned &&
- test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
- clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
tcp_tsq_handler(sk);
}
bh_unlock_sock(sk);
@@ -797,16 +798,15 @@ static void tcp_tasklet_func(unsigned lo
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
@@ -878,7 +878,7 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
bool empty;
@@ -886,7 +886,7 @@ void tcp_wfree(struct sk_buff *skb)
goto out;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
- nval = cmpxchg(&tp->tsq_flags, oval, nval);
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;
@@ -2093,7 +2093,7 @@ static bool tcp_small_queue_check(struct
skb->prev == sk->sk_write_queue.next)
return false;
- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2191,8 +2191,8 @@ static bool tcp_write_xmit(struct sock *
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
- clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
@@ -3495,8 +3495,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -311,7 +311,7 @@ static void tcp_delack_timer(unsigned lo
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -594,7 +594,7 @@ static void tcp_write_timer(unsigned lon
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -404,7 +404,7 @@ static void tcp_v6_err(struct sk_buff *s
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
- &tp->tsq_flags))
+ &sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}

@ -0,0 +1,40 @@
From 0a9648f1293966c838dc570da73c15a76f4c89d6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 21 Dec 2016 05:42:43 -0800
Subject: [PATCH 09/10] tcp: add a missing barrier in tcp_tasklet_func()
Madalin reported crashes happening in tcp_tasklet_func() on powerpc64
Before TSQ_QUEUED bit is cleared, we must ensure the changes done
by list_del(&tp->tsq_node); are committed to memory, otherwise
corruption might happen, as an other cpu could catch TSQ_QUEUED
clearance too soon.
We can notice that old kernels were immune to this bug, because
TSQ_QUEUED was cleared after a bh_lock_sock(sk)/bh_unlock_sock(sk)
section, but they could have missed a kick to write additional bytes,
when NIC interrupts for a given flow are spread to multiple cpus.
Affected TCP flows would need an incoming ACK or RTO timer to add more
packets to the pipe. So overall situation should be better now.
Fixes: b223feb9de2a ("tcp: tsq: add shortcut in tcp_tasklet_func()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Madalin Bucur <madalin.bucur@nxp.com>
Tested-by: Madalin Bucur <madalin.bucur@nxp.com>
Tested-by: Xing Lei <xing.lei@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv4/tcp_output.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -769,6 +769,7 @@ static void tcp_tasklet_func(unsigned lo
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
+ smp_mb__before_atomic();
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
if (!sk->sk_lock.owned &&
Loading…
Cancel
Save