Commit e9bd0cca authored by Kuniyuki Iwashima, committed by Jakub Kicinski

tcp: Don't allocate tcp_death_row outside of struct netns_ipv4.

We will soon introduce an optional per-netns ehash and access hash
tables via net->ipv4.tcp_death_row->hashinfo instead of &tcp_hashinfo
in most places.

It could harm the fast path because dereferences of two fields in net
and tcp_death_row might incur two extra cache line misses.  To save one
dereference, let's place tcp_death_row back in netns_ipv4 and fetch
hashinfo via net->ipv4.tcp_death_row.hashinfo.
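
For illustration only (not taken from the patch), the layout difference can be
sketched with mock C types; every name below is invented for the example:

	struct inet_hashinfo;						/* opaque here */

	struct death_row_old { struct inet_hashinfo *hashinfo; };
	struct netns_old { struct death_row_old *tcp_death_row; };	/* old: separate kzalloc()ed object */

	struct death_row_new { struct inet_hashinfo *hashinfo; };
	struct netns_new { struct death_row_new tcp_death_row; };	/* new: embedded in the netns */

	static struct inet_hashinfo *get_old(struct netns_old *net)
	{
		return net->tcp_death_row->hashinfo;	/* two dependent loads, two possible misses */
	}

	static struct inet_hashinfo *get_new(struct netns_new *net)
	{
		return net->tcp_death_row.hashinfo;	/* one load, same allocation as net */
	}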

Note tcp_death_row was initially placed in netns_ipv4, and commit
fbb82952 ("tcp: allocate tcp_death_row outside of struct netns_ipv4")
changed it to a pointer so that we can fire TIME_WAIT timers after freeing
net.  However, we don't do so after commit 04c494e6 ("Revert "tcp/dccp:
get rid of inet_twsk_purge()""), so we need not define tcp_death_row as a
pointer.

Also, we move refcount_dec_and_test(&tw_refcount) from tcp_sk_exit() to
tcp_sk_exit_batch() as a debug check.
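
The resulting tw_refcount lifecycle, sketched below under the assumption that
inet_twsk_alloc() still takes one reference per TIME_WAIT socket, is what makes
the batch-exit check meaningful:

	/* Sketch only, not a hunk from this patch:
	 *
	 *   tcp_sk_init()        refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	 *   inet_twsk_alloc()    refcount_inc()  - one reference per TIME_WAIT socket
	 *   inet_twsk_kill()     refcount_dec()  - dropped when the timewait socket dies
	 *   tcp_sk_exit_batch()  WARN_ON_ONCE(!refcount_dec_and_test(...));
	 *
	 * tcp_sk_exit_batch() runs after inet_twsk_purge(), so every timewait
	 * socket of the dying netns is already gone and the count must be back
	 * to its initial value of 1; the WARN_ON_ONCE() catches any leak.
	 */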

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 08eaef90
@@ -34,6 +34,7 @@ struct inet_hashinfo;
 struct inet_timewait_death_row {
	refcount_t		tw_refcount;
 
+	/* Padding to avoid false sharing, tw_refcount can be often written */
	struct inet_hashinfo	*hashinfo ____cacheline_aligned_in_smp;
	int			sysctl_max_tw_buckets;
 };
@@ -41,7 +42,7 @@ struct inet_timewait_death_row {
 struct tcp_fastopen_context;
 
 struct netns_ipv4 {
-	struct inet_timewait_death_row *tcp_death_row;
+	struct inet_timewait_death_row tcp_death_row;
 #ifdef CONFIG_SYSCTL
	struct ctl_table_header	*forw_hdr;
...
@@ -59,9 +59,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
	inet_twsk_bind_unhash(tw, hashinfo);
	spin_unlock(&bhead->lock);
 
-	if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
-		kfree(tw->tw_dr);
-
+	refcount_dec(&tw->tw_dr->tw_refcount);
	inet_twsk_put(tw);
 }
...
@@ -59,7 +59,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
	socket_seq_show(seq);
	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
		   sock_prot_inuse_get(net, &tcp_prot), orphans,
-		   refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+		   refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,
		   sockets, proto_memory_allocated(&tcp_prot));
	seq_printf(seq, "UDP: inuse %d mem %ld\n",
		   sock_prot_inuse_get(net, &udp_prot),
...
@@ -530,10 +530,9 @@ static struct ctl_table ipv4_table[] = {
 };
 
 static struct ctl_table ipv4_net_table[] = {
-	/* tcp_max_tw_buckets must be first in this table. */
	{
		.procname	= "tcp_max_tw_buckets",
-/*		.data		= &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+		.data		= &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
@@ -1361,8 +1360,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
	if (!table)
		goto err_alloc;
 
-	/* skip first entry (sysctl_max_tw_buckets) */
-	for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+	for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
		if (table[i].data) {
			/* Update the variables to point into
			 * the current struct net
@@ -1377,8 +1375,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
		}
	}
 
-	table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
-
	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!net->ipv4.ipv4_hdr)
		goto err_reg;
...
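With tcp_death_row embedded at a fixed offset inside struct net, the
tcp_max_tw_buckets entry can carry a static .data pointer into init_net like
every other entry, so the "must be first in this table" special case and the
runtime table[0].data assignment go away.  A sketch of the generic fixup loop
it now falls under, assuming the usual init_net-relative adjustment done in
ipv4_sysctl_init_net():

	for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
		if (table[i].data) {
			/* Rebase the init_net pointer onto the current netns;
			 * this only works for members at a fixed offset in
			 * struct net, which tcp_death_row now is.
			 */
			table[i].data += (void *)net - (void *)&init_net;
		}
	}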
@@ -292,7 +292,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
-	tcp_death_row = net->ipv4.tcp_death_row;
+	tcp_death_row = &net->ipv4.tcp_death_row;
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;
@@ -3091,13 +3091,9 @@ EXPORT_SYMBOL(tcp_prot);
 
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
-
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
-
-	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
-		kfree(tcp_death_row);
 }
 
 static int __net_init tcp_sk_init(struct net *net)
@@ -3129,13 +3125,10 @@ static int __net_init tcp_sk_init(struct net *net)
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
 
-	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
-	if (!net->ipv4.tcp_death_row)
-		return -ENOMEM;
-	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
+	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	cnt = tcp_hashinfo.ehash_mask + 1;
-	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
-	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
 
	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
@@ -3201,8 +3194,10 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
 
-	list_for_each_entry(net, net_exit_list, exit_list)
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
+	}
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
...
@@ -250,7 +250,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
	struct net *net = sock_net(sk);
	struct inet_timewait_sock *tw;
 
-	tw = inet_twsk_alloc(sk, net->ipv4.tcp_death_row, state);
+	tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);
 
	if (tw) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
...
@@ -325,7 +325,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
	inet->inet_dport = usin->sin6_port;
 
	tcp_set_state(sk, TCP_SYN_SENT);
-	tcp_death_row = net->ipv4.tcp_death_row;
+	tcp_death_row = &net->ipv4.tcp_death_row;
	err = inet6_hash_connect(tcp_death_row, sk);
	if (err)
		goto late_failure;
...