Commit 6b200661 authored by Eric Dumazet, committed by Luis Henriques

ipv4: tcp: get rid of ugly unicast_sock

commit bdbbb852 upstream.

In commit be9f4a44 ("ipv4: tcp: remove per net tcp_sock")
I tried to address contention on a socket lock, but the solution
I chose was horrible :

commit 3a7c384f ("ipv4: tcp: unicast_sock should not land outside
of TCP stack") addressed a selinux regression.

commit 0980e56e ("ipv4: tcp: set unicast_sock uc_ttl to -1")
took care of another regression.

commit b5ec8eea ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd ("tcp: ipv4: initialize unicast_sock sk_pacing_rate")
was another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan()
call, to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in
hostile environments, as we do not want to allocate a flow structure
for every RST packet sent in response to a spoofed packet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
[ luis: backported to 3.16: based on davem's backport to 3.14 ]
Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
parent 57e57144
...@@ -180,7 +180,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) ...@@ -180,7 +180,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
} }
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
__be32 saddr, const struct ip_reply_arg *arg, __be32 saddr, const struct ip_reply_arg *arg,
unsigned int len); unsigned int len);
......
...@@ -52,6 +52,7 @@ struct netns_ipv4 { ...@@ -52,6 +52,7 @@ struct netns_ipv4 {
struct inet_peer_base *peers; struct inet_peer_base *peers;
struct tcpm_hash_bucket *tcp_metrics_hash; struct tcpm_hash_bucket *tcp_metrics_hash;
unsigned int tcp_metrics_hash_log; unsigned int tcp_metrics_hash_log;
struct sock * __percpu *tcp_sk;
struct netns_frags frags; struct netns_frags frags;
#ifdef CONFIG_NETFILTER #ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter; struct xt_table *iptable_filter;
......
...@@ -1501,24 +1501,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, ...@@ -1501,24 +1501,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
/* /*
* Generic function to send a packet as reply to another packet. * Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far. * Used to send some TCP resets/acks so far.
*
* Use a fake percpu inet socket to avoid false sharing and contention.
*/ */
static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
.sk = {
.__sk_common = {
.skc_refcnt = ATOMIC_INIT(1),
},
.sk_wmem_alloc = ATOMIC_INIT(1),
.sk_allocation = GFP_ATOMIC,
.sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
.sk_pacing_rate = ~0U,
},
.pmtudisc = IP_PMTUDISC_WANT,
.uc_ttl = -1,
};
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
__be32 saddr, const struct ip_reply_arg *arg, __be32 saddr, const struct ip_reply_arg *arg,
unsigned int len) unsigned int len)
{ {
...@@ -1526,9 +1510,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ...@@ -1526,9 +1510,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
struct ipcm_cookie ipc; struct ipcm_cookie ipc;
struct flowi4 fl4; struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb); struct rtable *rt = skb_rtable(skb);
struct net *net = sock_net(sk);
struct sk_buff *nskb; struct sk_buff *nskb;
struct sock *sk;
struct inet_sock *inet;
int err; int err;
if (ip_options_echo(&replyopts.opt.opt, skb)) if (ip_options_echo(&replyopts.opt.opt, skb))
...@@ -1559,15 +1542,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ...@@ -1559,15 +1542,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
if (IS_ERR(rt)) if (IS_ERR(rt))
return; return;
inet = &get_cpu_var(unicast_sock); inet_sk(sk)->tos = arg->tos;
inet->tos = arg->tos;
sk = &inet->sk;
sk->sk_priority = skb->priority; sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_bound_dev_if = arg->bound_dev_if;
sock_net_set(sk, net);
__skb_queue_head_init(&sk->sk_write_queue);
sk->sk_sndbuf = sysctl_wmem_default; sk->sk_sndbuf = sysctl_wmem_default;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT); len, 0, &ipc, &rt, MSG_DONTWAIT);
...@@ -1583,13 +1562,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ...@@ -1583,13 +1562,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum)); arg->csum));
nskb->ip_summed = CHECKSUM_NONE; nskb->ip_summed = CHECKSUM_NONE;
skb_orphan(nskb);
skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4); ip_push_pending_frames(sk, &fl4);
} }
out: out:
put_cpu_var(unicast_sock);
ip_rt_put(rt); ip_rt_put(rt);
} }
......
...@@ -684,7 +684,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ...@@ -684,7 +684,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
net = dev_net(skb_dst(skb)->dev); net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos; arg.tos = ip_hdr(skb)->tos;
ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
...@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ...@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
if (oif) if (oif)
arg.bound_dev_if = oif; arg.bound_dev_if = oif;
arg.tos = tos; arg.tos = tos;
ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
...@@ -2532,14 +2534,39 @@ struct proto tcp_prot = { ...@@ -2532,14 +2534,39 @@ struct proto tcp_prot = {
}; };
EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_prot);
/*
 * Per-netns teardown: destroy the per-cpu TCP control sockets created by
 * tcp_sk_init(), then free the percpu pointer array itself.
 *
 * NOTE(review): this is also reached via tcp_sk_init()'s failure path,
 * where slots past the failing CPU were never populated; alloc_percpu()
 * zeroes the array, so those entries are NULL — confirm that
 * inet_ctl_sock_destroy() tolerates a NULL socket on this kernel.
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
/*
 * Per-netns setup: allocate one kernel control socket per possible CPU so
 * TCP resets/acks (ip_send_unicast_reply()) no longer contend on a single
 * shared socket lock.
 *
 * Returns 0 on success, -ENOMEM if the percpu array cannot be allocated,
 * or the inet_ctl_sock_create() error code; on partial failure all sockets
 * created so far are torn down via tcp_sk_exit().
 */
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;

fail:
	/* Unwind: destroys the sockets created so far and frees the array. */
	tcp_sk_exit(net);
	return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment