Commit 1a2e10a2 authored by David S. Miller's avatar David S. Miller

Merge branch 'Rework-ip_ra_chain-protection'

Kirill Tkhai says:

====================
Rework ip_ra_chain protection

Commit 1215e51e "ipv4: fix a deadlock in ip_ra_control"
made rtnl_lock() be used in raw_close(). This function is called
on every RAW socket destruction, so that rtnl_mutex is taken
every time. This scales very sadly. I observe cleanup_net()
spending a lot of time in rtnl_lock() and raw_close() is one
of the biggest rtnl user (since we have percpu net->ipv4.icmp_sk).

This patchset reworks the locking: reverts the problem commit
and its descendant, and introduces rtnl-independent locking.
This may have a continuation, and someone may work on killing
rtnl_lock() in mrtsock_destruct() in the future.

v3: Change patches order: [2/5] and [3/5].
v2: Fix sparse warning [4/5], as reported by kbuild test robot.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents f2d254fa d9ff3049
......@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
return 0;
}
/* Special input handler for packets caught by router alert option.
They are selected only by protocol field, and then processed likely
local ones; but only if someone wants them! Otherwise, router
not running rsvpd will kill RSVP.
It is user level problem, what it will make with them.
I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
but receiver should be enough clever f.e. to forward mtrace requests,
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain {
struct ip_ra_chain __rcu *next;
struct sock *sk;
......@@ -101,8 +112,6 @@ struct ip_ra_chain {
struct rcu_head rcu;
};
extern struct ip_ra_chain __rcu *ip_ra_chain;
/* IP flags. */
#define IP_CE 0x8000 /* Flag: "Congestion" */
#define IP_DF 0x4000 /* Flag: "Don't Fragment" */
......
......@@ -49,6 +49,8 @@ struct netns_ipv4 {
#endif
struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt;
struct ip_ra_chain __rcu *ra_chain;
struct mutex ra_mutex;
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules;
......
......@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->user_ns = user_ns;
idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock);
mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
......
......@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report
......@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
*/
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
net_eq(sock_net(sk), net)) {
sk->sk_bound_dev_if == dev->ifindex)) {
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
......
......@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
return 0;
}
/* Special input handler for packets caught by router alert option.
They are selected only by protocol field, and then processed likely
local ones; but only if someone wants them! Otherwise, router
not running rsvpd will kill RSVP.
It is user level problem, what it will make with them.
I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
but receiver should be enough clever f.e. to forward mtrace requests,
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain __rcu *ip_ra_chain;
static void ip_ra_destroy_rcu(struct rcu_head *head)
{
struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
......@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on,
{
struct ip_ra_chain *ra, *new_ra;
struct ip_ra_chain __rcu **rap;
struct net *net = sock_net(sk);
if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
for (rap = &ip_ra_chain;
(ra = rtnl_dereference(*rap)) != NULL;
mutex_lock(&net->ipv4.ra_mutex);
for (rap = &net->ipv4.ra_chain;
(ra = rcu_dereference_protected(*rap,
lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
mutex_unlock(&net->ipv4.ra_mutex);
kfree(new_ra);
return -EADDRINUSE;
}
/* dont let ip_call_ra_chain() use sk again */
ra->sk = NULL;
RCU_INIT_POINTER(*rap, ra->next);
mutex_unlock(&net->ipv4.ra_mutex);
if (ra->destructor)
ra->destructor(sk);
......@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return 0;
}
}
if (!new_ra)
if (!new_ra) {
mutex_unlock(&net->ipv4.ra_mutex);
return -ENOBUFS;
}
new_ra->sk = sk;
new_ra->destructor = destructor;
RCU_INIT_POINTER(new_ra->next, ra);
rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
mutex_unlock(&net->ipv4.ra_mutex);
return 0;
}
......@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname)
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_UNBLOCK_SOURCE:
case IP_ROUTER_ALERT:
return true;
}
return false;
......@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* If optlen==0, it is equivalent to val == 0 */
if (optname == IP_ROUTER_ALERT)
return ip_ra_control(sk, val ? 1 : 0, NULL);
if (ip_mroute_opt(optname))
return ip_mroute_setsockopt(sk, optname, optval, optlen);
......@@ -1149,9 +1144,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
goto e_inval;
inet->mc_all = val;
break;
case IP_ROUTER_ALERT:
err = ip_ra_control(sk, val ? 1 : 0, NULL);
break;
case IP_FREEBIND:
if (optlen < 1)
......
......@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
struct net *net = sock_net(sk);
struct mr_table *mrt;
ASSERT_RTNL();
rtnl_lock();
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
......@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
mroute_clean_tables(mrt, false);
}
}
rtnl_unlock();
}
/* Socket options and virtual interface manipulation. The whole
......@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
if (sk != rcu_access_pointer(mrt->mroute_sk)) {
ret = -EACCES;
} else {
/* We need to unlock here because mrtsock_destruct takes
* care of rtnl itself and we can't change that due to
* the IP_ROUTER_ALERT setsockopt which runs without it.
*/
rtnl_unlock();
ret = ip_ra_control(sk, 0, NULL);
goto out_unlock;
goto out;
}
break;
case MRT_ADD_VIF:
......@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
}
out_unlock:
rtnl_unlock();
out:
return ret;
}
......
......@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
/*
* Raw sockets may have direct kernel references. Kill them.
*/
rtnl_lock();
ip_ra_control(sk, 0, NULL);
rtnl_unlock();
sk_common_release(sk);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment