Commit 0972457f authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'dccp-tcp-fix-bhash2-issues-related-to-warn_on-in-inet_csk_get_port'

Kuniyuki Iwashima says:

====================
dccp/tcp: Fix bhash2 issues related to WARN_ON() in inet_csk_get_port().

syzkaller was hitting a WARN_ON() in inet_csk_get_port() in the 4th patch,
which was because we forgot to fix up bhash2 bucket when connect() for a
socket bound to a wildcard address fails in __inet_stream_connect().

There was a similar report [0], but its repro does not fire the WARN_ON() due
to inconsistent error handling.

When connect() for a socket bound to a wildcard address fails, saddr may or
may not be reset depending on where the failure happens.  When we fail in
__inet_stream_connect(), sk->sk_prot->disconnect() resets saddr.  OTOH, in
(dccp|tcp)_v[46]_connect(), if we fail after inet_hash6?_connect(), we
forget to reset saddr.

We fix this inconsistent error handling in the 1st patch, and then we'll
fix the bhash2 WARN_ON() issue.

Note that there is still an issue in that we reset saddr without checking
if there are conflicting sockets in bhash and bhash2, but this should be
another series.

See [1][2] for the previous discussion.

[0]: https://lore.kernel.org/netdev/0000000000003f33bc05dfaf44fe@google.com/
[1]: https://lore.kernel.org/netdev/20221029001249.86337-1-kuniyu@amazon.com/
[2]: https://lore.kernel.org/netdev/20221103172419.20977-1-kuniyu@amazon.com/
[3]: https://lore.kernel.org/netdev/20221118081906.053d5231@kernel.org/T/#m00aafedb29ff0b55d5e67aef0252ef1baaf4b6ee
====================

Link: https://lore.kernel.org/r/20221119014914.31792-1-kuniyu@amazon.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 2dc4ac91 e0833d1f
......@@ -281,7 +281,8 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
* sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's
* rcv_saddr field should already have been updated when this is called.
*/
int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk);
int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family);
void inet_bhash2_reset_saddr(struct sock *sk);
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, unsigned short port);
......
......@@ -45,11 +45,10 @@ static unsigned int dccp_v4_pernet_id __read_mostly;
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
__be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
......@@ -91,26 +90,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
daddr = fl4->daddr;
if (inet->inet_saddr == 0) {
if (inet_csk(sk)->icsk_bind2_hash) {
prev_addr_hashbucket =
inet_bhashfn_portaddr(&dccp_hashinfo, sk,
sock_net(sk),
inet->inet_num);
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
}
inet->inet_saddr = fl4->saddr;
}
sk_rcv_saddr_set(sk, inet->inet_saddr);
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
if (err) {
inet->inet_saddr = 0;
sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
} else {
sk_rcv_saddr_set(sk, inet->inet_saddr);
}
inet->inet_dport = usin->sin_port;
......@@ -157,6 +143,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* This unhashes the socket and releases the local port, if necessary.
*/
dccp_set_state(sk, DCCP_CLOSED);
inet_bhash2_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
......
......@@ -934,26 +934,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
if (saddr == NULL) {
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct in6_addr prev_v6_rcv_saddr;
if (icsk->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
sk, sock_net(sk),
inet->inet_num);
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
}
saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
goto failure;
}
}
err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
if (err)
goto failure;
}
/* set the source address */
......@@ -985,6 +970,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
dccp_set_state(sk, DCCP_CLOSED);
inet_bhash2_reset_saddr(sk);
__sk_dst_reset(sk);
failure:
inet->inet_dport = 0;
......
......@@ -279,8 +279,7 @@ int dccp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
inet_bhash2_reset_saddr(sk);
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
......
......@@ -1230,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw);
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_bind_hashbucket *prev_addr_hashbucket;
struct inet_sock *inet = inet_sk(sk);
__be32 old_saddr = inet->inet_saddr;
__be32 daddr = inet->inet_daddr;
......@@ -1260,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
return 0;
}
prev_addr_hashbucket =
inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk,
sock_net(sk), inet->inet_num);
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
if (err) {
inet->inet_saddr = old_saddr;
inet->inet_rcv_saddr = old_saddr;
ip_rt_put(rt);
return err;
}
......
......@@ -858,34 +858,80 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}
int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk)
static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
if (family == AF_INET) {
inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else {
sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
}
#endif
}
static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2, *new_tb2;
int l3mdev = inet_sk_bound_l3mdev(sk);
struct inet_bind_hashbucket *head2;
int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
int bhash;
if (!inet_csk(sk)->icsk_bind2_hash) {
/* Not bind()ed before. */
if (reset)
inet_reset_saddr(sk);
else
inet_update_saddr(sk, saddr, family);
return 0;
}
/* Allocate a bind2 bucket ahead of time to avoid permanently putting
* the bhash2 table in an inconsistent state if a new tb2 bucket
* allocation fails.
*/
new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
if (!new_tb2)
if (!new_tb2) {
if (reset) {
/* The (INADDR_ANY, port) bucket might have already
* been freed, then we cannot fixup icsk_bind2_hash,
* so we give up and unlink sk from bhash/bhash2 not
* to leave inconsistency in bhash2.
*/
inet_put_port(sk);
inet_reset_saddr(sk);
}
return -ENOMEM;
}
bhash = inet_bhashfn(net, port, hinfo->bhash_size);
head = &hinfo->bhash[bhash];
head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
if (prev_saddr) {
spin_lock_bh(&prev_saddr->lock);
__sk_del_bind2_node(sk);
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
inet_csk(sk)->icsk_bind2_hash);
spin_unlock_bh(&prev_saddr->lock);
}
/* If we change saddr locklessly, another thread
* iterating over bhash might see corrupted address.
*/
spin_lock_bh(&head->lock);
spin_lock_bh(&head2->lock);
spin_lock(&head2->lock);
__sk_del_bind2_node(sk);
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
spin_unlock(&head2->lock);
if (reset)
inet_reset_saddr(sk);
else
inet_update_saddr(sk, saddr, family);
head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
spin_lock(&head2->lock);
tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
if (!tb2) {
tb2 = new_tb2;
......@@ -893,15 +939,29 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc
}
sk_add_bind2_node(sk, &tb2->owners);
inet_csk(sk)->icsk_bind2_hash = tb2;
spin_unlock_bh(&head2->lock);
spin_unlock(&head2->lock);
spin_unlock_bh(&head->lock);
if (tb2 != new_tb2)
kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
return 0;
}
int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
void inet_bhash2_reset_saddr(struct sock *sk)
{
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32bit integers (vs RFC 'short integers')
* because 2^16 is not a multiple of num_ephemeral and this
......
......@@ -3114,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
inet_bhash2_reset_saddr(sk);
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
......
......@@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_timewait_death_row *tcp_death_row;
__be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct ip_options_rcu *inet_opt;
struct net *net = sock_net(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
......@@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (!inet->inet_saddr) {
if (inet_csk(sk)->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
sk, net, inet->inet_num);
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
}
inet->inet_saddr = fl4->saddr;
}
sk_rcv_saddr_set(sk, inet->inet_saddr);
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
if (err) {
inet->inet_saddr = 0;
sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
} else {
sk_rcv_saddr_set(sk, inet->inet_saddr);
}
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
......@@ -343,6 +331,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
inet_bhash2_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
......
......@@ -292,24 +292,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (!saddr) {
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct in6_addr prev_v6_rcv_saddr;
if (icsk->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
sk, net, inet->inet_num);
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
}
saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
goto failure;
}
}
err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
if (err)
goto failure;
}
/* set the source address */
......@@ -359,6 +346,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
inet_bhash2_reset_saddr(sk);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment