Commit ad521763 authored by David S. Miller

Merge branch 'rds-use-RCU-between-work-enqueue-and-connection-teardown'

Sowmini Varadhan says:

====================
rds: use RCU between work-enqueue and connection teardown

This patchset follows up on the root-cause mentioned in
https://www.spinics.net/lists/netdev/msg472849.html

Patch 1 implements some code refactoring that was suggested
as an enhancement in http://patchwork.ozlabs.org/patch/843157/
It replaces the c_destroy_in_prog bit in rds_connection with
an atomically managed flag in rds_conn_path.

Patch 2 builds on Patch 1 and uses RCU to make sure that
work is only enqueued if connection destroy is not already
in progress: the test-flag-and-enqueue is done under rcu_read_lock,
while destroy first sets the flag, calls synchronize_rcu to
wait for existing reader sections to complete, and only then starts
the work-cancellation.

Since I have not been able to reproduce the original stack traces
reported by syzbot, and these fixes for a race condition are based
on code inspection, I am not adding Reported-by tags
at this time.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents eb9aa1bf 3db6e0d1
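
For readers following the cover letter, here is a minimal sketch of the enqueue/teardown pattern the two patches implement. It is an illustration under assumed names: struct model_conn_path, MODEL_DESTROY_PENDING and the model_* helpers do not exist in the RDS code, while the kernel primitives used (rcu_read_lock()/rcu_read_unlock(), synchronize_rcu(), test_bit()/set_bit(), queue_delayed_work(), cancel_delayed_work_sync()) are the same ones the patches use.

/*
 * Condensed model of the work-enqueue vs. connection-teardown handshake.
 * Field and flag names mirror the patch; the struct and both helpers are
 * hypothetical stand-ins, not the actual RDS implementation.
 */
#include <linux/bitops.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

#define MODEL_DESTROY_PENDING   4       /* mirrors RDS_DESTROY_PENDING */

struct model_conn_path {
        unsigned long           cp_flags;
        struct delayed_work     cp_send_w;
};

/* Enqueue side: only schedule work while no teardown is in flight. */
static void model_queue_send(struct workqueue_struct *wq,
                             struct model_conn_path *cp)
{
        rcu_read_lock();
        if (!test_bit(MODEL_DESTROY_PENDING, &cp->cp_flags))
                queue_delayed_work(wq, &cp->cp_send_w, 0);
        rcu_read_unlock();
}

/* Teardown side: set the flag, wait out readers, then cancel safely. */
static void model_destroy(struct model_conn_path *cp)
{
        set_bit(MODEL_DESTROY_PENDING, &cp->cp_flags);
        synchronize_rcu();      /* readers that raced with the flag update
                                 * have exited their critical sections, so
                                 * any work they queued is caught below
                                 */
        cancel_delayed_work_sync(&cp->cp_send_w);
}

The ordering is the whole point: the flag is set before synchronize_rcu(), and the cancel_*_sync() calls run only after the grace period, which closes the window in which a concurrent enqueuer could requeue work against a connection that is being torn down.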
@@ -219,7 +219,11 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
         spin_lock_irqsave(&rds_cong_lock, flags);
 
         list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
-                if (!test_and_set_bit(0, &conn->c_map_queued)) {
+                struct rds_conn_path *cp = &conn->c_path[0];
+
+                rcu_read_lock();
+                if (!test_and_set_bit(0, &conn->c_map_queued) &&
+                    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
                         rds_stats_inc(s_cong_update_queued);
                         /* We cannot inline the call to rds_send_xmit() here
                          * for two reasons (both pertaining to a TCP transport):
@@ -235,9 +239,9 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
                          * therefore trigger warnings.
                          * Defer the xmit to rds_send_worker() instead.
                          */
-                        queue_delayed_work(rds_wq,
-                                           &conn->c_path[0].cp_send_w, 0);
+                        queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
                 }
+                rcu_read_unlock();
         }
 
         spin_unlock_irqrestore(&rds_cong_lock, flags);
...
...@@ -366,8 +366,6 @@ void rds_conn_shutdown(struct rds_conn_path *cp) ...@@ -366,8 +366,6 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
* to the conn hash, so we never trigger a reconnect on this * to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */ * conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work_sync(&cp->cp_conn_w); cancel_delayed_work_sync(&cp->cp_conn_w);
if (conn->c_destroy_in_prog)
return;
rcu_read_lock(); rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) { if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock(); rcu_read_unlock();
...@@ -384,10 +382,13 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) ...@@ -384,10 +382,13 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
{ {
struct rds_message *rm, *rtmp; struct rds_message *rm, *rtmp;
set_bit(RDS_DESTROY_PENDING, &cp->cp_flags);
if (!cp->cp_transport_data) if (!cp->cp_transport_data)
return; return;
/* make sure lingering queued work won't try to ref the conn */ /* make sure lingering queued work won't try to ref the conn */
synchronize_rcu();
cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_send_w);
cancel_delayed_work_sync(&cp->cp_recv_w); cancel_delayed_work_sync(&cp->cp_recv_w);
...@@ -405,6 +406,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) ...@@ -405,6 +406,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
if (cp->cp_xmit_rm) if (cp->cp_xmit_rm)
rds_message_put(cp->cp_xmit_rm); rds_message_put(cp->cp_xmit_rm);
WARN_ON(delayed_work_pending(&cp->cp_send_w));
WARN_ON(delayed_work_pending(&cp->cp_recv_w));
WARN_ON(delayed_work_pending(&cp->cp_conn_w));
WARN_ON(work_pending(&cp->cp_down_w));
cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
} }
...@@ -426,7 +432,6 @@ void rds_conn_destroy(struct rds_connection *conn) ...@@ -426,7 +432,6 @@ void rds_conn_destroy(struct rds_connection *conn)
"%pI4\n", conn, &conn->c_laddr, "%pI4\n", conn, &conn->c_laddr,
&conn->c_faddr); &conn->c_faddr);
conn->c_destroy_in_prog = 1;
/* Ensure conn will not be scheduled for reconnect */ /* Ensure conn will not be scheduled for reconnect */
spin_lock_irq(&rds_conn_lock); spin_lock_irq(&rds_conn_lock);
hlist_del_init_rcu(&conn->c_hash_node); hlist_del_init_rcu(&conn->c_hash_node);
...@@ -685,10 +690,13 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) ...@@ -685,10 +690,13 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
{ {
atomic_set(&cp->cp_state, RDS_CONN_ERROR); atomic_set(&cp->cp_state, RDS_CONN_ERROR);
if (!destroy && cp->cp_conn->c_destroy_in_prog) rcu_read_lock();
if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
rcu_read_unlock();
return; return;
}
queue_work(rds_wq, &cp->cp_down_w); queue_work(rds_wq, &cp->cp_down_w);
rcu_read_unlock();
} }
EXPORT_SYMBOL_GPL(rds_conn_path_drop); EXPORT_SYMBOL_GPL(rds_conn_path_drop);
...@@ -705,9 +713,15 @@ EXPORT_SYMBOL_GPL(rds_conn_drop); ...@@ -705,9 +713,15 @@ EXPORT_SYMBOL_GPL(rds_conn_drop);
*/ */
void rds_conn_path_connect_if_down(struct rds_conn_path *cp) void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
{ {
rcu_read_lock();
if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
rcu_read_unlock();
return;
}
if (rds_conn_path_state(cp) == RDS_CONN_DOWN && if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
rcu_read_unlock();
} }
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
......
@@ -88,6 +88,7 @@ enum {
 #define RDS_RECONNECT_PENDING   1
 #define RDS_IN_XMIT             2
 #define RDS_RECV_REFILL         3
+#define RDS_DESTROY_PENDING     4
 
 /* Max number of multipaths per RDS connection. Must be a power of 2 */
 #define RDS_MPATH_WORKERS       8
@@ -139,8 +140,7 @@ struct rds_connection {
         __be32                  c_faddr;
         unsigned int            c_loopback:1,
                                 c_ping_triggered:1,
-                                c_destroy_in_prog:1,
-                                c_pad_to_32:29;
+                                c_pad_to_32:30;
         int                     c_npaths;
         struct rds_connection   *c_passive;
         struct rds_transport    *c_trans;
...
@@ -162,6 +162,12 @@ int rds_send_xmit(struct rds_conn_path *cp)
                 goto out;
         }
 
+        if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+                release_in_xmit(cp);
+                ret = -ENETUNREACH; /* dont requeue send work */
+                goto out;
+        }
+
         /*
          * we record the send generation after doing the xmit acquire.
          * if someone else manages to jump in and do some work, we'll use
@@ -437,7 +443,12 @@ int rds_send_xmit(struct rds_conn_path *cp)
                      !list_empty(&cp->cp_send_queue)) && !raced) {
                         if (batch_count < send_batch_count)
                                 goto restart;
-                        queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+                        rcu_read_lock();
+                        if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+                                ret = -ENETUNREACH;
+                        else
+                                queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+                        rcu_read_unlock();
                 } else if (raced) {
                         rds_stats_inc(s_send_lock_queue_raced);
                 }
@@ -1151,6 +1162,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
         else
                 cpath = &conn->c_path[0];
 
+        if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) {
+                ret = -EAGAIN;
+                goto out;
+        }
+
         rds_conn_path_connect_if_down(cpath);
 
         ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
@@ -1190,9 +1206,17 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
         rds_stats_inc(s_send_queued);
 
         ret = rds_send_xmit(cpath);
-        if (ret == -ENOMEM || ret == -EAGAIN)
-                queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
-
+        if (ret == -ENOMEM || ret == -EAGAIN) {
+                ret = 0;
+                rcu_read_lock();
+                if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags))
+                        ret = -ENETUNREACH;
+                else
+                        queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
+                rcu_read_unlock();
+        }
+        if (ret)
+                goto out;
         rds_message_put(rm);
         return payload_len;
 
@@ -1270,7 +1294,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
         rds_stats_inc(s_send_pong);
 
         /* schedule the send work on rds_wq */
-        queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+        rcu_read_lock();
+        if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+                queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+        rcu_read_unlock();
 
         rds_message_put(rm);
         return 0;
...
@@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
                  cp->cp_conn, tc, sock);
 
         if (sock) {
-                if (cp->cp_conn->c_destroy_in_prog)
+                if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
                         rds_tcp_set_linger(sock);
                 sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
                 lock_sock(sock->sk);
...
@@ -321,8 +321,12 @@ void rds_tcp_data_ready(struct sock *sk)
         ready = tc->t_orig_data_ready;
         rds_tcp_stats_inc(s_tcp_data_ready_calls);
 
-        if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM)
-                queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+        if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
+                rcu_read_lock();
+                if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+                        queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+                rcu_read_unlock();
+        }
 out:
         read_unlock_bh(&sk->sk_callback_lock);
         ready(sk);
...
@@ -202,8 +202,11 @@ void rds_tcp_write_space(struct sock *sk)
         tc->t_last_seen_una = rds_tcp_snd_una(tc);
         rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked);
 
-        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+        rcu_read_lock();
+        if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
+            !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
                 queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+        rcu_read_unlock();
 
 out:
         read_unlock_bh(&sk->sk_callback_lock);
...
@@ -87,8 +87,12 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
 
         cp->cp_reconnect_jiffies = 0;
         set_bit(0, &cp->cp_conn->c_map_queued);
-        queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
-        queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+        rcu_read_lock();
+        if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+                queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+                queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+        }
+        rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_connect_path_complete);
 
@@ -133,7 +137,10 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
         set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
         if (cp->cp_reconnect_jiffies == 0) {
                 cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
-                queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+                rcu_read_lock();
+                if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+                        queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+                rcu_read_unlock();
                 return;
         }
 
@@ -141,8 +148,11 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
         rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
                  rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
                  conn, &conn->c_laddr, &conn->c_faddr);
-        queue_delayed_work(rds_wq, &cp->cp_conn_w,
-                           rand % cp->cp_reconnect_jiffies);
+        rcu_read_lock();
+        if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+                queue_delayed_work(rds_wq, &cp->cp_conn_w,
+                                   rand % cp->cp_reconnect_jiffies);
+        rcu_read_unlock();
 
         cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
                                         rds_sysctl_reconnect_max_jiffies);
...