Commit 63561a40 authored by Paolo Abeni's avatar Paolo Abeni Committed by David S. Miller

mptcp: rethink 'is writable' conditional

Currently, when checking for the 'msk is writable' condition, we
look at the individual subflows write space.
That works well while we send data via a single subflow, but will
not as soon as we will enable concurrent xmit on multiple subflows.

With this change msk becomes writable when the following conditions
hold:
- the socket has some free write space
- there is at least a subflow with write free space

Additionally we need to set the NOSPACE bit on all subflows
before blocking.
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Reviewed-by: default avatarMat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 26cdb8f7
...@@ -472,7 +472,7 @@ void mptcp_data_acked(struct sock *sk) ...@@ -472,7 +472,7 @@ void mptcp_data_acked(struct sock *sk)
{ {
mptcp_reset_timer(sk); mptcp_reset_timer(sk);
if ((!sk_stream_is_writeable(sk) || if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
(inet_sk_state_load(sk) != TCP_ESTABLISHED)) && (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
schedule_work(&mptcp_sk(sk)->work)) schedule_work(&mptcp_sk(sk)->work))
sock_hold(sk); sock_hold(sk);
...@@ -567,6 +567,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) ...@@ -567,6 +567,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page); put_page(dfrag->page);
} }
static bool mptcp_is_writeable(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
if (!sk_stream_is_writeable((struct sock *)msk))
return false;
mptcp_for_each_subflow(msk, subflow) {
if (sk_stream_is_writeable(subflow->tcp_sock))
return true;
}
return false;
}
static void mptcp_clean_una(struct sock *sk) static void mptcp_clean_una(struct sock *sk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
...@@ -609,9 +623,16 @@ static void mptcp_clean_una(struct sock *sk) ...@@ -609,9 +623,16 @@ static void mptcp_clean_una(struct sock *sk)
sk_mem_reclaim_partial(sk); sk_mem_reclaim_partial(sk);
/* Only wake up writers if a subflow is ready */ /* Only wake up writers if a subflow is ready */
if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) if (mptcp_is_writeable(msk)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears
* NOSPACE
*/
sk_stream_write_space(sk); sk_stream_write_space(sk);
} }
}
} }
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
...@@ -801,21 +822,31 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -801,21 +822,31 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
return ret; return ret;
} }
static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) static void mptcp_nospace(struct mptcp_sock *msk)
{ {
struct mptcp_subflow_context *subflow;
clear_bit(MPTCP_SEND_SPACE, &msk->flags); clear_bit(MPTCP_SEND_SPACE, &msk->flags);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
/* enables sk->write_space() callbacks */ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
struct socket *sock = READ_ONCE(ssk->sk_socket);
/* enables ssk->write_space() callbacks */
if (sock)
set_bit(SOCK_NOSPACE, &sock->flags); set_bit(SOCK_NOSPACE, &sock->flags);
}
} }
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{ {
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
struct sock *backup = NULL; struct sock *backup = NULL;
bool free;
sock_owned_by_me((const struct sock *)msk); sock_owned_by_me(sk);
if (!mptcp_ext_cache_refill(msk)) if (!mptcp_ext_cache_refill(msk))
return NULL; return NULL;
...@@ -823,12 +854,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) ...@@ -823,12 +854,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!sk_stream_memory_free(ssk)) { free = sk_stream_is_writeable(subflow->tcp_sock);
struct socket *sock = ssk->sk_socket; if (!free) {
mptcp_nospace(msk);
if (sock)
mptcp_nospace(msk, sock);
return NULL; return NULL;
} }
...@@ -845,16 +873,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) ...@@ -845,16 +873,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
return backup; return backup;
} }
static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) static void ssk_check_wmem(struct mptcp_sock *msk)
{ {
struct socket *sock; if (unlikely(!mptcp_is_writeable(msk)))
mptcp_nospace(msk);
if (likely(sk_stream_is_writeable(ssk)))
return;
sock = READ_ONCE(ssk->sk_socket);
if (sock)
mptcp_nospace(msk, sock);
} }
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
...@@ -907,6 +929,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -907,6 +929,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
mptcp_reset_timer(sk); mptcp_reset_timer(sk);
} }
mptcp_nospace(msk);
ret = sk_stream_wait_memory(sk, &timeo); ret = sk_stream_wait_memory(sk, &timeo);
if (ret) if (ret)
goto out; goto out;
...@@ -945,7 +968,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -945,7 +968,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!sk_stream_memory_free(ssk) || if (!sk_stream_memory_free(ssk) ||
!mptcp_page_frag_refill(ssk, pfrag) || !mptcp_page_frag_refill(ssk, pfrag) ||
!mptcp_ext_cache_refill(msk)) { !mptcp_ext_cache_refill(msk)) {
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
tcp_push(ssk, msg->msg_flags, mss_now, tcp_push(ssk, msg->msg_flags, mss_now,
tcp_sk(ssk)->nonagle, size_goal); tcp_sk(ssk)->nonagle, size_goal);
mptcp_set_timeout(sk, ssk); mptcp_set_timeout(sk, ssk);
...@@ -993,9 +1015,9 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -993,9 +1015,9 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
mptcp_reset_timer(sk); mptcp_reset_timer(sk);
} }
ssk_check_wmem(msk, ssk);
release_sock(ssk); release_sock(ssk);
out: out:
ssk_check_wmem(msk);
release_sock(sk); release_sock(sk);
return copied ? : ret; return copied ? : ret;
} }
...@@ -2291,8 +2313,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, ...@@ -2291,8 +2313,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk); mask |= mptcp_check_readable(msk);
if (sk_stream_is_writeable(sk) && if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
test_bit(MPTCP_SEND_SPACE, &msk->flags))
mask |= EPOLLOUT | EPOLLWRNORM; mask |= EPOLLOUT | EPOLLWRNORM;
} }
if (sk->sk_shutdown & RCV_SHUTDOWN) if (sk->sk_shutdown & RCV_SHUTDOWN)
......
...@@ -996,8 +996,10 @@ static void subflow_write_space(struct sock *sk) ...@@ -996,8 +996,10 @@ static void subflow_write_space(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn; struct sock *parent = subflow->conn;
sk_stream_write_space(sk); if (!sk_stream_is_writeable(sk))
if (sk_stream_is_writeable(sk)) { return;
if (sk_stream_is_writeable(parent)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic(); smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment