Commit 8edf0864 authored by Florian Westphal's avatar Florian Westphal Committed by Jakub Kicinski

mptcp: rework poll+nospace handling

MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at
least one subflow and the mptcp socket itself are writeable.

mptcp_poll returns EPOLLOUT if the bit is set.

mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when last write
has used up all subflows or the mptcp socket wmem.

This reworks nospace handling as follows:

MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning.
This bit is set when the mptcp socket is not writeable.
The mptcp-level ack path schedule will then schedule the mptcp worker
to allow it to free already-acked data (and reduce wmem usage).

This will then wake userspace processes that wait for a POLLOUT event.

sendmsg will set MPTCP_NOSPACE only when it has to wait for more
wmem (blocking I/O case).

poll path will set MPTCP_NOSPACE in case the mptcp socket is
not writeable.

Normal tcp-level notification (SOCK_NOSPACE) is only enabled
in case the subflow socket has no available wmem.
Signed-off-by: default avatarFlorian Westphal <fw@strlen.de>
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 813e0a68
...@@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk) ...@@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk)
{ {
mptcp_reset_timer(sk); mptcp_reset_timer(sk);
if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
mptcp_send_head(sk) || mptcp_send_head(sk) ||
(inet_sk_state_load(sk) != TCP_ESTABLISHED))) (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
mptcp_schedule_work(sk); mptcp_schedule_work(sk);
...@@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) ...@@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page); put_page(dfrag->page);
} }
static bool mptcp_is_writeable(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
if (!sk_stream_is_writeable((struct sock *)msk))
return false;
mptcp_for_each_subflow(msk, subflow) {
if (sk_stream_is_writeable(subflow->tcp_sock))
return true;
}
return false;
}
static void mptcp_clean_una(struct sock *sk) static void mptcp_clean_una(struct sock *sk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
...@@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk) ...@@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
mptcp_clean_una(sk); mptcp_clean_una(sk);
/* Only wake up writers if a subflow is ready */ /* Only wake up writers if a subflow is ready */
if (mptcp_is_writeable(msk)) { if (sk_stream_is_writeable(sk)) {
set_bit(MPTCP_SEND_SPACE, &msk->flags); clear_bit(MPTCP_NOSPACE, &msk->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears
* NOSPACE
*/
sk_stream_write_space(sk); sk_stream_write_space(sk);
} }
} }
...@@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk) ...@@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
{ {
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
clear_bit(MPTCP_SEND_SPACE, &msk->flags); set_bit(MPTCP_NOSPACE, &msk->flags);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool ssk_writeable = sk_stream_is_writeable(ssk);
struct socket *sock = READ_ONCE(ssk->sk_socket); struct socket *sock = READ_ONCE(ssk->sk_socket);
if (ssk_writeable || !sock)
continue;
/* enables ssk->write_space() callbacks */ /* enables ssk->write_space() callbacks */
if (sock) set_bit(SOCK_NOSPACE, &sock->flags);
set_bit(SOCK_NOSPACE, &sock->flags);
} }
/* mptcp_data_acked() could run just before we set the NOSPACE bit,
* so explicitly check for snd_una value
*/
mptcp_clean_una((struct sock *)msk);
} }
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
...@@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, ...@@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
return NULL; return NULL;
} }
static void ssk_check_wmem(struct mptcp_sock *msk)
{
if (unlikely(!mptcp_is_writeable(msk)))
mptcp_nospace(msk);
}
static void mptcp_push_release(struct sock *sk, struct sock *ssk, static void mptcp_push_release(struct sock *sk, struct sock *ssk,
struct mptcp_sendmsg_info *info) struct mptcp_sendmsg_info *info)
{ {
...@@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
wait_for_memory: wait_for_memory:
mptcp_nospace(msk); mptcp_nospace(msk);
mptcp_clean_una(sk);
if (mptcp_timer_pending(sk)) if (mptcp_timer_pending(sk))
mptcp_reset_timer(sk); mptcp_reset_timer(sk);
ret = sk_stream_wait_memory(sk, &timeo); ret = sk_stream_wait_memory(sk, &timeo);
...@@ -1344,7 +1326,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -1344,7 +1326,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
mptcp_push_pending(sk, msg->msg_flags); mptcp_push_pending(sk, msg->msg_flags);
out: out:
ssk_check_wmem(msk);
release_sock(sk); release_sock(sk);
return copied ? : ret; return copied ? : ret;
} }
...@@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk) ...@@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue); INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker); INIT_WORK(&msk->work, mptcp_worker);
msk->out_of_order_queue = RB_ROOT; msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL; msk->first_pending = NULL;
...@@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk) ...@@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk)
return true; return true;
} }
static bool mptcp_memory_free(const struct sock *sk, int wake)
{
struct mptcp_sock *msk = mptcp_sk(sk);
return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
}
static struct proto mptcp_prot = { static struct proto mptcp_prot = {
.name = "MPTCP", .name = "MPTCP",
.owner = THIS_MODULE, .owner = THIS_MODULE,
...@@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = { ...@@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = {
.sockets_allocated = &mptcp_sockets_allocated, .sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated, .memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure, .memory_pressure = &tcp_memory_pressure,
.stream_memory_free = mptcp_memory_free,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.sysctl_mem = sysctl_tcp_mem, .sysctl_mem = sysctl_tcp_mem,
...@@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) ...@@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
0; 0;
} }
static bool __mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
bool mptcp_writable;
mptcp_clean_una(sk);
mptcp_writable = sk_stream_is_writeable(sk);
if (!mptcp_writable)
mptcp_nospace(msk);
return mptcp_writable;
}
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
__poll_t ret = 0;
bool slow;
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
return 0;
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
slow = lock_sock_fast(sk);
if (__mptcp_check_writeable(msk))
ret = EPOLLOUT | EPOLLWRNORM;
unlock_sock_fast(sk, slow);
return ret;
}
static __poll_t mptcp_poll(struct file *file, struct socket *sock, static __poll_t mptcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait) struct poll_table_struct *wait)
{ {
...@@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, ...@@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk); mask |= mptcp_check_readable(msk);
if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) mask |= mptcp_check_writeable(msk);
mask |= EPOLLOUT | EPOLLWRNORM;
} }
if (sk->sk_shutdown & RCV_SHUTDOWN) if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
......
...@@ -86,7 +86,7 @@ ...@@ -86,7 +86,7 @@
/* MPTCP socket flags */ /* MPTCP socket flags */
#define MPTCP_DATA_READY 0 #define MPTCP_DATA_READY 0
#define MPTCP_SEND_SPACE 1 #define MPTCP_NOSPACE 1
#define MPTCP_WORK_RTX 2 #define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3 #define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4 #define MPTCP_FALLBACK_DONE 4
......
...@@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk) ...@@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk)
static void subflow_write_space(struct sock *sk) static void subflow_write_space(struct sock *sk)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct socket *sock = READ_ONCE(sk->sk_socket);
struct sock *parent = subflow->conn; struct sock *parent = subflow->conn;
if (!sk_stream_is_writeable(sk)) if (!sk_stream_is_writeable(sk))
return; return;
if (sk_stream_is_writeable(parent)) { if (sock && sk_stream_is_writeable(parent))
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); clear_bit(SOCK_NOSPACE, &sock->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ sk_stream_write_space(parent);
sk_stream_write_space(parent);
}
} }
static struct inet_connection_sock_af_ops * static struct inet_connection_sock_af_ops *
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment