Commit b8dc6d6c authored by Paolo Abeni's avatar Paolo Abeni Committed by Jakub Kicinski

mptcp: fix rcv buffer auto-tuning

The MPTCP code uses the assumption that the tcp_win_from_space() helper
does not use any TCP-specific field, and thus works correctly operating
on an MPTCP socket.

The commit dfa2f048 ("tcp: get rid of sysctl_tcp_adv_win_scale")
broke such assumption, and as a consequence most MPTCP connections stall
on zero-window event due to auto-tuning changing the rcv buffer size
quite randomly.

Address the issue syncing again the MPTCP auto-tuning code with the TCP
one. To achieve that, factor out the windows size logic in socket
independent helpers, and reuse them in mptcp_rcv_space_adjust(). The
MPTCP level scaling_ratio is selected as the minimum one from the all
the subflows, as a worst-case estimate.

Fixes: dfa2f048 ("tcp: get rid of sysctl_tcp_adv_win_scale")
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Co-developed-by: default avatarMatthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: default avatarMatthieu Baerts <matthieu.baerts@tessares.net>
Reviewed-by: default avatarEric Dumazet <edumazet@google.com>
Acked-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Link: https://lore.kernel.org/r/20230720-upstream-net-next-20230720-mptcp-fix-rcv-buffer-auto-tuning-v1-1-175ef12b8380@tessares.netSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 004a04b9
...@@ -1430,22 +1430,32 @@ void tcp_select_initial_window(const struct sock *sk, int __space, ...@@ -1430,22 +1430,32 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
__u32 *window_clamp, int wscale_ok, __u32 *window_clamp, int wscale_ok,
__u8 *rcv_wscale, __u32 init_rcv_wnd); __u8 *rcv_wscale, __u32 init_rcv_wnd);
static inline int tcp_win_from_space(const struct sock *sk, int space) static inline int __tcp_win_from_space(u8 scaling_ratio, int space)
{ {
s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio; s64 scaled_space = (s64)space * scaling_ratio;
return scaled_space >> TCP_RMEM_TO_WIN_SCALE; return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
} }
/* inverse of tcp_win_from_space() */ static inline int tcp_win_from_space(const struct sock *sk, int space)
static inline int tcp_space_from_win(const struct sock *sk, int win) {
return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space);
}
/* inverse of __tcp_win_from_space() */
static inline int __tcp_space_from_win(u8 scaling_ratio, int win)
{ {
u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
do_div(val, tcp_sk(sk)->scaling_ratio); do_div(val, scaling_ratio);
return val; return val;
} }
static inline int tcp_space_from_win(const struct sock *sk, int win)
{
return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
}
static inline void tcp_scaling_ratio_init(struct sock *sk) static inline void tcp_scaling_ratio_init(struct sock *sk)
{ {
/* Assume a conservative default of 1200 bytes of payload per 4K page. /* Assume a conservative default of 1200 bytes of payload per 4K page.
......
...@@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) ...@@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
if (err) if (err)
return err; return err;
msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
WRITE_ONCE(msk->first, ssock->sk); WRITE_ONCE(msk->first, ssock->sk);
WRITE_ONCE(msk->subflow, ssock); WRITE_ONCE(msk->subflow, ssock);
subflow = mptcp_subflow_ctx(ssock->sk); subflow = mptcp_subflow_ctx(ssock->sk);
...@@ -1881,6 +1882,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ...@@ -1881,6 +1882,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{ {
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
u8 scaling_ratio = U8_MAX;
u32 time, advmss = 1; u32 time, advmss = 1;
u64 rtt_us, mstamp; u64 rtt_us, mstamp;
...@@ -1911,9 +1913,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ...@@ -1911,9 +1913,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
rtt_us = max(sf_rtt_us, rtt_us); rtt_us = max(sf_rtt_us, rtt_us);
advmss = max(sf_advmss, advmss); advmss = max(sf_advmss, advmss);
scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
} }
msk->rcvq_space.rtt_us = rtt_us; msk->rcvq_space.rtt_us = rtt_us;
msk->scaling_ratio = scaling_ratio;
if (time < (rtt_us >> 3) || rtt_us == 0) if (time < (rtt_us >> 3) || rtt_us == 0)
return; return;
...@@ -1922,8 +1926,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ...@@ -1922,8 +1926,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow; u64 rcvwin, grow;
int rcvbuf;
rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
...@@ -1932,18 +1936,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ...@@ -1932,18 +1936,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(grow, msk->rcvq_space.space); do_div(grow, msk->rcvq_space.space);
rcvwin += (grow << 1); rcvwin += (grow << 1);
rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin),
while (tcp_win_from_space(sk, rcvmem) < advmss)
rcvmem += 128;
do_div(rcvwin, advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) { if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp; u32 window_clamp;
window_clamp = tcp_win_from_space(sk, rcvbuf); window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf);
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make subflows follow along. If we do not do this, we /* Make subflows follow along. If we do not do this, we
......
...@@ -321,6 +321,7 @@ struct mptcp_sock { ...@@ -321,6 +321,7 @@ struct mptcp_sock {
u64 time; /* start time of measurement window */ u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */ u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space; } rcvq_space;
u8 scaling_ratio;
u32 subflow_id; u32 subflow_id;
u32 setsockopt_seq; u32 setsockopt_seq;
...@@ -351,9 +352,14 @@ static inline int __mptcp_rmem(const struct sock *sk) ...@@ -351,9 +352,14 @@ static inline int __mptcp_rmem(const struct sock *sk)
return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
} }
static inline int mptcp_win_from_space(const struct sock *sk, int space)
{
return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
}
static inline int __mptcp_space(const struct sock *sk) static inline int __mptcp_space(const struct sock *sk)
{ {
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
} }
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
......
...@@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) ...@@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
const struct sock *sk = subflow->conn; const struct sock *sk = subflow->conn;
*space = __mptcp_space(sk); *space = __mptcp_space(sk);
*full_space = tcp_full_space(sk); *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
} }
void __mptcp_error_report(struct sock *sk) void __mptcp_error_report(struct sock *sk)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment