Commit 16fd7539 authored by David S. Miller

Merge branch 'tcp-options-lockless'

Eric Dumazet says:

====================
tcp: set few options locklessly

This series avoids taking the socket lock for six TCP options.

They are not heavily used, but this exercise can give
ideas for other parts of the TCP/IP stack :)
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 81083076 6e97ba55
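The six options converted are TCP_SYNCNT, TCP_USER_TIMEOUT, TCP_KEEPINTVL, TCP_KEEPCNT, TCP_LINGER2 and TCP_DEFER_ACCEPT. As a rough user-space sketch (not part of this patch; the helper name and values are purely illustrative), these are the setsockopt() calls whose handling no longer needs the socket lock:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Hypothetical helper: set the six now-lockless TCP options on socket fd. */
static int tune_tcp_socket(int fd)
{
        int syncnt = 4;                /* TCP_SYNCNT: SYN retransmits before giving up */
        unsigned int utimeout = 30000; /* TCP_USER_TIMEOUT: milliseconds */
        int keepintvl = 15;            /* TCP_KEEPINTVL: seconds between keepalive probes */
        int keepcnt = 5;               /* TCP_KEEPCNT: unanswered probes before drop */
        int linger2 = 30;              /* TCP_LINGER2: FIN_WAIT2 lifetime in seconds */
        int defer = 10;                /* TCP_DEFER_ACCEPT: seconds, listening sockets only */

        if (setsockopt(fd, IPPROTO_TCP, TCP_SYNCNT, &syncnt, sizeof(syncnt)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &utimeout, sizeof(utimeout)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &keepintvl, sizeof(keepintvl)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &linger2, sizeof(linger2)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer, sizeof(defer)))
                return -1; /* errno holds the failure reason */
        return 0;
}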
@@ -564,6 +564,6 @@ void __tcp_sock_set_nodelay(struct sock *sk, bool on);
 void tcp_sock_set_nodelay(struct sock *sk);
 void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
-void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
+int tcp_sock_set_user_timeout(struct sock *sk, int val);

 #endif /* _LINUX_TCP_H */
@@ -2865,7 +2865,7 @@ void __tcp_close(struct sock *sk, long timeout)
     if (sk->sk_state == TCP_FIN_WAIT2) {
         struct tcp_sock *tp = tcp_sk(sk);

-        if (tp->linger2 < 0) {
+        if (READ_ONCE(tp->linger2) < 0) {
             tcp_set_state(sk, TCP_CLOSE);
             tcp_send_active_reset(sk, GFP_ATOMIC);
             __NET_INC_STATS(sock_net(sk),
@@ -3291,18 +3291,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
     if (val < 1 || val > MAX_TCP_SYNCNT)
         return -EINVAL;

-    lock_sock(sk);
     WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
-    release_sock(sk);
     return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_syncnt);

-void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
 {
-    lock_sock(sk);
+    /* Cap the max time in ms TCP will retry or probe the window
+     * before giving up and aborting (ETIMEDOUT) a connection.
+     */
+    if (val < 0)
+        return -EINVAL;
+
     WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
-    release_sock(sk);
+    return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_user_timeout);
@@ -3345,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
     if (val < 1 || val > MAX_TCP_KEEPINTVL)
         return -EINVAL;

-    lock_sock(sk);
     WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
-    release_sock(sk);
     return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_keepintvl);
@@ -3357,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
     if (val < 1 || val > MAX_TCP_KEEPCNT)
         return -EINVAL;

-    lock_sock(sk);
     /* Paired with READ_ONCE() in keepalive_probes() */
     WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
-    release_sock(sk);
     return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_keepcnt);
@@ -3462,6 +3461,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
     if (copy_from_sockptr(&val, optval, sizeof(val)))
         return -EFAULT;

+    /* Handle options that can be set without locking the socket. */
+    switch (optname) {
+    case TCP_SYNCNT:
+        return tcp_sock_set_syncnt(sk, val);
+    case TCP_USER_TIMEOUT:
+        return tcp_sock_set_user_timeout(sk, val);
+    case TCP_KEEPINTVL:
+        return tcp_sock_set_keepintvl(sk, val);
+    case TCP_KEEPCNT:
+        return tcp_sock_set_keepcnt(sk, val);
+    case TCP_LINGER2:
+        if (val < 0)
+            WRITE_ONCE(tp->linger2, -1);
+        else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+            WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+        else
+            WRITE_ONCE(tp->linger2, val * HZ);
+        return 0;
+    case TCP_DEFER_ACCEPT:
+        /* Translate value in seconds to number of retransmits */
+        WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+                   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+                                   TCP_RTO_MAX / HZ));
+        return 0;
+    }
+
     sockopt_lock_sock(sk);

     switch (optname) {
@@ -3557,25 +3582,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
     case TCP_KEEPIDLE:
         err = tcp_sock_set_keepidle_locked(sk, val);
         break;
-    case TCP_KEEPINTVL:
-        if (val < 1 || val > MAX_TCP_KEEPINTVL)
-            err = -EINVAL;
-        else
-            WRITE_ONCE(tp->keepalive_intvl, val * HZ);
-        break;
-    case TCP_KEEPCNT:
-        if (val < 1 || val > MAX_TCP_KEEPCNT)
-            err = -EINVAL;
-        else
-            WRITE_ONCE(tp->keepalive_probes, val);
-        break;
-    case TCP_SYNCNT:
-        if (val < 1 || val > MAX_TCP_SYNCNT)
-            err = -EINVAL;
-        else
-            WRITE_ONCE(icsk->icsk_syn_retries, val);
-        break;
     case TCP_SAVE_SYN:
         /* 0: disable, 1: enable, 2: start from ether_header */
         if (val < 0 || val > 2)
@@ -3584,22 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
             tp->save_syn = val;
         break;

-    case TCP_LINGER2:
-        if (val < 0)
-            WRITE_ONCE(tp->linger2, -1);
-        else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
-            WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
-        else
-            WRITE_ONCE(tp->linger2, val * HZ);
-        break;
-
-    case TCP_DEFER_ACCEPT:
-        /* Translate value in seconds to number of retransmits */
-        WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
-                   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
-                                   TCP_RTO_MAX / HZ));
-        break;
-
     case TCP_WINDOW_CLAMP:
         err = tcp_set_window_clamp(sk, val);
         break;
@@ -3614,16 +3604,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
         err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
         break;
 #endif
-    case TCP_USER_TIMEOUT:
-        /* Cap the max time in ms TCP will retry or probe the window
-         * before giving up and aborting (ETIMEDOUT) a connection.
-         */
-        if (val < 0)
-            err = -EINVAL;
-        else
-            WRITE_ONCE(icsk->icsk_user_timeout, val);
-        break;
-
     case TCP_FASTOPEN:
         if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
                                                 TCPF_LISTEN))) {
@@ -6324,7 +6324,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
         if (fastopen_fail)
             return -1;
         if (sk->sk_write_pending ||
-            icsk->icsk_accept_queue.rskq_defer_accept ||
+            READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
             inet_csk_in_pingpong_mode(sk)) {
             /* Save one ACK. Data will be ready after
              * several ticks, if write_pending is set.
@@ -6624,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
             break;
         }

-        if (tp->linger2 < 0) {
+        if (READ_ONCE(tp->linger2) < 0) {
             tcp_done(sk);
             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
             return 1;
@@ -792,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
         return sk;

     /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
-    if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+    if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
         TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
         inet_rsk(req)->acked = 1;
         __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
@@ -26,14 +26,15 @@
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
     struct inet_connection_sock *icsk = inet_csk(sk);
-    u32 elapsed, start_ts;
+    u32 elapsed, start_ts, user_timeout;
     s32 remaining;

     start_ts = tcp_sk(sk)->retrans_stamp;
-    if (!icsk->icsk_user_timeout)
+    user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+    if (!user_timeout)
         return icsk->icsk_rto;
     elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
-    remaining = icsk->icsk_user_timeout - elapsed;
+    remaining = user_timeout - elapsed;
     if (remaining <= 0)
         return 1; /* user timeout has passed; fire ASAP */
@@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
 {
     struct inet_connection_sock *icsk = inet_csk(sk);
-    u32 remaining;
+    u32 remaining, user_timeout;
     s32 elapsed;

-    if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
+    user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+    if (!user_timeout || !icsk->icsk_probes_tstamp)
         return when;

     elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
     if (unlikely(elapsed < 0))
         elapsed = 0;
-    remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
+    remaining = msecs_to_jiffies(user_timeout) - elapsed;
     remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);

     return min_t(u32, remaining, when);
@@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk)
     if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
         if (icsk->icsk_retransmits)
             __dst_negative_advice(sk);
-        retry_until = icsk->icsk_syn_retries ? :
+        /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+        retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
             READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);

         max_retransmits = retry_until;
@@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk)
     }
     if (!expired)
         expired = retransmits_timed_out(sk, retry_until,
-                                        icsk->icsk_user_timeout);
+                                        READ_ONCE(icsk->icsk_user_timeout));
     tcp_fastopen_active_detect_blackhole(sk, expired);

     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk)
      * corresponding system limit. We also implement similar policy when
      * we use RTO to probe window in tcp_retransmit_timer().
      */
-    if (!icsk->icsk_probes_tstamp)
+    if (!icsk->icsk_probes_tstamp) {
         icsk->icsk_probes_tstamp = tcp_jiffies32;
-    else if (icsk->icsk_user_timeout &&
-             (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
-             msecs_to_jiffies(icsk->icsk_user_timeout))
-        goto abort;
+    } else {
+        u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
+        if (user_timeout &&
+            (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+            msecs_to_jiffies(user_timeout))
+            goto abort;
+    }

     max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
     if (sock_flag(sk, SOCK_DEAD)) {
         const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
     req->rsk_ops->syn_ack_timeout(req);

-    /* add one more retry for fastopen */
-    max_retries = icsk->icsk_syn_retries ? :
+    /* Add one more retry for fastopen.
+     * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+     */
+    max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
         READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;

     if (req->num_timeout >= max_retries) {
@@ -706,7 +714,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
     tcp_mstamp_refresh(tp);
     if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
-        if (tp->linger2 >= 0) {
+        if (READ_ONCE(tp->linger2) >= 0) {
             const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

             if (tmo > 0) {
@@ -731,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t)
     elapsed = keepalive_time_elapsed(tp);

     if (elapsed >= keepalive_time_when(tp)) {
+        u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
         /* If the TCP_USER_TIMEOUT option is enabled, use that
          * to determine when to timeout instead.
          */
-        if ((icsk->icsk_user_timeout != 0 &&
-             elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
+        if ((user_timeout != 0 &&
+             elapsed >= msecs_to_jiffies(user_timeout) &&
              icsk->icsk_probes_out > 0) ||
-            (icsk->icsk_user_timeout == 0 &&
+            (user_timeout == 0 &&
              icsk->icsk_probes_out >= keepalive_probes(tp))) {
             tcp_send_active_reset(sk, GFP_ATOMIC);
             tcp_write_err(sk);