Commit 16fd7539 authored by David S. Miller's avatar David S. Miller

Merge branch 'tcp-options-lockless'

Eric Dumazet says:

====================
tcp: set few options locklessly

This series is avoiding the socket lock for six TCP options.

They are not heavily used, but this exercise can give
ideas for other parts of TCP/IP stack :)
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 81083076 6e97ba55
......@@ -564,6 +564,6 @@ void __tcp_sock_set_nodelay(struct sock *sk, bool on);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);
#endif /* _LINUX_TCP_H */
......@@ -2865,7 +2865,7 @@ void __tcp_close(struct sock *sk, long timeout)
if (sk->sk_state == TCP_FIN_WAIT2) {
struct tcp_sock *tp = tcp_sk(sk);
if (tp->linger2 < 0) {
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
......@@ -3291,18 +3291,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_SYNCNT)
return -EINVAL;
lock_sock(sk);
WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_syncnt);
void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
int tcp_sock_set_user_timeout(struct sock *sk, int val)
{
lock_sock(sk);
/* Cap the max time in ms TCP will retry or probe the window
* before giving up and aborting (ETIMEDOUT) a connection.
*/
if (val < 0)
return -EINVAL;
WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_user_timeout);
......@@ -3345,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPINTVL)
return -EINVAL;
lock_sock(sk);
WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepintvl);
......@@ -3357,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPCNT)
return -EINVAL;
lock_sock(sk);
/* Paired with READ_ONCE() in keepalive_probes() */
WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);
......@@ -3462,6 +3461,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
/* Handle options that can be set without locking the socket. */
switch (optname) {
case TCP_SYNCNT:
return tcp_sock_set_syncnt(sk, val);
case TCP_USER_TIMEOUT:
return tcp_sock_set_user_timeout(sk, val);
case TCP_KEEPINTVL:
return tcp_sock_set_keepintvl(sk, val);
case TCP_KEEPCNT:
return tcp_sock_set_keepcnt(sk, val);
case TCP_LINGER2:
if (val < 0)
WRITE_ONCE(tp->linger2, -1);
else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
else
WRITE_ONCE(tp->linger2, val * HZ);
return 0;
case TCP_DEFER_ACCEPT:
/* Translate value in seconds to number of retransmits */
WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
TCP_RTO_MAX / HZ));
return 0;
}
sockopt_lock_sock(sk);
switch (optname) {
......@@ -3557,25 +3582,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_KEEPIDLE:
err = tcp_sock_set_keepidle_locked(sk, val);
break;
case TCP_KEEPINTVL:
if (val < 1 || val > MAX_TCP_KEEPINTVL)
err = -EINVAL;
else
WRITE_ONCE(tp->keepalive_intvl, val * HZ);
break;
case TCP_KEEPCNT:
if (val < 1 || val > MAX_TCP_KEEPCNT)
err = -EINVAL;
else
WRITE_ONCE(tp->keepalive_probes, val);
break;
case TCP_SYNCNT:
if (val < 1 || val > MAX_TCP_SYNCNT)
err = -EINVAL;
else
WRITE_ONCE(icsk->icsk_syn_retries, val);
break;
case TCP_SAVE_SYN:
/* 0: disable, 1: enable, 2: start from ether_header */
if (val < 0 || val > 2)
......@@ -3584,22 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
tp->save_syn = val;
break;
case TCP_LINGER2:
if (val < 0)
WRITE_ONCE(tp->linger2, -1);
else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
else
WRITE_ONCE(tp->linger2, val * HZ);
break;
case TCP_DEFER_ACCEPT:
/* Translate value in seconds to number of retransmits */
WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
TCP_RTO_MAX / HZ));
break;
case TCP_WINDOW_CLAMP:
err = tcp_set_window_clamp(sk, val);
break;
......@@ -3614,16 +3604,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
case TCP_USER_TIMEOUT:
/* Cap the max time in ms TCP will retry or probe the window
* before giving up and aborting (ETIMEDOUT) a connection.
*/
if (val < 0)
err = -EINVAL;
else
WRITE_ONCE(icsk->icsk_user_timeout, val);
break;
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
......
......@@ -6324,7 +6324,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (fastopen_fail)
return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
......@@ -6624,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}
if (tp->linger2 < 0) {
if (READ_ONCE(tp->linger2) < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
......
......@@ -792,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return sk;
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
......
......@@ -26,14 +26,15 @@
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts;
u32 elapsed, start_ts, user_timeout;
s32 remaining;
start_ts = tcp_sk(sk)->retrans_stamp;
if (!icsk->icsk_user_timeout)
user_timeout = READ_ONCE(icsk->icsk_user_timeout);
if (!user_timeout)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
remaining = icsk->icsk_user_timeout - elapsed;
remaining = user_timeout - elapsed;
if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */
......@@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 remaining;
u32 remaining, user_timeout;
s32 elapsed;
if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
user_timeout = READ_ONCE(icsk->icsk_user_timeout);
if (!user_timeout || !icsk->icsk_probes_tstamp)
return when;
elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
if (unlikely(elapsed < 0))
elapsed = 0;
remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
remaining = msecs_to_jiffies(user_timeout) - elapsed;
remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
return min_t(u32, remaining, when);
......@@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
__dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? :
/* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
max_retransmits = retry_until;
......@@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk)
}
if (!expired)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
READ_ONCE(icsk->icsk_user_timeout));
tcp_fastopen_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
......@@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
if (!icsk->icsk_probes_tstamp)
if (!icsk->icsk_probes_tstamp) {
icsk->icsk_probes_tstamp = tcp_jiffies32;
else if (icsk->icsk_user_timeout &&
} else {
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
if (user_timeout &&
(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
msecs_to_jiffies(icsk->icsk_user_timeout))
msecs_to_jiffies(user_timeout))
goto abort;
}
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
......@@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
req->rsk_ops->syn_ack_timeout(req);
/* add one more retry for fastopen */
max_retries = icsk->icsk_syn_retries ? :
/* Add one more retry for fastopen.
* Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
*/
max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
if (req->num_timeout >= max_retries) {
......@@ -706,7 +714,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
if (READ_ONCE(tp->linger2) >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
......@@ -731,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
/* If the TCP_USER_TIMEOUT option is enabled, use that
* to determine when to timeout instead.
*/
if ((icsk->icsk_user_timeout != 0 &&
elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
if ((user_timeout != 0 &&
elapsed >= msecs_to_jiffies(user_timeout) &&
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
(user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment