Commit ee995283 authored by Pavel Emelyanov, committed by David S. Miller

tcp: Initial repair mode

This includes (according to the previous description):

* TCP_REPAIR sockoption

This one just puts the socket in/out of the repair mode.
Allowed for CAP_NET_ADMIN and for closed/established sockets only.
When repair mode is turned off and the socket happens to be in
the established state the window probe is sent to the peer to
'unlock' the connection.

* TCP_REPAIR_QUEUE sockoption

This one sets the queue which we're about to repair. The
'no-queue' is set by default.

* TCP_QUEUE_SEQ sockoption

Sets the write_seq/rcv_nxt of a selected repaired queue.
Allowed for TCP_CLOSE-d sockets only. When the socket changes
its state the other seq-s are changed by the kernel according
to the protocol rules (most of the existing code is actually
reused).

* Ability to forcibly bind a socket to a port

The sk->sk_reuse is set to SK_FORCE_REUSE.

* Immediate connect modification

The connect syscall initializes the connection, then directly jumps
to the code which finalizes it.

* Silent close modification

The close just aborts the connection (similar to SO_LINGER with 0
time) but without sending any FIN/RST-s to peer.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 370816ae
...@@ -106,6 +106,16 @@ enum { ...@@ -106,6 +106,16 @@ enum {
#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
/* Socket options used while a TCP socket is in repair mode. */
#define TCP_REPAIR 19 /* TCP sock is under repair right now */
#define TCP_REPAIR_QUEUE 20 /* Select the queue that is about to be repaired */
#define TCP_QUEUE_SEQ 21 /* Set/get the sequence number of the selected queue */
/* Queue identifiers accepted by the TCP_REPAIR_QUEUE option. */
enum {
TCP_NO_QUEUE, /* no queue selected; assigned when repair mode is switched on */
TCP_RECV_QUEUE, /* TCP_QUEUE_SEQ operates on rcv_nxt */
TCP_SEND_QUEUE, /* TCP_QUEUE_SEQ operates on write_seq */
TCP_QUEUES_NR, /* number of identifiers above; used as the upper bound when validating */
};
/* for TCP_INFO socket option */ /* for TCP_INFO socket option */
#define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_TIMESTAMPS 1
...@@ -353,7 +363,9 @@ struct tcp_sock { ...@@ -353,7 +363,9 @@ struct tcp_sock {
u8 nonagle : 4,/* Disable Nagle algorithm? */ u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */ thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */ thin_dupack : 1,/* Fast retransmit on first dupack */
unused : 2; repair : 1,
unused : 1;
u8 repair_queue;
/* RTT measurement */ /* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */ u32 srtt; /* smoothed round trip time << 3 */
......
...@@ -612,6 +612,8 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp) ...@@ -612,6 +612,8 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
*/ */
extern u32 __tcp_select_window(struct sock *sk); extern u32 __tcp_select_window(struct sock *sk);
void tcp_send_window_probe(struct sock *sk);
/* TCP timestamps are only 32-bits, this causes a slight /* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot * complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decided * of jiffies in the buffer control blocks below. We decided
......
...@@ -1935,7 +1935,9 @@ void tcp_close(struct sock *sk, long timeout) ...@@ -1935,7 +1935,9 @@ void tcp_close(struct sock *sk, long timeout)
* advertise a zero window, then kill -9 the FTP client, wheee... * advertise a zero window, then kill -9 the FTP client, wheee...
* Note: timeout is always zero in such a case. * Note: timeout is always zero in such a case.
*/ */
if (data_was_unread) { if (unlikely(tcp_sk(sk)->repair)) {
sk->sk_prot->disconnect(sk, 0);
} else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */ /* Unread data was tossed, zap the connection. */
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE); tcp_set_state(sk, TCP_CLOSE);
...@@ -2074,6 +2076,8 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -2074,6 +2076,8 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */ /* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) { if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk); inet_csk_listen_stop(sk);
} else if (unlikely(tp->repair)) {
sk->sk_err = ECONNABORTED;
} else if (tcp_need_reset(old_state) || } else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq && (tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
...@@ -2125,6 +2129,12 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -2125,6 +2129,12 @@ int tcp_disconnect(struct sock *sk, int flags)
} }
EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_disconnect);
/*
 * Decide whether repair mode may be toggled on @sk.
 *
 * Two conditions must both hold: the caller has CAP_NET_ADMIN, and the
 * socket is currently in the TCP_CLOSE or TCP_ESTABLISHED state.
 *
 * Returns 1 when the socket may be repaired, 0 otherwise.
 */
static inline int tcp_can_repair_sock(struct sock *sk)
{
	/* Privilege check first, exactly as the short-circuit && would do. */
	if (!capable(CAP_NET_ADMIN))
		return 0;

	/* Turn the current state into its TCPF_* bit and test it against the allowed set. */
	return ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)) != 0;
}
/* /*
* Socket option code for TCP. * Socket option code for TCP.
*/ */
...@@ -2297,6 +2307,42 @@ static int do_tcp_setsockopt(struct sock *sk, int level, ...@@ -2297,6 +2307,42 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->thin_dupack = val; tp->thin_dupack = val;
break; break;
case TCP_REPAIR:
if (!tcp_can_repair_sock(sk))
err = -EPERM;
else if (val == 1) {
tp->repair = 1;
sk->sk_reuse = SK_FORCE_REUSE;
tp->repair_queue = TCP_NO_QUEUE;
} else if (val == 0) {
tp->repair = 0;
sk->sk_reuse = SK_NO_REUSE;
tcp_send_window_probe(sk);
} else
err = -EINVAL;
break;
case TCP_REPAIR_QUEUE:
if (!tp->repair)
err = -EPERM;
else if (val < TCP_QUEUES_NR)
tp->repair_queue = val;
else
err = -EINVAL;
break;
case TCP_QUEUE_SEQ:
if (sk->sk_state != TCP_CLOSE)
err = -EPERM;
else if (tp->repair_queue == TCP_SEND_QUEUE)
tp->write_seq = val;
else if (tp->repair_queue == TCP_RECV_QUEUE)
tp->rcv_nxt = val;
else
err = -EINVAL;
break;
case TCP_CORK: case TCP_CORK:
/* When set indicates to always queue non-full frames. /* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit * Later the user clears this option and we transmit
...@@ -2632,6 +2678,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, ...@@ -2632,6 +2678,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->thin_dupack; val = tp->thin_dupack;
break; break;
case TCP_REPAIR:
val = tp->repair;
break;
case TCP_REPAIR_QUEUE:
if (tp->repair)
val = tp->repair_queue;
else
return -EINVAL;
break;
case TCP_QUEUE_SEQ:
if (tp->repair_queue == TCP_SEND_QUEUE)
val = tp->write_seq;
else if (tp->repair_queue == TCP_RECV_QUEUE)
val = tp->rcv_nxt;
else
return -EINVAL;
break;
case TCP_USER_TIMEOUT: case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout); val = jiffies_to_msecs(icsk->icsk_user_timeout);
break; break;
......
...@@ -138,6 +138,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) ...@@ -138,6 +138,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
} }
EXPORT_SYMBOL_GPL(tcp_twsk_unique); EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/*
 * Connect path used instead of tcp_connect() when the socket is in
 * repair mode: the connection is initialized and then finalized
 * immediately, without calling tcp_connect().
 *
 * Always returns 0.
 */
static int tcp_repair_connect(struct sock *sk)
{
/* Initialize the connection state first; the finalizing step below relies on it. */
tcp_connect_init(sk);
/* NOTE(review): NULL is presumably passed because there is no received skb here - confirm against tcp_finish_connect(). */
tcp_finish_connect(sk, NULL);
return 0;
}
/* This will initiate an outgoing connection. */ /* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{ {
...@@ -196,7 +204,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ...@@ -196,7 +204,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* Reset inherited state */ /* Reset inherited state */
tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0; tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0; if (likely(!tp->repair))
tp->write_seq = 0;
} }
if (tcp_death_row.sysctl_tw_recycle && if (tcp_death_row.sysctl_tw_recycle &&
...@@ -247,7 +256,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ...@@ -247,7 +256,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->sk_gso_type = SKB_GSO_TCPV4; sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst); sk_setup_caps(sk, &rt->dst);
if (!tp->write_seq) if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr, inet->inet_daddr,
inet->inet_sport, inet->inet_sport,
...@@ -255,7 +264,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ...@@ -255,7 +264,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies; inet->inet_id = tp->write_seq ^ jiffies;
err = tcp_connect(sk); if (likely(!tp->repair))
err = tcp_connect(sk);
else
err = tcp_repair_connect(sk);
rt = NULL; rt = NULL;
if (err) if (err)
goto failure; goto failure;
......
...@@ -2617,9 +2617,11 @@ void tcp_connect_init(struct sock *sk) ...@@ -2617,9 +2617,11 @@ void tcp_connect_init(struct sock *sk)
tp->snd_sml = tp->write_seq; tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq; tp->snd_up = tp->write_seq;
tp->snd_nxt = tp->write_seq; tp->snd_nxt = tp->write_seq;
tp->rcv_nxt = 0;
tp->rcv_wup = 0; if (likely(!tp->repair))
tp->copied_seq = 0; tp->rcv_nxt = 0;
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
inet_csk(sk)->icsk_retransmits = 0; inet_csk(sk)->icsk_retransmits = 0;
...@@ -2790,6 +2792,14 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) ...@@ -2790,6 +2792,14 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
} }
void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
tcp_xmit_probe_skb(sk, 0);
}
}
/* Initiate keepalive or window probe from timer. */ /* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk) int tcp_write_wakeup(struct sock *sk)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment