Commit 9890092e authored by Florian Westphal, committed by David S. Miller

net: tcp: more detailed ACK events and events for CE marked packets

DataCenter TCP (DCTCP) determines cwnd growth based on ECN information
and ACK properties, e.g. an ACK that updates the window is treated
differently than a DUPACK.

DCTCP also needs to know whether an ACK was a delayed ACK. Furthermore,
DCTCP implements a CE state machine that keeps track of CE markings of
incoming packets.

Therefore, extend the congestion control framework to provide these
event types, so that DCTCP can be properly implemented as a normal
congestion algorithm module outside of the core stack.
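
To illustrate how such a module could consume these hooks, here is a
minimal, hypothetical sketch (not part of this patch; the "sample_ecn"
name, its counters and its Reno-based behaviour are assumptions made
purely for illustration). It sets TCP_CONG_NEEDS_ECN so that the stack
generates the ECN events, and reacts to the new cwnd and ACK events:

/* Hypothetical example module, not part of this patch. */
#include <linux/module.h>
#include <net/tcp.h>

struct sample_ecn {
	u32 ce_segs;	/* packets that arrived with the CE codepoint set */
	u32 acks_ece;	/* ACKs that carried the ECE flag */
};

static void sample_ecn_init(struct sock *sk)
{
	struct sample_ecn *ca = inet_csk_ca(sk);

	ca->ce_segs = 0;
	ca->acks_ece = 0;
}

/* CA_EVENT_ECN_IS_CE/NO_CE and the (NON_)DELAYED_ACK events arrive here. */
static void sample_ecn_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	struct sample_ecn *ca = inet_csk_ca(sk);

	if (ev == CA_EVENT_ECN_IS_CE)
		ca->ce_segs++;
}

/* The tcp_ca_ack_event_flags bits arrive here as a bitmask. */
static void sample_ecn_in_ack_event(struct sock *sk, u32 flags)
{
	struct sample_ecn *ca = inet_csk_ca(sk);

	if (flags & CA_ACK_ECE)
		ca->acks_ece++;
}

static struct tcp_congestion_ops sample_ecn_ops __read_mostly = {
	.init		= sample_ecn_init,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.cwnd_event	= sample_ecn_cwnd_event,
	.in_ack_event	= sample_ecn_in_ack_event,
	.flags		= TCP_CONG_NEEDS_ECN,	/* ask the stack for ECN events */
	.owner		= THIS_MODULE,
	.name		= "sample_ecn",
};

static int __init sample_ecn_register(void)
{
	BUILD_BUG_ON(sizeof(struct sample_ecn) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&sample_ecn_ops);
}

static void __exit sample_ecn_unregister(void)
{
	tcp_unregister_congestion_control(&sample_ecn_ops);
}

module_init(sample_ecn_register);
module_exit(sample_ecn_unregister);
MODULE_LICENSE("GPL");

A real consumer such as DCTCP would drive its CE state machine and
fraction-of-marked-bytes estimator from these callbacks rather than the
simple counters above.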

Joint work with Daniel Borkmann and Glenn Judd.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 7354c8c3
include/net/tcp.h
@@ -763,10 +763,17 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
 };
 
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
 enum tcp_ca_ack_event_flags {
-	CA_ACK_SLOWPATH = (1 << 0),
+	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
+	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
 };
 
 /*
net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 			tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	}
 }
@@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, 0);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_rtt_us);
 
-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* We passed data and got it acked, remove any soft error
net/ipv4/tcp_output.c
@@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
+	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
+	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.