Commit 38e3bfa8 authored by David S. Miller's avatar David S. Miller

Merge branch 'mptcp-improve-backup-subflows'

Mat Martineau says:

====================
mptcp: Improve use of backup subflows

Multipath TCP combines multiple TCP subflows in to one stream, and the
MPTCP-level socket must decide which subflow to use when sending (or
resending) chunks of data. The choice of the "best" subflow to transmit
on can vary depending on the priority (normal or backup) for each
subflow and how well the subflow is performing.

In order to improve MPTCP performance when some subflows are failing,
this patch set changes how backup subflows are utilized and introduces
tracking of "stale" subflows that are still connected but not making
progress.

Patch 1 adjusts MPTCP-level retransmit timeouts to use data from all
subflows.

Patch 2 makes MPTCP-level retransmissions less aggressive to avoid
resending data that's still queued at the TCP level.

Patch 3 changes the way pending data is handled when subflows are
closed. Unacked MPTCP-level data still in the subflow tx queue is
immediately moved to another subflow for transmission instead of waiting
for MPTCP-level timeouts to trigger retransmission.

Patch 4 has some sysctl code cleanup.

Patches 5 and 6 add tracking of "stale" subflows, so only underlying TCP
subflow connections that appear to be making progress are considered
when selecting a subflow to (re)transmit data. How fast a subflow goes
stale is configurable with a per-namespace sysctl. Related MIBS are
added too.

Patch 7 makes sure the backup flag is always correctly recorded when the
MP_JOIN SYN/ACK is received for an added subflow.

Patch 8 adds more test cases for backup subflows and stale subflows.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents e5f31552 7d1e6f16
......@@ -45,3 +45,15 @@ allow_join_initial_addr_port - BOOLEAN
This is a per-namespace sysctl.
Default: 1
stale_loss_cnt - INTEGER
The number of MPTCP-level retransmission intervals with no traffic and
pending outstanding data on a given subflow required to declare it stale.
The packet scheduler ignores stale subflows.
A low stale_loss_cnt value allows for fast active-backup switch-over,
an high value maximize links utilization on edge scenarios e.g. lossy
link with high BER or peer pausing the data processing.
This is a per-namespace sysctl.
Default: 4
......@@ -21,43 +21,50 @@ struct mptcp_pernet {
struct ctl_table_header *ctl_table_hdr;
#endif
u8 mptcp_enabled;
unsigned int add_addr_timeout;
unsigned int stale_loss_cnt;
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
};
static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
{
return net_generic(net, mptcp_pernet_id);
}
int mptcp_is_enabled(struct net *net)
int mptcp_is_enabled(const struct net *net)
{
return mptcp_get_pernet(net)->mptcp_enabled;
}
unsigned int mptcp_get_add_addr_timeout(struct net *net)
unsigned int mptcp_get_add_addr_timeout(const struct net *net)
{
return mptcp_get_pernet(net)->add_addr_timeout;
}
int mptcp_is_checksum_enabled(struct net *net)
int mptcp_is_checksum_enabled(const struct net *net)
{
return mptcp_get_pernet(net)->checksum_enabled;
}
int mptcp_allow_join_id0(struct net *net)
int mptcp_allow_join_id0(const struct net *net)
{
return mptcp_get_pernet(net)->allow_join_initial_addr_port;
}
unsigned int mptcp_stale_loss_cnt(const struct net *net)
{
return mptcp_get_pernet(net)->stale_loss_cnt;
}
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
}
#ifdef CONFIG_SYSCTL
......@@ -95,6 +102,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
{
.procname = "stale_loss_cnt",
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
},
{}
};
......@@ -114,6 +127,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[1].data = &pernet->add_addr_timeout;
table[2].data = &pernet->checksum_enabled;
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
......
......@@ -45,6 +45,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
SNMP_MIB_SENTINEL
};
......
......@@ -38,6 +38,8 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */
MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */
__MPTCP_MIB_MAX
};
......
......@@ -975,9 +975,11 @@ static void ack_update_msk(struct mptcp_sock *msk,
old_snd_una = msk->snd_una;
new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
/* ACK for data not even sent yet? Ignore. */
if (after64(new_snd_una, snd_nxt))
/* ACK for data not even sent yet and even above recovery bound? Ignore.*/
if (unlikely(after64(new_snd_una, snd_nxt))) {
if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt))
new_snd_una = old_snd_una;
}
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
......
......@@ -10,6 +10,8 @@
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
......@@ -308,6 +310,25 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return mptcp_pm_nl_get_local_id(msk, skc);
}
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);
/* keep track of rtx periods with no progress */
if (!subflow->stale_count) {
subflow->stale_rcv_tstamp = rcv_tstamp;
subflow->stale_count++;
} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
if (subflow->stale_count < U8_MAX)
subflow->stale_count++;
mptcp_pm_nl_subflow_chk_stale(msk, ssk);
} else {
subflow->stale_count = 0;
mptcp_subflow_set_active(subflow);
}
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
......
......@@ -46,6 +46,7 @@ struct pm_nl_pernet {
spinlock_t lock;
struct list_head local_addr_list;
unsigned int addrs;
unsigned int stale_loss_cnt;
unsigned int add_addr_signal_max;
unsigned int add_addr_accept_max;
unsigned int local_addr_max;
......@@ -899,6 +900,43 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
[MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
};
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
unsigned int active_max_loss_cnt;
struct net *net = sock_net(sk);
unsigned int stale_loss_cnt;
bool slow;
stale_loss_cnt = mptcp_stale_loss_cnt(net);
if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
return;
/* look for another available subflow not in loss state */
active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
mptcp_for_each_subflow(msk, iter) {
if (iter != subflow && mptcp_subflow_active(iter) &&
iter->stale_count < active_max_loss_cnt) {
/* we have some alternatives, try to mark this subflow as idle ...*/
slow = lock_sock_fast(ssk);
if (!tcp_rtx_and_write_queues_empty(ssk)) {
subflow->stale = 1;
__mptcp_retransmit_pending_data(sk);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE);
}
unlock_sock_fast(ssk, slow);
/* always try to push the pending data regarless of re-injections:
* we can possibly use backup subflows now, and subflow selection
* is cheap under the msk socket lock
*/
__mptcp_push_pending(sk, 0);
return;
}
}
}
static int mptcp_pm_family_to_addr(int family)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
......@@ -1922,6 +1960,7 @@ static int __net_init pm_nl_init_net(struct net *net)
INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
pernet->next_id = 1;
pernet->stale_loss_cnt = 4;
spin_lock_init(&pernet->lock);
/* No need to initialize other pernet fields, the struct is zeroed at
......
......@@ -411,14 +411,27 @@ static void mptcp_set_datafin_timeout(const struct sock *sk)
TCP_RTO_MIN << icsk->icsk_retransmits);
}
static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
static void __mptcp_set_timeout(struct sock *sk, long tout)
{
long tout = ssk && inet_csk(ssk)->icsk_pending ?
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow)
{
const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
inet_csk(ssk)->icsk_timeout - jiffies : 0;
}
if (tout <= 0)
tout = mptcp_sk(sk)->timer_ival;
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
static void mptcp_set_timeout(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
long tout = 0;
mptcp_for_each_subflow(mptcp_sk(sk), subflow)
tout = max(tout, mptcp_timeout_from_subflow(subflow));
__mptcp_set_timeout(sk, tout);
}
static bool tcp_can_send_ack(const struct sock *ssk)
......@@ -531,7 +544,6 @@ static bool mptcp_check_data_fin(struct sock *sk)
}
ret = true;
mptcp_set_timeout(sk, NULL);
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
......@@ -791,10 +803,7 @@ static void mptcp_reset_timer(struct sock *sk)
if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
return;
/* should never be called with mptcp level timer cleared */
tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
if (WARN_ON_ONCE(!tout))
tout = TCP_RTO_MIN;
tout = mptcp_sk(sk)->timer_ival;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
......@@ -1046,8 +1055,14 @@ static void __mptcp_clean_una(struct sock *sk)
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
if (WARN_ON_ONCE(dfrag == msk->first_pending))
if (unlikely(dfrag == msk->first_pending)) {
/* in recovery mode can see ack after the current snd head */
if (WARN_ON_ONCE(!msk->recovery))
break;
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
dfrag_clear(sk, dfrag);
cleaned = true;
}
......@@ -1056,8 +1071,14 @@ static void __mptcp_clean_una(struct sock *sk)
if (dfrag && after64(snd_una, dfrag->data_seq)) {
u64 delta = snd_una - dfrag->data_seq;
if (WARN_ON_ONCE(delta > dfrag->already_sent))
/* prevent wrap around in recovery mode */
if (unlikely(delta > dfrag->already_sent)) {
if (WARN_ON_ONCE(!msk->recovery))
goto out;
if (WARN_ON_ONCE(delta > dfrag->data_len))
goto out;
dfrag->already_sent += delta - dfrag->already_sent;
}
dfrag->data_seq += delta;
dfrag->offset += delta;
......@@ -1068,6 +1089,10 @@ static void __mptcp_clean_una(struct sock *sk)
cleaned = true;
}
/* all retransmitted data acked, recovery completed */
if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt))
msk->recovery = false;
out:
if (cleaned) {
if (tcp_under_memory_pressure(sk)) {
......@@ -1076,8 +1101,8 @@ static void __mptcp_clean_una(struct sock *sk)
}
}
if (snd_una == READ_ONCE(msk->snd_nxt)) {
if (msk->timer_ival && !mptcp_data_fin_enabled(msk))
if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) {
if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
mptcp_stop_timer(sk);
} else {
mptcp_reset_timer(sk);
......@@ -1366,16 +1391,44 @@ struct subflow_send_info {
u64 ratio;
};
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
{
if (!subflow->stale)
return;
subflow->stale = 0;
MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER);
}
bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
if (unlikely(subflow->stale)) {
u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp);
if (subflow->stale_rcv_tstamp == rcv_tstamp)
return false;
mptcp_subflow_set_active(subflow);
}
return __mptcp_subflow_active(subflow);
}
/* implement the mptcp packet scheduler;
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
int i, nr_active = 0;
struct sock *ssk;
long tout = 0;
u64 ratio;
u32 pace;
sock_owned_by_me((struct sock *)msk);
sock_owned_by_me(sk);
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
......@@ -1386,8 +1439,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
/* re-use last subflow, if the burst allow that */
if (msk->last_snd && msk->snd_burst > 0 &&
sk_stream_memory_free(msk->last_snd) &&
mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
mptcp_set_timeout(sk);
return msk->last_snd;
}
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) {
......@@ -1400,6 +1455,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (!mptcp_subflow_active(subflow))
continue;
tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup;
if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
continue;
......@@ -1415,6 +1471,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
send_info[subflow->backup].ratio = ratio;
}
}
__mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */
if (!nr_active)
......@@ -1433,12 +1490,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
struct mptcp_sendmsg_info *info)
{
mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
release_sock(ssk);
}
static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
void __mptcp_push_pending(struct sock *sk, unsigned int flags)
{
struct sock *prev_ssk = NULL, *ssk = NULL;
struct mptcp_sock *msk = mptcp_sk(sk);
......@@ -1501,12 +1557,11 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
mptcp_push_release(sk, ssk, &info);
out:
if (copied) {
/* start the timer, if it's not pending */
/* ensure the rtx timer is running */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
if (copied)
__mptcp_check_send_data_fin(sk);
}
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
......@@ -1567,7 +1622,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
*/
__mptcp_update_wmem(sk);
if (copied) {
mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
if (!mptcp_timer_pending(sk))
......@@ -2083,10 +2137,11 @@ static void mptcp_timeout_timer(struct timer_list *t)
*
* A backup subflow is returned only if that is the only kind available.
*/
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
struct sock *backup = NULL, *pick = NULL;
struct mptcp_subflow_context *subflow;
struct sock *backup = NULL;
int min_stale_count = INT_MAX;
sock_owned_by_me((const struct sock *)msk);
......@@ -2096,14 +2151,14 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_subflow_active(subflow))
if (!__mptcp_subflow_active(subflow))
continue;
/* still data outstanding at TCP level? Don't retransmit. */
if (!tcp_write_queue_empty(ssk)) {
if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
/* still data outstanding at TCP level? skip this */
if (!tcp_rtx_and_write_queues_empty(ssk)) {
mptcp_pm_subflow_chk_stale(msk, ssk);
min_stale_count = min_t(int, min_stale_count, subflow->stale_count);
continue;
return NULL;
}
if (subflow->backup) {
......@@ -2112,10 +2167,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
continue;
}
return ssk;
if (!pick)
pick = ssk;
}
return backup;
if (pick)
return pick;
/* use backup only if there are no progresses anywhere */
return min_stale_count > 1 ? backup : NULL;
}
static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
......@@ -2126,6 +2186,50 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
}
}
bool __mptcp_retransmit_pending_data(struct sock *sk)
{
struct mptcp_data_frag *cur, *rtx_head;
struct mptcp_sock *msk = mptcp_sk(sk);
if (__mptcp_check_fallback(mptcp_sk(sk)))
return false;
if (tcp_rtx_and_write_queues_empty(sk))
return false;
/* the closing socket has some data untransmitted and/or unacked:
* some data in the mptcp rtx queue has not really xmitted yet.
* keep it simple and re-inject the whole mptcp level rtx queue
*/
mptcp_data_lock(sk);
__mptcp_clean_una_wakeup(sk);
rtx_head = mptcp_rtx_head(sk);
if (!rtx_head) {
mptcp_data_unlock(sk);
return false;
}
/* will accept ack for reijected data before re-sending them */
if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
msk->recovery_snd_nxt = msk->snd_nxt;
msk->recovery = true;
mptcp_data_unlock(sk);
msk->first_pending = rtx_head;
msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
msk->snd_nxt = rtx_head->data_seq;
msk->snd_burst = 0;
/* be sure to clear the "sent status" on all re-injected fragments */
list_for_each_entry(cur, &msk->rtx_queue, list) {
if (!cur->already_sent)
break;
cur->already_sent = 0;
}
return true;
}
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
......@@ -2138,6 +2242,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
struct mptcp_sock *msk = mptcp_sk(sk);
bool need_push;
list_del(&subflow->node);
......@@ -2149,6 +2254,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (ssk->sk_socket)
sock_orphan(ssk);
need_push = __mptcp_retransmit_pending_data(sk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
......@@ -2176,6 +2282,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (msk->subflow && ssk == msk->subflow->sk)
mptcp_dispose_initial_subflow(msk);
if (need_push)
__mptcp_push_pending(sk, 0);
}
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
......@@ -2313,7 +2422,6 @@ static void __mptcp_retrans(struct sock *sk)
info.size_goal);
}
mptcp_set_timeout(sk, ssk);
release_sock(ssk);
reset_timer:
......@@ -2384,10 +2492,12 @@ static int __mptcp_init_sock(struct sock *sk)
msk->wmem_reserved = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->tx_pending_data = 0;
msk->timer_ival = TCP_RTO_MIN;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
msk->recovery = false;
mptcp_pm_data_init(msk);
......@@ -2472,7 +2582,6 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
tcp_shutdown(ssk, how);
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
mptcp_set_timeout(sk, ssk);
tcp_send_ack(ssk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
......
......@@ -230,12 +230,17 @@ struct mptcp_sock {
struct sock *last_snd;
int snd_burst;
int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
* recovery related fields are under data_lock
* protection
*/
u64 snd_una;
u64 wnd_end;
unsigned long timer_ival;
u32 token;
int rmem_released;
unsigned long flags;
bool recovery; /* closing subflow write queue reinjected */
bool can_ack;
bool fully_established;
bool rcv_data_fin;
......@@ -427,7 +432,8 @@ struct mptcp_subflow_context {
send_mp_prio : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */
disposable : 1; /* ctx can be free at ulp release time */
disposable : 1, /* ctx can be free at ulp release time */
stale : 1; /* unable to snd/rcv data, do not use for xmit */
enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
......@@ -439,11 +445,13 @@ struct mptcp_subflow_context {
u8 reset_seen:1;
u8 reset_transient:1;
u8 reset_reason:4;
u8 stale_count;
long delegated_status;
struct list_head delegated_node; /* link into delegated_action, protected by local BH */
u32 setsockopt_seq;
u32 stale_rcv_tstamp;
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
......@@ -549,12 +557,15 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su
clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status);
}
int mptcp_is_enabled(struct net *net);
unsigned int mptcp_get_add_addr_timeout(struct net *net);
int mptcp_is_checksum_enabled(struct net *net);
int mptcp_allow_join_id0(struct net *net);
int mptcp_is_enabled(const struct net *net);
unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
......@@ -573,7 +584,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
......@@ -585,6 +596,10 @@ static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);
bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
{
......@@ -690,6 +705,8 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
......
......@@ -435,10 +435,12 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
goto do_reset;
}
subflow->backup = mp_opt.backup;
subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce);
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
subflow, subflow->thmac, subflow->remote_nonce,
subflow->backup);
if (!subflow_thmac_valid(subflow)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
......
......@@ -3,8 +3,10 @@
ret=0
sin=""
sinfail=""
sout=""
cin=""
cinfail=""
cinsent=""
cout=""
ksft_skip=4
......@@ -76,6 +78,14 @@ init()
done
}
init_shapers()
{
for i in `seq 1 4`; do
tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1
tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1
done
}
cleanup_partial()
{
rm -f "$capout"
......@@ -88,8 +98,8 @@ cleanup_partial()
cleanup()
{
rm -f "$cin" "$cout"
rm -f "$sin" "$sout" "$cinsent"
rm -f "$cin" "$cout" "$sinfail"
rm -f "$sin" "$sout" "$cinsent" "$cinfail"
cleanup_partial
}
......@@ -211,11 +221,15 @@ link_failure()
{
ns="$1"
if [ -z "$FAILING_LINKS" ]; then
l=$((RANDOM%4))
l=$((l+1))
FAILING_LINKS=$((l+1))
fi
for l in $FAILING_LINKS; do
veth="ns1eth$l"
ip -net "$ns" link set "$veth" down
done
}
# $1: IP address
......@@ -280,10 +294,17 @@ do_transfer()
local_addr="0.0.0.0"
fi
if [ "$test_link_fail" -eq 2 ];then
timeout ${timeout_test} \
ip netns exec ${listener_ns} \
$mptcp_connect -t ${timeout_poll} -l -p $port -s ${cl_proto} \
${local_addr} < "$sinfail" > "$sout" &
else
timeout ${timeout_test} \
ip netns exec ${listener_ns} \
$mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
${local_addr} < "$sin" > "$sout" &
fi
spid=$!
sleep 1
......@@ -294,7 +315,7 @@ do_transfer()
$mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
$connect_addr < "$cin" > "$cout" &
else
( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \
( cat "$cinfail" ; sleep 2; link_failure $listener_ns ; cat "$cinfail" ) | \
tee "$cinsent" | \
timeout ${timeout_test} \
ip netns exec ${connector_ns} \
......@@ -434,7 +455,11 @@ do_transfer()
return 1
fi
if [ "$test_link_fail" -eq 2 ];then
check_transfer $sinfail $cout "file received by client"
else
check_transfer $sin $cout "file received by client"
fi
retc=$?
if [ "$test_link_fail" -eq 0 ];then
check_transfer $cin $sout "file received by server"
......@@ -477,29 +502,33 @@ run_tests()
lret=0
oldin=""
if [ "$test_linkfail" -eq 1 ];then
size=$((RANDOM%1024))
# create the input file for the failure test when
# the first failure test run
if [ "$test_linkfail" -ne 0 -a -z "$cinfail" ]; then
# the client file must be considerably larger
# of the maximum expected cwin value, or the
# link utilization will be not predicable
size=$((RANDOM%2))
size=$((size+1))
size=$((size*128))
size=$((size*8192))
size=$((size + ( $RANDOM % 8192) ))
oldin=$(mktemp)
cp "$cin" "$oldin"
make_file "$cin" "client" $size
cinfail=$(mktemp)
make_file "$cinfail" "client" $size
fi
do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${bkup}
lret=$?
if [ "$test_linkfail" -eq 2 -a -z "$sinfail" ]; then
size=$((RANDOM%16))
size=$((size+1))
size=$((size*2048))
if [ "$test_linkfail" -eq 1 ];then
cp "$oldin" "$cin"
rm -f "$oldin"
sinfail=$(mktemp)
make_file "$sinfail" "server" $size
fi
if [ $lret -ne 0 ]; then
ret=$lret
return
fi
do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${bkup}
lret=$?
}
chk_csum_nr()
......@@ -593,6 +622,46 @@ chk_join_nr()
fi
}
# a negative value for 'stale_max' means no upper bound:
# for bidirectional transfer, if one peer sleep for a while
# - as these tests do - we can have a quite high number of
# stale/recover conversions, proportional to
# sleep duration/ MPTCP-level RTX interval.
chk_stale_nr()
{
local ns=$1
local stale_min=$2
local stale_max=$3
local stale_delta=$4
local dump_stats
local stale_nr
local recover_nr
printf "%-39s %-18s" " " "stale"
stale_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowStale | awk '{print $2}'`
[ -z "$stale_nr" ] && stale_nr=0
recover_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowRecover | awk '{print $2}'`
[ -z "$recover_nr" ] && recover_nr=0
if [ $stale_nr -lt $stale_min ] ||
[ $stale_max -gt 0 -a $stale_nr -gt $stale_max ] ||
[ $((stale_nr - $recover_nr)) -ne $stale_delta ]; then
echo "[fail] got $stale_nr stale[s] $recover_nr recover[s], " \
" expected stale in range [$stale_min..$stale_max]," \
" stale-recover delta $stale_delta "
ret=1
dump_stats=1
else
echo "[ ok ]"
fi
if [ "${dump_stats}" = 1 ]; then
echo $ns stats
ip netns exec $ns ip -s link show
ip netns exec $ns nstat -as | grep MPTcp
fi
}
chk_add_nr()
{
local add_nr=$1
......@@ -801,6 +870,27 @@ chk_prio_nr()
fi
}
chk_link_usage()
{
local ns=$1
local link=$2
local out=$3
local expected_rate=$4
local tx_link=`ip netns exec $ns cat /sys/class/net/$link/statistics/tx_bytes`
local tx_total=`ls -l $out | awk '{print $5}'`
local tx_rate=$((tx_link * 100 / $tx_total))
local tolerance=5
printf "%-39s %-18s" " " "link usage"
if [ $tx_rate -lt $((expected_rate - $tolerance)) -o \
$tx_rate -gt $((expected_rate + $tolerance)) ]; then
echo "[fail] got $tx_rate% usage, expected $expected_rate%"
ret=1
else
echo "[ ok ]"
fi
}
subflows_tests()
{
reset
......@@ -924,14 +1014,80 @@ link_failure_tests()
{
# accept and use add_addr with additional subflows and link loss
reset
# without any b/w limit each veth could spool the packets and get
# them acked at xmit time, so that the corresponding subflow will
# have almost always no outstanding pkts, the scheduler will pick
# always the first subflow and we will have hard time testing
# active backup and link switch-over.
# Let's set some arbitrary (low) virtual link limits.
init_shapers
ip netns exec $ns1 ./pm_nl_ctl limits 0 3
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 3
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow
run_tests $ns1 $ns2 10.0.1.1 1
chk_join_nr "multiple flows, signal, link failure" 3 3 3
chk_add_nr 1 1
chk_stale_nr $ns2 1 5 1
# accept and use add_addr with additional subflows and link loss
# for bidirectional transfer
reset
init_shapers
ip netns exec $ns1 ./pm_nl_ctl limits 0 3
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 3
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow
run_tests $ns1 $ns2 10.0.1.1 2
chk_join_nr "multi flows, signal, bidi, link fail" 3 3 3
chk_add_nr 1 1
chk_stale_nr $ns2 1 -1 1
# 2 subflows plus 1 backup subflow with a lossy link, backup
# will never be used
reset
init_shapers
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 2
export FAILING_LINKS="1"
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
run_tests $ns1 $ns2 10.0.1.1 1
chk_join_nr "backup subflow unused, link failure" 2 2 2
chk_add_nr 1 1
chk_link_usage $ns2 ns2eth3 $cinsent 0
# 2 lossy links after half transfer, backup will get half of
# the traffic
reset
init_shapers
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
export FAILING_LINKS="1 2"
run_tests $ns1 $ns2 10.0.1.1 1
chk_join_nr "backup flow used, multi links fail" 2 2 2
chk_add_nr 1 1
chk_stale_nr $ns2 2 4 2
chk_link_usage $ns2 ns2eth3 $cinsent 50
# use a backup subflow with the first subflow on a lossy link
# for bidirectional transfer
reset
init_shapers
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 3
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
run_tests $ns1 $ns2 10.0.1.1 2
chk_join_nr "backup flow used, bidi, link failure" 2 2 2
chk_add_nr 1 1
chk_stale_nr $ns2 1 -1 2
chk_link_usage $ns2 ns2eth3 $cinsent 50
}
add_addr_timeout_tests()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment