Commit ca1a6705 authored by David S. Miller's avatar David S. Miller

Merge branch 'mptcp-next'

Mat Martineau says:

====================
mptcp: New features and cleanup

These patches have been tested in the MPTCP tree for a longer than usual
time (thanks to holiday schedules), and are ready for the net-next
branch. Changes include feature updates, small fixes, refactoring, and
some selftest changes.

Patch 1 fixes an OUTQ ioctl issue with TCP fallback sockets.

Patches 2, 3, and 6 add support of the MPTCP fastclose option (quick
shutdown of the full MPTCP connection, similar to TCP RST in regular
TCP), and a related self test.

Patch 4 cleans up some accept and poll code that is no longer needed
after the fastclose changes.

Patch 5 add userspace disconnect using AF_UNSPEC, which is used when
testing fastclose and makes the MPTCP socket's handling of AF_UNSPEC in
connect() more TCP-like.

Patches 7-11 refactor subflow creation to make better use of multiple
local endpoints and to better handle individual connection failures when
creating multiple subflows. Includes self test updates.

Patch 12 cleans up the way subflows are added to the MPTCP connection
list, eliminating the need for calls throughout the MPTCP code that had
to check the intermediate "join list" for entries to shift over to the
main "connection list".

Patch 13 refactors the MPTCP release_cb flags to use separate storage
for values only accessed with the socket lock held (no atomic ops
needed), and for values that need atomic operations.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 26abf15c e9d09bac
...@@ -768,6 +768,28 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu ...@@ -768,6 +768,28 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu
return true; return true;
} }
static bool mptcp_established_options_fastclose(struct sock *sk,
unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
if (likely(!subflow->send_fastclose))
return false;
if (remaining < TCPOLEN_MPTCP_FASTCLOSE)
return false;
*size = TCPOLEN_MPTCP_FASTCLOSE;
opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
opts->rcvr_key = msk->remote_key;
pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
return true;
}
static bool mptcp_established_options_mp_fail(struct sock *sk, static bool mptcp_established_options_mp_fail(struct sock *sk,
unsigned int *size, unsigned int *size,
unsigned int remaining, unsigned int remaining,
...@@ -806,10 +828,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, ...@@ -806,10 +828,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
return false; return false;
if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) ||
mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
*size += opt_size; *size += opt_size;
remaining -= opt_size; remaining -= opt_size;
} }
/* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
*size += opt_size; *size += opt_size;
remaining -= opt_size; remaining -= opt_size;
...@@ -1251,17 +1275,8 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ...@@ -1251,17 +1275,8 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
ptr += 2; ptr += 2;
} }
/* RST is mutually exclusive with everything else */ /* DSS, MPC, MPJ, ADD_ADDR, FASTCLOSE and RST are mutually exclusive,
if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { * see mptcp_established_options*()
*ptr++ = mptcp_option(MPTCPOPT_RST,
TCPOLEN_MPTCP_RST,
opts->reset_transient,
opts->reset_reason);
return;
}
/* DSS, MPC, MPJ and ADD_ADDR are mutually exclusive, see
* mptcp_established_options*()
*/ */
if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
struct mptcp_ext *mpext = &opts->ext_copy; struct mptcp_ext *mpext = &opts->ext_copy;
...@@ -1370,7 +1385,8 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ...@@ -1370,7 +1385,8 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
/* MPC is additionally mutually exclusive with MP_PRIO */ /* MPC is additionally mutually exclusive with MP_PRIO */
goto mp_capable_done; goto mp_capable_done;
} else if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYN, TCPOLEN_MPTCP_MPJ_SYN,
opts->backup, opts->join_id); opts->backup, opts->join_id);
...@@ -1386,11 +1402,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ...@@ -1386,11 +1402,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
ptr += 2; ptr += 2;
put_unaligned_be32(opts->nonce, ptr); put_unaligned_be32(opts->nonce, ptr);
ptr += 1; ptr += 1;
} else if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { } else {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_ACK, 0, 0); TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
ptr += 5; ptr += 5;
}
} else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
u8 echo = MPTCP_ADDR_ECHO; u8 echo = MPTCP_ADDR_ECHO;
...@@ -1447,6 +1464,24 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ...@@ -1447,6 +1464,24 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
ptr += 1; ptr += 1;
} }
} }
} else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) {
/* FASTCLOSE is mutually exclusive with others except RST */
*ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE,
TCPOLEN_MPTCP_FASTCLOSE,
0, 0);
put_unaligned_be64(opts->rcvr_key, ptr);
ptr += 2;
if (OPTION_MPTCP_RST & opts->suboptions)
goto mp_rst;
return;
} else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
mp_rst:
*ptr++ = mptcp_option(MPTCPOPT_RST,
TCPOLEN_MPTCP_RST,
opts->reset_transient,
opts->reset_reason);
return;
} }
if (OPTION_MPTCP_PRIO & opts->suboptions) { if (OPTION_MPTCP_PRIO & opts->suboptions) {
......
...@@ -172,9 +172,28 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk) ...@@ -172,9 +172,28 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk)
spin_unlock_bh(&pm->lock); spin_unlock_bh(&pm->lock);
} }
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
const struct mptcp_subflow_context *subflow)
{ {
pr_debug("msk=%p", msk); struct mptcp_pm_data *pm = &msk->pm;
bool update_subflows;
update_subflows = (ssk->sk_state == TCP_CLOSE) &&
(subflow->request_join || subflow->mp_join);
if (!READ_ONCE(pm->work_pending) && !update_subflows)
return;
spin_lock_bh(&pm->lock);
if (update_subflows)
pm->subflows--;
/* Even if this subflow is not really established, tell the PM to try
* to pick the next ones, if possible.
*/
if (mptcp_pm_nl_check_work_pending(msk))
mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
spin_unlock_bh(&pm->lock);
} }
void mptcp_pm_add_addr_received(struct mptcp_sock *msk, void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
...@@ -356,7 +375,7 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) ...@@ -356,7 +375,7 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
} }
} }
void mptcp_pm_data_init(struct mptcp_sock *msk) void mptcp_pm_data_reset(struct mptcp_sock *msk)
{ {
msk->pm.add_addr_signaled = 0; msk->pm.add_addr_signaled = 0;
msk->pm.add_addr_accepted = 0; msk->pm.add_addr_accepted = 0;
...@@ -370,11 +389,16 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) ...@@ -370,11 +389,16 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
WRITE_ONCE(msk->pm.accept_subflow, false); WRITE_ONCE(msk->pm.accept_subflow, false);
WRITE_ONCE(msk->pm.remote_deny_join_id0, false); WRITE_ONCE(msk->pm.remote_deny_join_id0, false);
msk->pm.status = 0; msk->pm.status = 0;
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_pm_nl_data_init(msk);
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
spin_lock_init(&msk->pm.lock); spin_lock_init(&msk->pm.lock);
INIT_LIST_HEAD(&msk->pm.anno_list); INIT_LIST_HEAD(&msk->pm.anno_list);
mptcp_pm_data_reset(msk);
mptcp_pm_nl_data_init(msk);
} }
void __init mptcp_pm_init(void) void __init mptcp_pm_init(void)
......
...@@ -38,10 +38,6 @@ struct mptcp_pm_add_entry { ...@@ -38,10 +38,6 @@ struct mptcp_pm_add_entry {
u8 retrans_times; u8 retrans_times;
}; };
/* max value of mptcp_addr_info.id */
#define MAX_ADDR_ID U8_MAX
#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)
struct pm_nl_pernet { struct pm_nl_pernet {
/* protects pernet updates */ /* protects pernet updates */
spinlock_t lock; spinlock_t lock;
...@@ -53,14 +49,14 @@ struct pm_nl_pernet { ...@@ -53,14 +49,14 @@ struct pm_nl_pernet {
unsigned int local_addr_max; unsigned int local_addr_max;
unsigned int subflows_max; unsigned int subflows_max;
unsigned int next_id; unsigned int next_id;
unsigned long id_bitmap[BITMAP_SZ]; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
}; };
#define MPTCP_PM_ADDR_MAX 8 #define MPTCP_PM_ADDR_MAX 8
#define ADD_ADDR_RETRANS_MAX 3 #define ADD_ADDR_RETRANS_MAX 3
static bool addresses_equal(const struct mptcp_addr_info *a, static bool addresses_equal(const struct mptcp_addr_info *a,
struct mptcp_addr_info *b, bool use_port) const struct mptcp_addr_info *b, bool use_port)
{ {
bool addr_equals = false; bool addr_equals = false;
...@@ -169,11 +165,13 @@ select_local_address(const struct pm_nl_pernet *pernet, ...@@ -169,11 +165,13 @@ select_local_address(const struct pm_nl_pernet *pernet,
msk_owned_by_me(msk); msk_owned_by_me(msk);
rcu_read_lock(); rcu_read_lock();
__mptcp_flush_join_list(msk);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue; continue;
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
if (entry->addr.family != sk->sk_family) { if (entry->addr.family != sk->sk_family) {
#if IS_ENABLED(CONFIG_MPTCP_IPV6) #if IS_ENABLED(CONFIG_MPTCP_IPV6)
if ((entry->addr.family == AF_INET && if ((entry->addr.family == AF_INET &&
...@@ -184,23 +182,17 @@ select_local_address(const struct pm_nl_pernet *pernet, ...@@ -184,23 +182,17 @@ select_local_address(const struct pm_nl_pernet *pernet,
continue; continue;
} }
/* avoid any address already in use by subflows and
* pending join
*/
if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
ret = entry; ret = entry;
break; break;
} }
}
rcu_read_unlock(); rcu_read_unlock();
return ret; return ret;
} }
static struct mptcp_pm_addr_entry * static struct mptcp_pm_addr_entry *
select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
{ {
struct mptcp_pm_addr_entry *entry, *ret = NULL; struct mptcp_pm_addr_entry *entry, *ret = NULL;
int i = 0;
rcu_read_lock(); rcu_read_lock();
/* do not keep any additional per socket state, just signal /* do not keep any additional per socket state, just signal
...@@ -209,13 +201,15 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) ...@@ -209,13 +201,15 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
* can lead to additional addresses not being announced. * can lead to additional addresses not being announced.
*/ */
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue; continue;
if (i++ == pos) {
ret = entry; ret = entry;
break; break;
} }
}
rcu_read_unlock(); rcu_read_unlock();
return ret; return ret;
} }
...@@ -256,12 +250,17 @@ unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) ...@@ -256,12 +250,17 @@ unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk)
} }
EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
static void check_work_pending(struct mptcp_sock *msk) bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{ {
if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) && struct pm_nl_pernet *pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
(msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) ||
msk->pm.subflows == mptcp_pm_get_subflows_max(msk))) if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
(find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
WRITE_ONCE(msk->pm.work_pending, false); WRITE_ONCE(msk->pm.work_pending, false);
return false;
}
return true;
} }
struct mptcp_pm_add_entry * struct mptcp_pm_add_entry *
...@@ -430,6 +429,7 @@ static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr ...@@ -430,6 +429,7 @@ static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr
static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh,
struct mptcp_addr_info *addrs) struct mptcp_addr_info *addrs)
{ {
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
struct sock *sk = (struct sock *)msk, *ssk; struct sock *sk = (struct sock *)msk, *ssk;
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
struct mptcp_addr_info remote = { 0 }; struct mptcp_addr_info remote = { 0 };
...@@ -437,22 +437,28 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm ...@@ -437,22 +437,28 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
int i = 0; int i = 0;
subflows_max = mptcp_pm_get_subflows_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk);
remote_address((struct sock_common *)sk, &remote);
/* Non-fullmesh endpoint, fill in the single entry /* Non-fullmesh endpoint, fill in the single entry
* corresponding to the primary MPC subflow remote address * corresponding to the primary MPC subflow remote address
*/ */
if (!fullmesh) { if (!fullmesh) {
remote_address((struct sock_common *)sk, &remote); if (deny_id0)
return 0;
msk->pm.subflows++; msk->pm.subflows++;
addrs[i++] = remote; addrs[i++] = remote;
} else { } else {
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow); ssk = mptcp_subflow_tcp_sock(subflow);
remote_address((struct sock_common *)ssk, &remote); remote_address((struct sock_common *)ssk, &addrs[i]);
if (!lookup_address_in_vec(addrs, i, &remote) && if (deny_id0 && addresses_equal(&addrs[i], &remote, false))
continue;
if (!lookup_address_in_vec(addrs, i, &addrs[i]) &&
msk->pm.subflows < subflows_max) { msk->pm.subflows < subflows_max) {
msk->pm.subflows++; msk->pm.subflows++;
addrs[i++] = remote; i++;
} }
} }
} }
...@@ -460,6 +466,35 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm ...@@ -460,6 +466,35 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
return i; return i;
} }
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, &pernet->local_addr_list, list) {
if (entry->addr.id == id)
return entry;
}
return NULL;
}
static int
lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr)
{
struct mptcp_pm_addr_entry *entry;
int ret = -1;
rcu_read_lock();
list_for_each_entry(entry, &pernet->local_addr_list, list) {
if (addresses_equal(&entry->addr, addr, entry->addr.port)) {
ret = entry->addr.id;
break;
}
}
rcu_read_unlock();
return ret;
}
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
...@@ -475,6 +510,19 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) ...@@ -475,6 +510,19 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
local_addr_max = mptcp_pm_get_local_addr_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk);
/* do lazy endpoint usage accounting for the MPC subflows */
if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
struct mptcp_addr_info mpc_addr;
int mpc_id;
local_address((struct sock_common *)msk->first, &mpc_addr);
mpc_id = lookup_id_by_addr(pernet, &mpc_addr);
if (mpc_id >= 0)
__clear_bit(mpc_id, msk->pm.id_avail_bitmap);
msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
}
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
msk->pm.local_addr_used, local_addr_max, msk->pm.local_addr_used, local_addr_max,
msk->pm.add_addr_signaled, add_addr_signal_max, msk->pm.add_addr_signaled, add_addr_signal_max,
...@@ -482,47 +530,41 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) ...@@ -482,47 +530,41 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
/* check first for announce */ /* check first for announce */
if (msk->pm.add_addr_signaled < add_addr_signal_max) { if (msk->pm.add_addr_signaled < add_addr_signal_max) {
local = select_signal_address(pernet, local = select_signal_address(pernet, msk);
msk->pm.add_addr_signaled);
if (local) { if (local) {
if (mptcp_pm_alloc_anno_list(msk, local)) { if (mptcp_pm_alloc_anno_list(msk, local)) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++; msk->pm.add_addr_signaled++;
mptcp_pm_announce_addr(msk, &local->addr, false); mptcp_pm_announce_addr(msk, &local->addr, false);
mptcp_pm_nl_addr_send_ack(msk); mptcp_pm_nl_addr_send_ack(msk);
} }
} else {
/* pick failed, avoid fourther attempts later */
msk->pm.local_addr_used = add_addr_signal_max;
} }
check_work_pending(msk);
} }
/* check if should create a new subflow */ /* check if should create a new subflow */
if (msk->pm.local_addr_used < local_addr_max && while (msk->pm.local_addr_used < local_addr_max &&
msk->pm.subflows < subflows_max && msk->pm.subflows < subflows_max) {
!READ_ONCE(msk->pm.remote_deny_join_id0)) {
local = select_local_address(pernet, msk);
if (local) {
bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
bool fullmesh;
int i, nr; int i, nr;
local = select_local_address(pernet, msk);
if (!local)
break;
fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
msk->pm.local_addr_used++; msk->pm.local_addr_used++;
check_work_pending(msk);
nr = fill_remote_addresses_vec(msk, fullmesh, addrs); nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
if (nr)
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
spin_unlock_bh(&msk->pm.lock); spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++) for (i = 0; i < nr; i++)
__mptcp_subflow_connect(sk, &local->addr, &addrs[i]); __mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
spin_lock_bh(&msk->pm.lock); spin_lock_bh(&msk->pm.lock);
return;
}
/* lookup failed, avoid fourther attempts later */
msk->pm.local_addr_used = local_addr_max;
check_work_pending(msk);
} }
mptcp_pm_nl_check_work_pending(msk);
} }
static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
...@@ -552,7 +594,6 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, ...@@ -552,7 +594,6 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
subflows_max = mptcp_pm_get_subflows_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk);
rcu_read_lock(); rcu_read_lock();
__mptcp_flush_join_list(msk);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue; continue;
...@@ -641,7 +682,6 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) ...@@ -641,7 +682,6 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
!mptcp_pm_should_rm_signal(msk)) !mptcp_pm_should_rm_signal(msk))
return; return;
__mptcp_flush_join_list(msk);
subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
if (subflow) { if (subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
...@@ -711,6 +751,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, ...@@ -711,6 +751,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
return; return;
for (i = 0; i < rm_list->nr; i++) { for (i = 0; i < rm_list->nr; i++) {
bool removed = false;
list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN; int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
...@@ -727,18 +769,24 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, ...@@ -727,18 +769,24 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
i, rm_list->ids[i], subflow->local_id, subflow->remote_id); i, rm_list->ids[i], subflow->local_id, subflow->remote_id);
spin_unlock_bh(&msk->pm.lock); spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how); mptcp_subflow_shutdown(sk, ssk, how);
/* the following takes care of updating the subflows counter */
mptcp_close_ssk(sk, ssk, subflow); mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock); spin_lock_bh(&msk->pm.lock);
removed = true;
__MPTCP_INC_STATS(sock_net(sk), rm_type);
}
__set_bit(rm_list->ids[1], msk->pm.id_avail_bitmap);
if (!removed)
continue;
if (rm_type == MPTCP_MIB_RMADDR) { if (rm_type == MPTCP_MIB_RMADDR) {
msk->pm.add_addr_accepted--; msk->pm.add_addr_accepted--;
WRITE_ONCE(msk->pm.accept_addr, true); WRITE_ONCE(msk->pm.accept_addr, true);
} else if (rm_type == MPTCP_MIB_RMSUBFLOW) { } else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
msk->pm.local_addr_used--; msk->pm.local_addr_used--;
} }
msk->pm.subflows--;
__MPTCP_INC_STATS(sock_net(sk), rm_type);
}
} }
} }
...@@ -759,6 +807,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) ...@@ -759,6 +807,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk)
msk_owned_by_me(msk); msk_owned_by_me(msk);
if (!(pm->status & MPTCP_PM_WORK_MASK))
return;
spin_lock_bh(&msk->pm.lock); spin_lock_bh(&msk->pm.lock);
pr_debug("msk=%p status=%x", msk, pm->status); pr_debug("msk=%p status=%x", msk, pm->status);
...@@ -804,7 +855,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, ...@@ -804,7 +855,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
/* to keep the code simple, don't do IDR-like allocation for address ID, /* to keep the code simple, don't do IDR-like allocation for address ID,
* just bail when we exceed limits * just bail when we exceed limits
*/ */
if (pernet->next_id == MAX_ADDR_ID) if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1; pernet->next_id = 1;
if (pernet->addrs >= MPTCP_PM_ADDR_MAX) if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
goto out; goto out;
...@@ -824,7 +875,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, ...@@ -824,7 +875,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
if (!entry->addr.id) { if (!entry->addr.id) {
find_next: find_next:
entry->addr.id = find_next_zero_bit(pernet->id_bitmap, entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
MAX_ADDR_ID + 1, MPTCP_PM_MAX_ADDR_ID + 1,
pernet->next_id); pernet->next_id);
if (!entry->addr.id && pernet->next_id != 1) { if (!entry->addr.id && pernet->next_id != 1) {
pernet->next_id = 1; pernet->next_id = 1;
...@@ -1191,18 +1242,6 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) ...@@ -1191,18 +1242,6 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
return 0; return 0;
} }
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, &pernet->local_addr_list, list) {
if (entry->addr.id == id)
return entry;
}
return NULL;
}
int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
u8 *flags, int *ifindex) u8 *flags, int *ifindex)
{ {
...@@ -1461,7 +1500,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) ...@@ -1461,7 +1500,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
list_splice_init(&pernet->local_addr_list, &free_list); list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet); __reset_counters(pernet);
pernet->next_id = 1; pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock); spin_unlock_bh(&pernet->lock);
mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
synchronize_rcu(); synchronize_rcu();
...@@ -1571,7 +1610,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, ...@@ -1571,7 +1610,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
pernet = net_generic(net, pm_nl_pernet_id); pernet = net_generic(net, pm_nl_pernet_id);
spin_lock_bh(&pernet->lock); spin_lock_bh(&pernet->lock);
for (i = id; i < MAX_ADDR_ID + 1; i++) { for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
if (test_bit(i, pernet->id_bitmap)) { if (test_bit(i, pernet->id_bitmap)) {
entry = __lookup_addr_by_id(pernet, i); entry = __lookup_addr_by_id(pernet, i);
if (!entry) if (!entry)
......
...@@ -763,7 +763,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) ...@@ -763,7 +763,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
if (!sock_owned_by_user(sk)) if (!sock_owned_by_user(sk))
__mptcp_error_report(sk); __mptcp_error_report(sk);
else else
set_bit(MPTCP_ERROR_REPORT, &msk->flags); __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
} }
/* If the moves have caught up with the DATA_FIN sequence number /* If the moves have caught up with the DATA_FIN sequence number
...@@ -808,47 +808,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) ...@@ -808,47 +808,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
mptcp_data_unlock(sk); mptcp_data_unlock(sk);
} }
static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
{ {
struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk;
bool ret = false;
if (likely(list_empty(&msk->join_list))) if (sk->sk_state != TCP_ESTABLISHED)
return false; return false;
spin_lock_bh(&msk->join_list_lock); /* attach to msk socket only after we are sure we will deal with it
list_for_each_entry(subflow, &msk->join_list, node) { * at close time
u32 sseq = READ_ONCE(subflow->setsockopt_seq); */
if (sk->sk_socket && !ssk->sk_socket)
mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); mptcp_sock_graft(ssk, sk->sk_socket);
if (READ_ONCE(msk->setsockopt_seq) != sseq)
ret = true;
}
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
return ret;
}
void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
if (likely(!mptcp_do_flush_join_list(msk)))
return;
if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) mptcp_propagate_sndbuf((struct sock *)msk, ssk);
mptcp_schedule_work((struct sock *)msk); mptcp_sockopt_sync_locked(msk, ssk);
return true;
} }
static void mptcp_flush_join_list(struct mptcp_sock *msk) static void __mptcp_flush_join_list(struct sock *sk)
{ {
bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); struct mptcp_subflow_context *tmp, *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
might_sleep();
if (!mptcp_do_flush_join_list(msk) && !sync_needed) list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
return; struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast(ssk);
mptcp_sockopt_sync_all(msk); list_move_tail(&subflow->node, &msk->conn_list);
if (!__mptcp_finish_join(msk, ssk))
mptcp_subflow_reset(ssk);
unlock_sock_fast(ssk, slow);
}
} }
static bool mptcp_timer_pending(struct sock *sk) static bool mptcp_timer_pending(struct sock *sk)
...@@ -1526,9 +1517,8 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, ...@@ -1526,9 +1517,8 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
void mptcp_check_and_set_pending(struct sock *sk) void mptcp_check_and_set_pending(struct sock *sk)
{ {
if (mptcp_send_head(sk) && if (mptcp_send_head(sk))
!test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
} }
void __mptcp_push_pending(struct sock *sk, unsigned int flags) void __mptcp_push_pending(struct sock *sk, unsigned int flags)
...@@ -1549,7 +1539,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) ...@@ -1549,7 +1539,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
int ret = 0; int ret = 0;
prev_ssk = ssk; prev_ssk = ssk;
__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk); ssk = mptcp_subflow_get_send(msk);
/* First check. If the ssk has changed since /* First check. If the ssk has changed since
...@@ -1954,7 +1943,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) ...@@ -1954,7 +1943,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0; unsigned int moved = 0;
bool ret, done; bool ret, done;
mptcp_flush_join_list(msk);
do { do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk); struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath; bool slowpath;
...@@ -2145,7 +2133,7 @@ static void mptcp_retransmit_timer(struct timer_list *t) ...@@ -2145,7 +2133,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
mptcp_schedule_work(sk); mptcp_schedule_work(sk);
} else { } else {
/* delegate our work to tcp_release_cb() */ /* delegate our work to tcp_release_cb() */
set_bit(MPTCP_RETRANSMIT, &msk->flags); __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
} }
bh_unlock_sock(sk); bh_unlock_sock(sk);
sock_put(sk); sock_put(sk);
...@@ -2253,6 +2241,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) ...@@ -2253,6 +2241,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
return true; return true;
} }
/* flags for __mptcp_close_ssk() */
#define MPTCP_CF_PUSH BIT(1)
#define MPTCP_CF_FASTCLOSE BIT(2)
/* subflow sockets can be either outgoing (connect) or incoming /* subflow sockets can be either outgoing (connect) or incoming
* (accept). * (accept).
* *
...@@ -2262,22 +2254,37 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) ...@@ -2262,22 +2254,37 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
* parent socket. * parent socket.
*/ */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow) struct mptcp_subflow_context *subflow,
unsigned int flags)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
bool need_push; bool need_push, dispose_it;
dispose_it = !msk->subflow || ssk != msk->subflow->sk;
if (dispose_it)
list_del(&subflow->node); list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
if (flags & MPTCP_CF_FASTCLOSE)
subflow->send_fastclose = 1;
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
tcp_disconnect(ssk, 0);
msk->subflow->state = SS_UNCONNECTED;
mptcp_subflow_ctx_reset(subflow);
release_sock(ssk);
goto out;
}
/* if we are invoked by the msk cleanup code, the subflow is /* if we are invoked by the msk cleanup code, the subflow is
* already orphaned * already orphaned
*/ */
if (ssk->sk_socket) if (ssk->sk_socket)
sock_orphan(ssk); sock_orphan(ssk);
need_push = __mptcp_retransmit_pending_data(sk);
subflow->disposable = 1; subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
...@@ -2297,14 +2304,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, ...@@ -2297,14 +2304,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
sock_put(ssk); sock_put(ssk);
if (ssk == msk->last_snd)
msk->last_snd = NULL;
if (ssk == msk->first) if (ssk == msk->first)
msk->first = NULL; msk->first = NULL;
if (msk->subflow && ssk == msk->subflow->sk) out:
mptcp_dispose_initial_subflow(msk); if (ssk == msk->last_snd)
msk->last_snd = NULL;
if (need_push) if (need_push)
__mptcp_push_pending(sk, 0); __mptcp_push_pending(sk, 0);
...@@ -2315,7 +2320,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, ...@@ -2315,7 +2320,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
{ {
if (sk->sk_state == TCP_ESTABLISHED) if (sk->sk_state == TCP_ESTABLISHED)
mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
__mptcp_close_ssk(sk, ssk, subflow);
/* subflow aborted before reaching the fully_established status
* attempt the creation of the next subflow
*/
mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
} }
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
...@@ -2467,11 +2478,9 @@ static void mptcp_worker(struct work_struct *work) ...@@ -2467,11 +2478,9 @@ static void mptcp_worker(struct work_struct *work)
goto unlock; goto unlock;
mptcp_check_data_fin_ack(sk); mptcp_check_data_fin_ack(sk);
mptcp_flush_join_list(msk);
mptcp_check_fastclose(msk); mptcp_check_fastclose(msk);
if (msk->pm.status)
mptcp_pm_nl_work(msk); mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
...@@ -2506,8 +2515,6 @@ static int __mptcp_init_sock(struct sock *sk) ...@@ -2506,8 +2515,6 @@ static int __mptcp_init_sock(struct sock *sk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
spin_lock_init(&msk->join_list_lock);
INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue); INIT_LIST_HEAD(&msk->rtx_queue);
...@@ -2533,9 +2540,20 @@ static int __mptcp_init_sock(struct sock *sk) ...@@ -2533,9 +2540,20 @@ static int __mptcp_init_sock(struct sock *sk)
return 0; return 0;
} }
static int mptcp_init_sock(struct sock *sk) static void mptcp_ca_reset(struct sock *sk)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
tcp_assign_congestion_control(sk);
strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
/* no need to keep a reference to the ops, the name will suffice */
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = NULL;
}
static int mptcp_init_sock(struct sock *sk)
{
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
int ret; int ret;
...@@ -2556,12 +2574,7 @@ static int mptcp_init_sock(struct sock *sk) ...@@ -2556,12 +2574,7 @@ static int mptcp_init_sock(struct sock *sk)
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
* propagate the correct value * propagate the correct value
*/ */
tcp_assign_congestion_control(sk); mptcp_ca_reset(sk);
strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
/* no need to keep a reference to the ops, the name will suffice */
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = NULL;
sk_sockets_allocated_inc(sk); sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
...@@ -2666,6 +2679,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk) ...@@ -2666,6 +2679,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
* state now * state now
*/ */
if (__mptcp_check_fallback(msk)) { if (__mptcp_check_fallback(msk)) {
WRITE_ONCE(msk->snd_una, msk->write_seq);
if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
inet_sk_state_store(sk, TCP_CLOSE); inet_sk_state_store(sk, TCP_CLOSE);
mptcp_close_wake_up(sk); mptcp_close_wake_up(sk);
...@@ -2674,7 +2688,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) ...@@ -2674,7 +2688,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
} }
} }
mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
...@@ -2707,21 +2720,20 @@ static void __mptcp_destroy_sock(struct sock *sk) ...@@ -2707,21 +2720,20 @@ static void __mptcp_destroy_sock(struct sock *sk)
might_sleep(); might_sleep();
/* be sure to always acquire the join list lock, to sync vs /* join list will be eventually flushed (with rst) at sock lock release time*/
* mptcp_finish_join().
*/
spin_lock_bh(&msk->join_list_lock);
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list); list_splice_init(&msk->conn_list, &conn_list);
sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
sk_stop_timer(sk, &sk->sk_timer); sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0; msk->pm.status = 0;
/* clears msk->subflow, allowing the following loop to close
* even the initial subflow
*/
mptcp_dispose_initial_subflow(msk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) { list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
__mptcp_close_ssk(sk, ssk, subflow); __mptcp_close_ssk(sk, ssk, subflow, 0);
} }
sk->sk_prot->destroy(sk); sk->sk_prot->destroy(sk);
...@@ -2732,7 +2744,6 @@ static void __mptcp_destroy_sock(struct sock *sk) ...@@ -2732,7 +2744,6 @@ static void __mptcp_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk); xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk); sk_refcnt_debug_release(sk);
mptcp_dispose_initial_subflow(msk);
sock_put(sk); sock_put(sk);
} }
...@@ -2768,6 +2779,9 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -2768,6 +2779,9 @@ static void mptcp_close(struct sock *sk, long timeout)
sock_hold(sk); sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state); pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
if (sk->sk_state == TCP_CLOSE) { if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk); __mptcp_destroy_sock(sk);
do_cancel_work = true; do_cancel_work = true;
...@@ -2778,9 +2792,6 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -2778,9 +2792,6 @@ static void mptcp_close(struct sock *sk, long timeout)
if (do_cancel_work) if (do_cancel_work)
mptcp_cancel_work(sk); mptcp_cancel_work(sk);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
sock_put(sk); sock_put(sk);
} }
...@@ -2812,15 +2823,38 @@ static int mptcp_disconnect(struct sock *sk, int flags) ...@@ -2812,15 +2823,38 @@ static int mptcp_disconnect(struct sock *sk, int flags)
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
mptcp_do_flush_join_list(msk); inet_sk_state_store(sk, TCP_CLOSE);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
lock_sock(ssk); __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
tcp_disconnect(ssk, flags);
release_sock(ssk);
} }
sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
sk_stop_timer(sk, &sk->sk_timer);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
mptcp_destroy_common(msk);
msk->last_snd = NULL;
WRITE_ONCE(msk->flags, 0);
msk->cb_flags = 0;
msk->push_pending = 0;
msk->recovery = false;
msk->can_ack = false;
msk->fully_established = false;
msk->rcv_data_fin = false;
msk->snd_data_fin_enable = false;
msk->rcv_fastclose = false;
msk->use_64bit_ack = false;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_reset(msk);
mptcp_ca_reset(sk);
sk->sk_shutdown = 0;
sk_error_report(sk);
return 0; return 0;
} }
...@@ -2960,9 +2994,11 @@ void mptcp_destroy_common(struct mptcp_sock *msk) ...@@ -2960,9 +2994,11 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
__mptcp_clear_xmit(sk); __mptcp_clear_xmit(sk);
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */ /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
mptcp_data_lock(sk);
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue); skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_data_unlock(sk);
/* move all the rx fwd alloc into the sk_mem_reclaim_final in /* move all the rx fwd alloc into the sk_mem_reclaim_final in
* inet_sock_destruct() will dispose it * inet_sock_destruct() will dispose it
...@@ -2986,7 +3022,7 @@ void __mptcp_data_acked(struct sock *sk) ...@@ -2986,7 +3022,7 @@ void __mptcp_data_acked(struct sock *sk)
if (!sock_owned_by_user(sk)) if (!sock_owned_by_user(sk))
__mptcp_clean_una(sk); __mptcp_clean_una(sk);
else else
set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
if (mptcp_pending_data_fin_ack(sk)) if (mptcp_pending_data_fin_ack(sk))
mptcp_schedule_work(sk); mptcp_schedule_work(sk);
...@@ -3005,20 +3041,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) ...@@ -3005,20 +3041,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
else if (xmit_ssk) else if (xmit_ssk)
mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND); mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
} else { } else {
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
} }
} }
#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
BIT(MPTCP_RETRANSMIT) | \
BIT(MPTCP_FLUSH_JOIN_LIST))
/* processes deferred events and flush wmem */ /* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk) static void mptcp_release_cb(struct sock *sk)
__must_hold(&sk->sk_lock.slock)
{ {
for (;;) { struct mptcp_sock *msk = mptcp_sk(sk);
unsigned long flags = 0;
if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) for (;;) {
flags |= BIT(MPTCP_PUSH_PENDING); unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags)) msk->push_pending;
flags |= BIT(MPTCP_RETRANSMIT);
if (!flags) if (!flags)
break; break;
...@@ -3029,8 +3068,11 @@ static void mptcp_release_cb(struct sock *sk) ...@@ -3029,8 +3068,11 @@ static void mptcp_release_cb(struct sock *sk)
* datapath acquires the msk socket spinlock while helding * datapath acquires the msk socket spinlock while helding
* the subflow socket lock * the subflow socket lock
*/ */
msk->push_pending = 0;
msk->cb_flags &= ~flags;
spin_unlock_bh(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock);
if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
__mptcp_flush_join_list(sk);
if (flags & BIT(MPTCP_PUSH_PENDING)) if (flags & BIT(MPTCP_PUSH_PENDING))
__mptcp_push_pending(sk, 0); __mptcp_push_pending(sk, 0);
if (flags & BIT(MPTCP_RETRANSMIT)) if (flags & BIT(MPTCP_RETRANSMIT))
...@@ -3043,11 +3085,11 @@ static void mptcp_release_cb(struct sock *sk) ...@@ -3043,11 +3085,11 @@ static void mptcp_release_cb(struct sock *sk)
/* be sure to set the current sk state before tacking actions /* be sure to set the current sk state before tacking actions
* depending on sk_state * depending on sk_state
*/ */
if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
__mptcp_set_connected(sk); __mptcp_set_connected(sk);
if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk); __mptcp_clean_una_wakeup(sk);
if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk); __mptcp_error_report(sk);
__mptcp_update_rmem(sk); __mptcp_update_rmem(sk);
...@@ -3089,7 +3131,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk) ...@@ -3089,7 +3131,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
if (!sock_owned_by_user(sk)) if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk); __mptcp_subflow_push_pending(sk, ssk);
else else
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk); mptcp_data_unlock(sk);
mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND); mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);
} }
...@@ -3175,8 +3217,7 @@ bool mptcp_finish_join(struct sock *ssk) ...@@ -3175,8 +3217,7 @@ bool mptcp_finish_join(struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *parent = (void *)msk; struct sock *parent = (void *)msk;
struct socket *parent_sock; bool ret = true;
bool ret;
pr_debug("msk=%p, subflow=%p", msk, subflow); pr_debug("msk=%p, subflow=%p", msk, subflow);
...@@ -3189,35 +3230,38 @@ bool mptcp_finish_join(struct sock *ssk) ...@@ -3189,35 +3230,38 @@ bool mptcp_finish_join(struct sock *ssk)
if (!msk->pm.server_side) if (!msk->pm.server_side)
goto out; goto out;
if (!mptcp_pm_allow_new_subflow(msk)) { if (!mptcp_pm_allow_new_subflow(msk))
subflow->reset_reason = MPTCP_RST_EPROHIBIT; goto err_prohibited;
return false;
} if (WARN_ON_ONCE(!list_empty(&subflow->node)))
goto err_prohibited;
/* active connections are already on conn_list, and we can't acquire /* active connections are already on conn_list.
* msk lock here. * If we can't acquire msk socket lock here, let the release callback
* use the join list lock as synchronization point and double-check * handle it
* msk status to avoid racing with __mptcp_destroy_sock()
*/ */
spin_lock_bh(&msk->join_list_lock); mptcp_data_lock(parent);
ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; if (!sock_owned_by_user(parent)) {
if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { ret = __mptcp_finish_join(msk, ssk);
list_add_tail(&subflow->node, &msk->join_list); if (ret) {
sock_hold(ssk);
list_add_tail(&subflow->node, &msk->conn_list);
}
} else {
sock_hold(ssk); sock_hold(ssk);
list_add_tail(&subflow->node, &msk->join_list);
__set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
} }
spin_unlock_bh(&msk->join_list_lock); mptcp_data_unlock(parent);
if (!ret) { if (!ret) {
err_prohibited:
subflow->reset_reason = MPTCP_RST_EPROHIBIT; subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false; return false;
} }
/* attach to msk socket only after we are sure he will deal with us
* at close time
*/
parent_sock = READ_ONCE(parent->sk_socket);
if (parent_sock && !ssk->sk_socket)
mptcp_sock_graft(ssk, parent_sock);
subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_seq = READ_ONCE(msk->ack_seq);
out: out:
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
return true; return true;
...@@ -3352,9 +3396,20 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, ...@@ -3352,9 +3396,20 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
struct socket *ssock; struct socket *ssock;
int err; int err = -EINVAL;
lock_sock(sock->sk); lock_sock(sock->sk);
if (uaddr) {
if (addr_len < sizeof(uaddr->sa_family))
goto unlock;
if (uaddr->sa_family == AF_UNSPEC) {
err = mptcp_disconnect(sock->sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto unlock;
}
}
if (sock->state != SS_UNCONNECTED && msk->subflow) { if (sock->state != SS_UNCONNECTED && msk->subflow) {
/* pending connection or invalid state, let existing subflow /* pending connection or invalid state, let existing subflow
* cope with that * cope with that
...@@ -3364,10 +3419,8 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, ...@@ -3364,10 +3419,8 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
} }
ssock = __mptcp_nmpc_socket(msk); ssock = __mptcp_nmpc_socket(msk);
if (!ssock) { if (!ssock)
err = -EINVAL;
goto unlock; goto unlock;
}
mptcp_token_destroy(msk); mptcp_token_destroy(msk);
inet_sk_state_store(sock->sk, TCP_SYN_SENT); inet_sk_state_store(sock->sk, TCP_SYN_SENT);
...@@ -3441,17 +3494,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, ...@@ -3441,17 +3494,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
pr_debug("msk=%p", msk); pr_debug("msk=%p", msk);
lock_sock(sock->sk);
if (sock->sk->sk_state != TCP_LISTEN)
goto unlock_fail;
ssock = __mptcp_nmpc_socket(msk); ssock = __mptcp_nmpc_socket(msk);
if (!ssock) if (!ssock)
goto unlock_fail; return -EINVAL;
clear_bit(MPTCP_DATA_READY, &msk->flags);
sock_hold(ssock->sk);
release_sock(sock->sk);
err = ssock->ops->accept(sock, newsock, flags, kern); err = ssock->ops->accept(sock, newsock, flags, kern);
if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
...@@ -3481,7 +3526,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, ...@@ -3481,7 +3526,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket. /* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack. * This is needed so NOSPACE flag can be set from tcp stack.
*/ */
mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
...@@ -3491,14 +3535,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, ...@@ -3491,14 +3535,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
release_sock(newsk); release_sock(newsk);
} }
if (inet_csk_listen_poll(ssock->sk))
set_bit(MPTCP_DATA_READY, &msk->flags);
sock_put(ssock->sk);
return err; return err;
unlock_fail:
release_sock(sock->sk);
return -EINVAL;
} }
static __poll_t mptcp_check_readable(struct mptcp_sock *msk) static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
...@@ -3544,8 +3581,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, ...@@ -3544,8 +3581,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
state = inet_sk_state_load(sk); state = inet_sk_state_load(sk);
pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
if (state == TCP_LISTEN) if (state == TCP_LISTEN) {
return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0; if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk))
return 0;
return inet_csk_listen_poll(msk->subflow->sk);
}
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk); mask |= mptcp_check_readable(msk);
......
...@@ -110,19 +110,20 @@ ...@@ -110,19 +110,20 @@
/* MPTCP TCPRST flags */ /* MPTCP TCPRST flags */
#define MPTCP_RST_TRANSIENT BIT(0) #define MPTCP_RST_TRANSIENT BIT(0)
/* MPTCP socket flags */ /* MPTCP socket atomic flags */
#define MPTCP_DATA_READY 0
#define MPTCP_NOSPACE 1 #define MPTCP_NOSPACE 1
#define MPTCP_WORK_RTX 2 #define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3 #define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4 #define MPTCP_FALLBACK_DONE 4
#define MPTCP_WORK_CLOSE_SUBFLOW 5 #define MPTCP_WORK_CLOSE_SUBFLOW 5
#define MPTCP_PUSH_PENDING 6
#define MPTCP_CLEAN_UNA 7 /* MPTCP socket release cb flags */
#define MPTCP_ERROR_REPORT 8 #define MPTCP_PUSH_PENDING 1
#define MPTCP_RETRANSMIT 9 #define MPTCP_CLEAN_UNA 2
#define MPTCP_WORK_SYNC_SETSOCKOPT 10 #define MPTCP_ERROR_REPORT 3
#define MPTCP_CONNECTED 11 #define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6
static inline bool before64(__u64 seq1, __u64 seq2) static inline bool before64(__u64 seq1, __u64 seq2)
{ {
...@@ -174,16 +175,25 @@ enum mptcp_pm_status { ...@@ -174,16 +175,25 @@ enum mptcp_pm_status {
MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_ADD_ADDR_SEND_ACK,
MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_RM_ADDR_RECEIVED,
MPTCP_PM_ESTABLISHED, MPTCP_PM_ESTABLISHED,
MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */
MPTCP_PM_SUBFLOW_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED,
MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */
MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is
* accounted int id_avail_bitmap
*/
}; };
/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)
enum mptcp_addr_signal_status { enum mptcp_addr_signal_status {
MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_SIGNAL,
MPTCP_ADD_ADDR_ECHO, MPTCP_ADD_ADDR_ECHO,
MPTCP_RM_ADDR_SIGNAL, MPTCP_RM_ADDR_SIGNAL,
}; };
/* max value of mptcp_addr_info.id */
#define MPTCP_PM_MAX_ADDR_ID U8_MAX
struct mptcp_pm_data { struct mptcp_pm_data {
struct mptcp_addr_info local; struct mptcp_addr_info local;
struct mptcp_addr_info remote; struct mptcp_addr_info remote;
...@@ -202,6 +212,7 @@ struct mptcp_pm_data { ...@@ -202,6 +212,7 @@ struct mptcp_pm_data {
u8 local_addr_used; u8 local_addr_used;
u8 subflows; u8 subflows;
u8 status; u8 status;
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_tx;
struct mptcp_rm_list rm_list_rx; struct mptcp_rm_list rm_list_rx;
}; };
...@@ -241,6 +252,8 @@ struct mptcp_sock { ...@@ -241,6 +252,8 @@ struct mptcp_sock {
u32 token; u32 token;
int rmem_released; int rmem_released;
unsigned long flags; unsigned long flags;
unsigned long cb_flags;
unsigned long push_pending;
bool recovery; /* closing subflow write queue reinjected */ bool recovery; /* closing subflow write queue reinjected */
bool can_ack; bool can_ack;
bool fully_established; bool fully_established;
...@@ -252,7 +265,6 @@ struct mptcp_sock { ...@@ -252,7 +265,6 @@ struct mptcp_sock {
u8 recvmsg_inq:1, u8 recvmsg_inq:1,
cork:1, cork:1,
nodelay:1; nodelay:1;
spinlock_t join_list_lock;
struct work_struct work; struct work_struct work;
struct sk_buff *ooo_last_skb; struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue; struct rb_root out_of_order_queue;
...@@ -395,6 +407,9 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); ...@@ -395,6 +407,9 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
/* MPTCP subflow context */ /* MPTCP subflow context */
struct mptcp_subflow_context { struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */ struct list_head node;/* conn_list of subflows */
char reset_start[0];
unsigned long avg_pacing_rate; /* protected by msk socket lock */ unsigned long avg_pacing_rate; /* protected by msk socket lock */
u64 local_key; u64 local_key;
u64 remote_key; u64 remote_key;
...@@ -423,6 +438,7 @@ struct mptcp_subflow_context { ...@@ -423,6 +438,7 @@ struct mptcp_subflow_context {
backup : 1, backup : 1,
send_mp_prio : 1, send_mp_prio : 1,
send_mp_fail : 1, send_mp_fail : 1,
send_fastclose : 1,
rx_eof : 1, rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */ can_ack : 1, /* only after processing the remote a key */
disposable : 1, /* ctx can be free at ulp release time */ disposable : 1, /* ctx can be free at ulp release time */
...@@ -441,6 +457,9 @@ struct mptcp_subflow_context { ...@@ -441,6 +457,9 @@ struct mptcp_subflow_context {
u8 stale_count; u8 stale_count;
long delegated_status; long delegated_status;
char reset_end[0];
struct list_head delegated_node; /* link into delegated_action, protected by local BH */ struct list_head delegated_node; /* link into delegated_action, protected by local BH */
u32 setsockopt_seq; u32 setsockopt_seq;
...@@ -472,6 +491,13 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) ...@@ -472,6 +491,13 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
return subflow->tcp_sock; return subflow->tcp_sock;
} }
static inline void
mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
{
memset(subflow->reset_start, 0, subflow->reset_end - subflow->reset_start);
subflow->request_mptcp = 1;
}
static inline u64 static inline u64
mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
{ {
...@@ -486,15 +512,6 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) ...@@ -486,15 +512,6 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
} }
static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk,
struct mptcp_subflow_context *subflow)
{
sock_hold(mptcp_subflow_tcp_sock(subflow));
spin_lock_bh(&msk->join_list_lock);
list_add_tail(&subflow->node, &msk->join_list);
spin_unlock_bh(&msk->join_list_lock);
}
void mptcp_subflow_process_delegated(struct sock *ssk); void mptcp_subflow_process_delegated(struct sock *ssk);
static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action)
...@@ -659,7 +676,6 @@ void __mptcp_data_acked(struct sock *sk); ...@@ -659,7 +676,6 @@ void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk); void __mptcp_error_report(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk); void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
void __mptcp_flush_join_list(struct mptcp_sock *msk);
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{ {
return READ_ONCE(msk->snd_data_fin_enable) && return READ_ONCE(msk->snd_data_fin_enable) &&
...@@ -712,6 +728,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); ...@@ -712,6 +728,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
void __init mptcp_pm_init(void); void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_data_reset(struct mptcp_sock *msk);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
...@@ -719,7 +736,9 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, ...@@ -719,7 +736,9 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk,
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk);
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
const struct mptcp_subflow_context *subflow);
void mptcp_pm_add_addr_received(struct mptcp_sock *msk, void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr); const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
...@@ -816,7 +835,7 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); ...@@ -816,7 +835,7 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
void mptcp_sockopt_sync_all(struct mptcp_sock *msk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
{ {
......
...@@ -1285,27 +1285,15 @@ void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) ...@@ -1285,27 +1285,15 @@ void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
} }
} }
void mptcp_sockopt_sync_all(struct mptcp_sock *msk) void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
{ {
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
u32 seq;
seq = sockopt_seq_reset(sk);
mptcp_for_each_subflow(msk, subflow) { msk_owned_by_me(msk);
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
u32 sseq = READ_ONCE(subflow->setsockopt_seq);
if (sseq != msk->setsockopt_seq) { if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
__mptcp_sockopt_sync(msk, ssk); sync_socket_options(msk, ssk);
WRITE_ONCE(subflow->setsockopt_seq, seq);
} else if (sseq != seq) {
WRITE_ONCE(subflow->setsockopt_seq, seq);
}
cond_resched(); subflow->setsockopt_seq = msk->setsockopt_seq;
} }
msk->setsockopt_seq = seq;
} }
...@@ -388,7 +388,7 @@ static void mptcp_set_connected(struct sock *sk) ...@@ -388,7 +388,7 @@ static void mptcp_set_connected(struct sock *sk)
if (!sock_owned_by_user(sk)) if (!sock_owned_by_user(sk))
__mptcp_set_connected(sk); __mptcp_set_connected(sk);
else else
set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk); mptcp_data_unlock(sk);
} }
...@@ -1274,7 +1274,7 @@ static void subflow_error_report(struct sock *ssk) ...@@ -1274,7 +1274,7 @@ static void subflow_error_report(struct sock *ssk)
if (!sock_owned_by_user(sk)) if (!sock_owned_by_user(sk))
__mptcp_error_report(sk); __mptcp_error_report(sk);
else else
set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk); mptcp_data_unlock(sk);
} }
...@@ -1293,7 +1293,6 @@ static void subflow_data_ready(struct sock *sk) ...@@ -1293,7 +1293,6 @@ static void subflow_data_ready(struct sock *sk)
if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
return; return;
set_bit(MPTCP_DATA_READY, &msk->flags);
parent->sk_data_ready(parent); parent->sk_data_ready(parent);
return; return;
} }
...@@ -1442,7 +1441,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, ...@@ -1442,7 +1441,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr, ssk->sk_family); mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
mptcp_add_pending_subflow(msk, subflow); sock_hold(ssk);
list_add_tail(&subflow->node, &msk->conn_list);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
if (err && err != -EINPROGRESS) if (err && err != -EINPROGRESS)
goto failed_unlink; goto failed_unlink;
...@@ -1453,9 +1453,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, ...@@ -1453,9 +1453,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
return err; return err;
failed_unlink: failed_unlink:
spin_lock_bh(&msk->join_list_lock);
list_del(&subflow->node); list_del(&subflow->node);
spin_unlock_bh(&msk->join_list_lock);
sock_put(mptcp_subflow_tcp_sock(subflow)); sock_put(mptcp_subflow_tcp_sock(subflow));
failed: failed:
......
...@@ -384,6 +384,7 @@ void mptcp_token_destroy(struct mptcp_sock *msk) ...@@ -384,6 +384,7 @@ void mptcp_token_destroy(struct mptcp_sock *msk)
bucket->chain_len--; bucket->chain_len--;
} }
spin_unlock_bh(&bucket->lock); spin_unlock_bh(&bucket->lock);
WRITE_ONCE(msk->token, 0);
} }
void __init mptcp_token_init(void) void __init mptcp_token_init(void)
......
...@@ -17,4 +17,5 @@ CONFIG_NFT_TPROXY=m ...@@ -17,4 +17,5 @@ CONFIG_NFT_TPROXY=m
CONFIG_NFT_SOCKET=m CONFIG_NFT_SOCKET=m
CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_MULTIPLE_TABLES=y
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <unistd.h> #include <unistd.h>
#include <time.h> #include <time.h>
#include <sys/ioctl.h>
#include <sys/poll.h> #include <sys/poll.h>
#include <sys/sendfile.h> #include <sys/sendfile.h>
#include <sys/stat.h> #include <sys/stat.h>
...@@ -28,6 +29,7 @@ ...@@ -28,6 +29,7 @@
#include <linux/tcp.h> #include <linux/tcp.h>
#include <linux/time_types.h> #include <linux/time_types.h>
#include <linux/sockios.h>
extern int optind; extern int optind;
...@@ -68,6 +70,8 @@ static unsigned int cfg_time; ...@@ -68,6 +70,8 @@ static unsigned int cfg_time;
static unsigned int cfg_do_w; static unsigned int cfg_do_w;
static int cfg_wait; static int cfg_wait;
static uint32_t cfg_mark; static uint32_t cfg_mark;
static char *cfg_input;
static int cfg_repeat = 1;
struct cfg_cmsg_types { struct cfg_cmsg_types {
unsigned int cmsg_enabled:1; unsigned int cmsg_enabled:1;
...@@ -91,22 +95,31 @@ static struct cfg_sockopt_types cfg_sockopt_types; ...@@ -91,22 +95,31 @@ static struct cfg_sockopt_types cfg_sockopt_types;
static void die_usage(void) static void die_usage(void)
{ {
fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]" fprintf(stderr, "Usage: mptcp_connect [-6] [-c cmsg] [-i file] [-I num] [-j] [-l] "
"[-l] [-w sec] [-t num] [-T num] connect_address\n"); "[-m mode] [-M mark] [-o option] [-p port] [-P mode] [-j] [-l] [-r num] "
"[-s MPTCP|TCP] [-S num] [-r num] [-t num] [-T num] [-u] [-w sec] connect_address\n");
fprintf(stderr, "\t-6 use ipv6\n"); fprintf(stderr, "\t-6 use ipv6\n");
fprintf(stderr, "\t-t num -- set poll timeout to num\n"); fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
fprintf(stderr, "\t-T num -- set expected runtime to num ms\n"); fprintf(stderr, "\t-i file -- read the data to send from the given file instead of stdin");
fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n"); fprintf(stderr, "\t-I num -- repeat the transfer 'num' times. In listen mode accepts num "
fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n"); "incoming connections, in client mode, disconnect and reconnect to the server\n");
fprintf(stderr, "\t-p num -- use port num\n"); fprintf(stderr, "\t-j -- add additional sleep at connection start and tear down "
fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n"); "-- for MPJ tests\n");
fprintf(stderr, "\t-l -- listens mode, accepts incoming connection\n");
fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-M mark -- set socket packet mark\n");
fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
fprintf(stderr, "\t-o option -- test sockopt <option>\n"); fprintf(stderr, "\t-o option -- test sockopt <option>\n");
fprintf(stderr, "\t-p num -- use port num\n");
fprintf(stderr, fprintf(stderr,
"\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n"); "\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n");
fprintf(stderr, "\t-t num -- set poll timeout to num\n");
fprintf(stderr, "\t-T num -- set expected runtime to num ms\n");
fprintf(stderr, "\t-r num -- enable slow mode, limiting each write to num bytes "
"-- for remove addr tests\n");
fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n");
fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n");
fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
exit(1); exit(1);
} }
...@@ -310,7 +323,8 @@ static int sock_listen_mptcp(const char * const listenaddr, ...@@ -310,7 +323,8 @@ static int sock_listen_mptcp(const char * const listenaddr,
} }
static int sock_connect_mptcp(const char * const remoteaddr, static int sock_connect_mptcp(const char * const remoteaddr,
const char * const port, int proto) const char * const port, int proto,
struct addrinfo **peer)
{ {
struct addrinfo hints = { struct addrinfo hints = {
.ai_protocol = IPPROTO_TCP, .ai_protocol = IPPROTO_TCP,
...@@ -334,8 +348,10 @@ static int sock_connect_mptcp(const char * const remoteaddr, ...@@ -334,8 +348,10 @@ static int sock_connect_mptcp(const char * const remoteaddr,
if (cfg_mark) if (cfg_mark)
set_mark(sock, cfg_mark); set_mark(sock, cfg_mark);
if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) {
*peer = a;
break; /* success */ break; /* success */
}
perror("connect()"); perror("connect()");
close(sock); close(sock);
...@@ -524,14 +540,17 @@ static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) ...@@ -524,14 +540,17 @@ static ssize_t do_rnd_read(const int fd, char *buf, const size_t len)
return ret; return ret;
} }
static void set_nonblock(int fd) static void set_nonblock(int fd, bool nonblock)
{ {
int flags = fcntl(fd, F_GETFL); int flags = fcntl(fd, F_GETFL);
if (flags == -1) if (flags == -1)
return; return;
if (nonblock)
fcntl(fd, F_SETFL, flags | O_NONBLOCK); fcntl(fd, F_SETFL, flags | O_NONBLOCK);
else
fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
} }
static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after_out) static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after_out)
...@@ -543,7 +562,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after ...@@ -543,7 +562,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
unsigned int woff = 0, wlen = 0; unsigned int woff = 0, wlen = 0;
char wbuf[8192]; char wbuf[8192];
set_nonblock(peerfd); set_nonblock(peerfd, true);
for (;;) { for (;;) {
char rbuf[8192]; char rbuf[8192];
...@@ -638,7 +657,6 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after ...@@ -638,7 +657,6 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
if (cfg_remove) if (cfg_remove)
usleep(cfg_wait); usleep(cfg_wait);
close(peerfd);
return 0; return 0;
} }
...@@ -780,7 +798,7 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd, ...@@ -780,7 +798,7 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
return err; return err;
} }
static int copyfd_io(int infd, int peerfd, int outfd) static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd)
{ {
bool in_closed_after_out = false; bool in_closed_after_out = false;
struct timespec start, end; struct timespec start, end;
...@@ -819,6 +837,9 @@ static int copyfd_io(int infd, int peerfd, int outfd) ...@@ -819,6 +837,9 @@ static int copyfd_io(int infd, int peerfd, int outfd)
if (ret) if (ret)
return ret; return ret;
if (close_peerfd)
close(peerfd);
if (cfg_time) { if (cfg_time) {
unsigned int delta_ms; unsigned int delta_ms;
...@@ -930,7 +951,7 @@ static void maybe_close(int fd) ...@@ -930,7 +951,7 @@ static void maybe_close(int fd)
{ {
unsigned int r = rand(); unsigned int r = rand();
if (!(cfg_join || cfg_remove) && (r & 1)) if (!(cfg_join || cfg_remove || cfg_repeat > 1) && (r & 1))
close(fd); close(fd);
} }
...@@ -940,7 +961,9 @@ int main_loop_s(int listensock) ...@@ -940,7 +961,9 @@ int main_loop_s(int listensock)
struct pollfd polls; struct pollfd polls;
socklen_t salen; socklen_t salen;
int remotesock; int remotesock;
int fd = 0;
again:
polls.fd = listensock; polls.fd = listensock;
polls.events = POLLIN; polls.events = POLLIN;
...@@ -961,14 +984,27 @@ int main_loop_s(int listensock) ...@@ -961,14 +984,27 @@ int main_loop_s(int listensock)
check_sockaddr(pf, &ss, salen); check_sockaddr(pf, &ss, salen);
check_getpeername(remotesock, &ss, salen); check_getpeername(remotesock, &ss, salen);
if (cfg_input) {
fd = open(cfg_input, O_RDONLY);
if (fd < 0)
xerror("can't open %s: %d", cfg_input, errno);
}
SOCK_TEST_TCPULP(remotesock, 0); SOCK_TEST_TCPULP(remotesock, 0);
return copyfd_io(0, remotesock, 1); copyfd_io(fd, remotesock, 1, true);
} else {
perror("accept");
return 1;
} }
perror("accept"); if (--cfg_repeat > 0) {
if (cfg_input)
close(fd);
goto again;
}
return 1; return 0;
} }
static void init_rng(void) static void init_rng(void)
...@@ -1057,15 +1093,47 @@ static void parse_setsock_options(const char *name) ...@@ -1057,15 +1093,47 @@ static void parse_setsock_options(const char *name)
exit(1); exit(1);
} }
void xdisconnect(int fd, int addrlen)
{
struct sockaddr_storage empty;
int msec_sleep = 10;
int queued = 1;
int i;
shutdown(fd, SHUT_WR);
/* while until the pending data is completely flushed, the later
* disconnect will bypass/ignore/drop any pending data.
*/
for (i = 0; ; i += msec_sleep) {
if (ioctl(fd, SIOCOUTQ, &queued) < 0)
xerror("can't query out socket queue: %d", errno);
if (!queued)
break;
if (i > poll_timeout)
xerror("timeout while waiting for spool to complete");
usleep(msec_sleep * 1000);
}
memset(&empty, 0, sizeof(empty));
empty.ss_family = AF_UNSPEC;
if (connect(fd, (struct sockaddr *)&empty, addrlen) < 0)
xerror("can't disconnect: %d", errno);
}
int main_loop(void) int main_loop(void)
{ {
int fd; int fd, ret, fd_in = 0;
struct addrinfo *peer;
/* listener is ready. */ /* listener is ready. */
fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto); fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto, &peer);
if (fd < 0) if (fd < 0)
return 2; return 2;
again:
check_getpeername_connect(fd); check_getpeername_connect(fd);
SOCK_TEST_TCPULP(fd, cfg_sock_proto); SOCK_TEST_TCPULP(fd, cfg_sock_proto);
...@@ -1077,7 +1145,31 @@ int main_loop(void) ...@@ -1077,7 +1145,31 @@ int main_loop(void)
if (cfg_cmsg_types.cmsg_enabled) if (cfg_cmsg_types.cmsg_enabled)
apply_cmsg_types(fd, &cfg_cmsg_types); apply_cmsg_types(fd, &cfg_cmsg_types);
return copyfd_io(0, fd, 1); if (cfg_input) {
fd_in = open(cfg_input, O_RDONLY);
if (fd < 0)
xerror("can't open %s:%d", cfg_input, errno);
}
/* close the client socket open only if we are not going to reconnect */
ret = copyfd_io(fd_in, fd, 1, cfg_repeat == 1);
if (ret)
return ret;
if (--cfg_repeat > 0) {
xdisconnect(fd, peer->ai_addrlen);
/* the socket could be unblocking at this point, we need the
* connect to be blocking
*/
set_nonblock(fd, false);
if (connect(fd, peer->ai_addr, peer->ai_addrlen))
xerror("can't reconnect: %d", errno);
if (cfg_input)
close(fd_in);
goto again;
}
return 0;
} }
int parse_proto(const char *proto) int parse_proto(const char *proto)
...@@ -1162,7 +1254,7 @@ static void parse_opts(int argc, char **argv) ...@@ -1162,7 +1254,7 @@ static void parse_opts(int argc, char **argv)
{ {
int c; int c;
while ((c = getopt(argc, argv, "6jr:lp:s:ht:T:m:S:R:w:M:P:c:o:")) != -1) { while ((c = getopt(argc, argv, "6c:hi:I:jlm:M:o:p:P:r:R:s:S:t:T:w:")) != -1) {
switch (c) { switch (c) {
case 'j': case 'j':
cfg_join = true; cfg_join = true;
...@@ -1176,6 +1268,12 @@ static void parse_opts(int argc, char **argv) ...@@ -1176,6 +1268,12 @@ static void parse_opts(int argc, char **argv)
if (cfg_do_w <= 0) if (cfg_do_w <= 0)
cfg_do_w = 50; cfg_do_w = 50;
break; break;
case 'i':
cfg_input = optarg;
break;
case 'I':
cfg_repeat = atoi(optarg);
break;
case 'l': case 'l':
listen_mode = true; listen_mode = true;
break; break;
......
...@@ -7,6 +7,7 @@ optstring="S:R:d:e:l:r:h4cm:f:tC" ...@@ -7,6 +7,7 @@ optstring="S:R:d:e:l:r:h4cm:f:tC"
ret=0 ret=0
sin="" sin=""
sout="" sout=""
cin_disconnect=""
cin="" cin=""
cout="" cout=""
ksft_skip=4 ksft_skip=4
...@@ -24,6 +25,7 @@ options_log=true ...@@ -24,6 +25,7 @@ options_log=true
do_tcp=0 do_tcp=0
checksum=false checksum=false
filesize=0 filesize=0
connect_per_transfer=1
if [ $tc_loss -eq 100 ];then if [ $tc_loss -eq 100 ];then
tc_loss=1% tc_loss=1%
...@@ -127,6 +129,7 @@ TEST_COUNT=0 ...@@ -127,6 +129,7 @@ TEST_COUNT=0
cleanup() cleanup()
{ {
rm -f "$cin_disconnect" "$cout_disconnect"
rm -f "$cin" "$cout" rm -f "$cin" "$cout"
rm -f "$sin" "$sout" rm -f "$sin" "$sout"
rm -f "$capout" rm -f "$capout"
...@@ -149,6 +152,8 @@ sout=$(mktemp) ...@@ -149,6 +152,8 @@ sout=$(mktemp)
cin=$(mktemp) cin=$(mktemp)
cout=$(mktemp) cout=$(mktemp)
capout=$(mktemp) capout=$(mktemp)
cin_disconnect="$cin".disconnect
cout_disconnect="$cout".disconnect
trap cleanup EXIT trap cleanup EXIT
for i in "$ns1" "$ns2" "$ns3" "$ns4";do for i in "$ns1" "$ns2" "$ns3" "$ns4";do
...@@ -500,8 +505,8 @@ do_transfer() ...@@ -500,8 +505,8 @@ do_transfer()
cookies=${cookies##*=} cookies=${cookies##*=}
if [ ${cl_proto} = "MPTCP" ] && [ ${srv_proto} = "MPTCP" ]; then if [ ${cl_proto} = "MPTCP" ] && [ ${srv_proto} = "MPTCP" ]; then
expect_synrx=$((stat_synrx_last_l+1)) expect_synrx=$((stat_synrx_last_l+$connect_per_transfer))
expect_ackrx=$((stat_ackrx_last_l+1)) expect_ackrx=$((stat_ackrx_last_l+$connect_per_transfer))
fi fi
if [ ${stat_synrx_now_l} -lt ${expect_synrx} ]; then if [ ${stat_synrx_now_l} -lt ${expect_synrx} ]; then
...@@ -738,6 +743,33 @@ run_tests_peekmode() ...@@ -738,6 +743,33 @@ run_tests_peekmode()
run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}" run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}"
} }
run_tests_disconnect()
{
local peekmode="$1"
local old_cin=$cin
local old_sin=$sin
cat $cin $cin $cin > "$cin".disconnect
# force do_transfer to cope with the multiple tranmissions
sin="$cin.disconnect"
sin_disconnect=$old_sin
cin="$cin.disconnect"
cin_disconnect="$old_cin"
connect_per_transfer=3
echo "INFO: disconnect"
run_tests_lo "$ns1" "$ns1" 10.0.1.1 1 "-I 3 -i $old_cin"
run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-I 3 -i $old_cin"
# restore previous status
cout=$old_cout
cout_disconnect="$cout".disconnect
cin=$old_cin
cin_disconnect="$cin".disconnect
connect_per_transfer=1
}
display_time() display_time()
{ {
time_end=$(date +%s) time_end=$(date +%s)
...@@ -853,6 +885,9 @@ stop_if_error "Tests with peek mode have failed" ...@@ -853,6 +885,9 @@ stop_if_error "Tests with peek mode have failed"
# connect to ns4 ip address, ns2 should intercept/proxy # connect to ns4 ip address, ns2 should intercept/proxy
run_test_transparent 10.0.3.1 "tproxy ipv4" run_test_transparent 10.0.3.1 "tproxy ipv4"
run_test_transparent dead:beef:3::1 "tproxy ipv6" run_test_transparent dead:beef:3::1 "tproxy ipv6"
stop_if_error "Tests with tproxy have failed"
run_tests_disconnect
display_time display_time
exit $ret exit $ret
...@@ -937,6 +937,22 @@ chk_link_usage() ...@@ -937,6 +937,22 @@ chk_link_usage()
fi fi
} }
wait_for_tw()
{
local timeout_ms=$((timeout_poll * 1000))
local time=0
local ns=$1
while [ $time -lt $timeout_ms ]; do
local cnt=$(ip netns exec $ns ss -t state time-wait |wc -l)
[ "$cnt" = 1 ] && return 1
time=$((time + 100))
sleep 0.1
done
return 1
}
subflows_tests() subflows_tests()
{ {
reset reset
...@@ -994,6 +1010,61 @@ subflows_tests() ...@@ -994,6 +1010,61 @@ subflows_tests()
chk_join_nr "single subflow, dev" 1 1 1 chk_join_nr "single subflow, dev" 1 1 1
} }
subflows_error_tests()
{
# If a single subflow is configured, and matches the MPC src
# address, no additional subflow should be created
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl add 10.0.1.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
chk_join_nr "no MPC reuse with single endpoint" 0 0 0
# multiple subflows, with subflow creation error
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j REJECT
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
chk_join_nr "multi subflows, with failing subflow" 1 1 1
# multiple subflows, with subflow timeout on MPJ
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j DROP
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
chk_join_nr "multi subflows, with subflow timeout" 1 1 1
# multiple subflows, check that the endpoint corresponding to
# closed subflow (due to reset) is not reused if additional
# subflows are added later
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j REJECT
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow &
# updates in the child shell do not have any effect here, we
# need to bump the test counter for the above case
TEST_COUNT=$((TEST_COUNT+1))
# mpj subflow will be in TW after the reset
wait_for_tw $ns2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
wait
# additional subflow could be created only if the PM select
# the later endpoint, skipping the already used one
chk_join_nr "multi subflows, fair usage on close" 1 1 1
}
signal_address_tests() signal_address_tests()
{ {
# add_address, unused # add_address, unused
...@@ -1071,7 +1142,10 @@ signal_address_tests() ...@@ -1071,7 +1142,10 @@ signal_address_tests()
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal
run_tests $ns1 $ns2 10.0.1.1 run_tests $ns1 $ns2 10.0.1.1
chk_add_nr 4 4
# the server will not signal the address terminating
# the MPC subflow
chk_add_nr 3 3
} }
link_failure_tests() link_failure_tests()
...@@ -1802,6 +1876,7 @@ fullmesh_tests() ...@@ -1802,6 +1876,7 @@ fullmesh_tests()
all_tests() all_tests()
{ {
subflows_tests subflows_tests
subflows_error_tests
signal_address_tests signal_address_tests
link_failure_tests link_failure_tests
add_addr_timeout_tests add_addr_timeout_tests
...@@ -1821,6 +1896,7 @@ usage() ...@@ -1821,6 +1896,7 @@ usage()
{ {
echo "mptcp_join usage:" echo "mptcp_join usage:"
echo " -f subflows_tests" echo " -f subflows_tests"
echo " -e subflows_error_tests"
echo " -s signal_address_tests" echo " -s signal_address_tests"
echo " -l link_failure_tests" echo " -l link_failure_tests"
echo " -t add_addr_timeout_tests" echo " -t add_addr_timeout_tests"
...@@ -1869,11 +1945,14 @@ if [ $do_all_tests -eq 1 ]; then ...@@ -1869,11 +1945,14 @@ if [ $do_all_tests -eq 1 ]; then
exit $ret exit $ret
fi fi
while getopts 'fsltra64bpkdmchCS' opt; do while getopts 'fesltra64bpkdmchCS' opt; do
case $opt in case $opt in
f) f)
subflows_tests subflows_tests
;; ;;
e)
subflows_error_tests
;;
s) s)
signal_address_tests signal_address_tests
;; ;;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment