Commit 4994d4fa authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'mptcp-path-manager-mode-selection'

Mat Martineau says:

====================
mptcp: Path manager mode selection

MPTCP already has an in-kernel path manager (PM) to add and remove TCP
subflows associated with a given MPTCP connection. This in-kernel PM has
been designed to handle typical server-side use cases, but is not very
flexible or configurable for client devices that may have more
complicated policies to implement.

This patch series from the MPTCP tree is the first step toward adding a
generic-netlink-based API for MPTCP path management, which a privileged
userspace daemon will be able to use to control subflow
establishment. These patches add a per-namespace sysctl to select the
default PM type (in-kernel or userspace) for new MPTCP sockets. New
self-tests confirm expected behavior when userspace PM is selected but
there is no daemon available to handle existing MPTCP PM events.

Subsequent patch series (already staged in the MPTCP tree) will add the
generic netlink path management API.
====================

Link: https://lore.kernel.org/r/20220427225002.231996-1-mathew.j.martineau@linux.intel.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents a41c653d 5ac1d2d6
......@@ -46,6 +46,24 @@ allow_join_initial_addr_port - BOOLEAN
Default: 1
pm_type - INTEGER
Set the default path manager type to use for each new MPTCP
socket. In-kernel path management will control subflow
connections and address advertisements according to
per-namespace values configured over the MPTCP netlink
API. Userspace path management puts per-MPTCP-connection subflow
connection decisions and address advertisements under control of
a privileged userspace program, at the cost of more netlink
traffic to propagate all of the related events and commands.
This is a per-namespace sysctl.
* 0 - In-kernel path manager
* 1 - Userspace path manager
Default: 0
stale_loss_cnt - INTEGER
The number of MPTCP-level retransmission intervals with no traffic and
pending outstanding data on a given subflow required to declare it stale.
......
......@@ -16,6 +16,11 @@
#define MPTCP_SYSCTL_PATH "net/mptcp"
static int mptcp_pernet_id;
#ifdef CONFIG_SYSCTL
static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
#endif
struct mptcp_pernet {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *ctl_table_hdr;
......@@ -26,6 +31,7 @@ struct mptcp_pernet {
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
......@@ -58,6 +64,11 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
return mptcp_get_pernet(net)->stale_loss_cnt;
}
int mptcp_get_pm_type(const struct net *net)
{
return mptcp_get_pernet(net)->pm_type;
}
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
......@@ -65,6 +76,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
}
#ifdef CONFIG_SYSCTL
......@@ -108,6 +120,14 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
},
{
.procname = "pm_type",
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
{}
};
......@@ -128,6 +148,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[2].data = &pernet->checksum_enabled;
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
......
......@@ -208,7 +208,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
spin_lock_bh(&pm->lock);
if (!READ_ONCE(pm->accept_addr)) {
if (!READ_ONCE(pm->accept_addr) || mptcp_pm_is_userspace(msk)) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
......@@ -415,21 +415,41 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
msk->pm.rm_list_tx.nr = 0;
msk->pm.rm_list_rx.nr = 0;
WRITE_ONCE(msk->pm.work_pending, false);
WRITE_ONCE(msk->pm.addr_signal, 0);
WRITE_ONCE(msk->pm.accept_addr, false);
WRITE_ONCE(msk->pm.accept_subflow, false);
WRITE_ONCE(msk->pm.remote_deny_join_id0, false);
msk->pm.status = 0;
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
struct mptcp_pm_data *pm = &msk->pm;
pm->add_addr_signaled = 0;
pm->add_addr_accepted = 0;
pm->local_addr_used = 0;
pm->subflows = 0;
pm->rm_list_tx.nr = 0;
pm->rm_list_rx.nr = 0;
WRITE_ONCE(pm->pm_type, pm_type);
mptcp_pm_nl_data_init(msk);
if (pm_type == MPTCP_PM_TYPE_KERNEL) {
bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
/* pm->work_pending must be only be set to 'true' when
* pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
*/
WRITE_ONCE(pm->work_pending,
(!!mptcp_pm_get_local_addr_max(msk) &&
subflows_allowed) ||
!!mptcp_pm_get_add_addr_signal_max(msk));
WRITE_ONCE(pm->accept_addr,
!!mptcp_pm_get_add_addr_accept_max(msk) &&
subflows_allowed);
WRITE_ONCE(pm->accept_subflow, subflows_allowed);
} else {
WRITE_ONCE(pm->work_pending, 0);
WRITE_ONCE(pm->accept_addr, 0);
WRITE_ONCE(pm->accept_subflow, 0);
}
WRITE_ONCE(pm->addr_signal, 0);
WRITE_ONCE(pm->remote_deny_join_id0, false);
pm->status = 0;
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
......
......@@ -1061,18 +1061,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return ret;
}
void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
bool subflows;
subflows = !!mptcp_pm_get_subflows_max(msk);
WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) ||
!!mptcp_pm_get_add_addr_signal_max(msk));
WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows);
WRITE_ONCE(pm->accept_subflow, subflows);
}
#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1
......@@ -1232,7 +1220,8 @@ static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (!READ_ONCE(msk->fully_established))
if (!READ_ONCE(msk->fully_established) ||
mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
......@@ -1375,6 +1364,9 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
struct sock *sk = (struct sock *)msk;
bool remove_subflow;
if (mptcp_pm_is_userspace(msk))
goto next;
if (list_empty(&msk->conn_list)) {
mptcp_pm_remove_anno_addr(msk, addr, false);
goto next;
......@@ -1409,7 +1401,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net,
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info msk_local;
if (list_empty(&msk->conn_list))
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
local_address((struct sock_common *)msk, &msk_local);
......@@ -1516,9 +1508,11 @@ static void mptcp_nl_remove_addrs_list(struct net *net,
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (!mptcp_pm_is_userspace(msk)) {
lock_sock(sk);
mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
release_sock(sk);
}
sock_put(sk);
cond_resched();
......@@ -1791,7 +1785,7 @@ static int mptcp_nl_set_flags(struct net *net,
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (list_empty(&msk->conn_list))
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
......
......@@ -184,6 +184,14 @@ enum mptcp_pm_status {
*/
};
enum mptcp_pm_type {
MPTCP_PM_TYPE_KERNEL = 0,
MPTCP_PM_TYPE_USERSPACE,
__MPTCP_PM_TYPE_NR,
__MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1,
};
/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)
......@@ -212,6 +220,7 @@ struct mptcp_pm_data {
u8 add_addr_signaled;
u8 add_addr_accepted;
u8 local_addr_used;
u8 pm_type;
u8 subflows;
u8 status;
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
......@@ -576,6 +585,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
......@@ -796,6 +806,11 @@ static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL);
}
static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE;
}
static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
{
u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
......@@ -828,7 +843,6 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_work(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
......
......@@ -70,6 +70,7 @@ init_partial()
ip netns add $netns || exit $ksft_skip
ip -net $netns link set lo up
ip netns exec $netns sysctl -q net.mptcp.enabled=1
ip netns exec $netns sysctl -q net.mptcp.pm_type=0
ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
if [ $checksum -eq 1 ]; then
......@@ -1611,6 +1612,13 @@ wait_attempt_fail()
return 1
}
set_userspace_pm()
{
local ns=$1
ip netns exec $ns sysctl -q net.mptcp.pm_type=1
}
subflows_tests()
{
if reset "no JOIN"; then
......@@ -2698,6 +2706,63 @@ fail_tests()
fi
}
userspace_tests()
{
# userspace pm type prevents add_addr
if reset "userspace pm type prevents add_addr"; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
chk_add_nr 0 0
fi
# userspace pm type rejects join
if reset "userspace pm type rejects join"; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 0
fi
# userspace pm type does not send join
if reset "userspace pm type does not send join"; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
fi
# userspace pm type prevents mp_prio
if reset "userspace pm type prevents mp_prio"; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup
chk_join_nr 1 1 0
chk_prio_nr 0 0
fi
# userspace pm type prevents rm_addr
if reset "userspace pm type prevents rm_addr"; then
set_userspace_pm $ns1
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
pm_nl_set_limits $ns2 0 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1 0 0 -1 slow
chk_join_nr 0 0 0
chk_rm_nr 0 0
fi
}
implicit_tests()
{
# userspace pm type prevents add_addr
......@@ -2767,6 +2832,7 @@ all_tests_sorted=(
m@fullmesh_tests
z@fastclose_tests
F@fail_tests
u@userspace_tests
I@implicit_tests
)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment