Commit 197060c1 authored by David S. Miller

Merge branch 'mptcp-fastclose'

Mat Martineau says:

====================
mptcp: Fastclose edge cases and error handling

MPTCP already has code to use the MP_FASTCLOSE option header, which works
like a RST for the whole MPTCP-level connection (a regular RST only affects
the individual subflow that carries it). This series improves fastclose
handling in a few areas.
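
For context, the closest userspace analog of this abort semantic is a
linger-0 close on a plain TCP socket, which makes close() send a RST instead
of a FIN; the series mirrors that abort at the MPTCP level with MP_FASTCLOSE,
and the kernel-side counterpart shows up further down in this diff, where
__mptcp_close_ssk() sets SOCK_LINGER with a zero sk_lingertime on the subflow
to force the tcp_disconnect() reset path. A minimal illustrative sketch (the
helper name is made up, error handling is omitted):

#include <sys/socket.h>
#include <unistd.h>

/* Abortive close: with SO_LINGER enabled and a zero timeout, close() on a
 * TCP socket sends a RST instead of a FIN. Whether and when an MPTCP socket
 * emits the equivalent MP_FASTCLOSE is what the patches below decide.
 */
static void abortive_close(int fd)
{
	struct linger lg = {
		.l_onoff  = 1,	/* linger on close ... */
		.l_linger = 0,	/* ... for zero seconds: abort instead of FIN */
	};

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
	close(fd);
}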

Patch 1 aligns the socket errors reported after a fastclose with the errors
TCP reports on plain TCP sockets after a RST.
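
An illustrative sketch of what that alignment means for an application
(assumptions: a blocking MPTCP socket on an established connection whose
peer has just sent MP_FASTCLOSE; the helper name is invented). The exact
errno follows the tcp_reset() mirroring further down: ECONNREFUSED in
SYN_SENT, EPIPE in CLOSE_WAIT, ECONNRESET otherwise.

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

/* After the peer fastcloses an established MPTCP connection, a read should
 * fail with ECONNRESET, just as it would on a TCP socket hit by a RST.
 */
static void expect_reset(int fd)
{
	char buf[128];

	if (recv(fd, buf, sizeof(buf), 0) < 0 && errno == ECONNRESET)
		printf("connection was aborted by the peer\n");
}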

Patch 2 adds use of MP_FASTCLOSE in some more edge cases, like file
descriptor close, FIN_WAIT timeout, and when the socket has unread data.
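
A hypothetical userspace sequence for the unread-data case (illustrative
only; it assumes the peer queues data that the local side never reads, and
omits error handling): TCP already answers such a close with a RST, and with
this patch the MPTCP socket answers it with MP_FASTCLOSE.

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* may be missing from older uapi headers */
#endif

/* Connect, let the peer send something, never read it, then close():
 * the pending receive-queue data turns the close into an abort.
 */
static void close_with_unread_data(const struct sockaddr_in *peer)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	connect(fd, (const struct sockaddr *)peer, sizeof(*peer));
	sleep(1);		/* peer is assumed to have sent data by now */
	close(fd);		/* unread data -> MP_FASTCLOSE on the wire */
}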

Patch 3 updates the fastclose self tests.

Patch 4 does not change any code, just fixes some outdated comments.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7171e8a1 d89e3ed7
......@@ -662,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
skb = skb_peek(&ssk->sk_receive_queue);
if (!skb) {
/* if no data is found, a racing workqueue/recvmsg
* already processed the new data, stop here or we
* can enter an infinite loop
/* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
* a different CPU can have already processed the pending
* data, stop here or we can enter an infinite loop
*/
if (!moved)
done = true;
......@@ -672,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
if (__mptcp_check_fallback(msk)) {
/* if we are running under the workqueue, TCP could have
* collapsed skbs between dummy map creation and now
* be sure to adjust the size
/* Under fallback skbs have no MPTCP extension and TCP could
* collapse them between the dummy map creation and the
* current dequeue. Be sure to adjust the map size.
*/
map_remaining = skb->len;
subflow->map_data_len = skb->len;
......@@ -1707,7 +1707,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto out;
} else if (ret) {
release_sock(ssk);
goto out;
goto do_error;
}
release_sock(ssk);
}
......@@ -1717,9 +1717,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
ret = sk_stream_wait_connect(sk, &timeo);
if (ret)
goto out;
goto do_error;
}
ret = -EPIPE;
if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
goto do_error;
pfrag = sk_page_frag(sk);
while (msg_data_left(msg)) {
......@@ -1728,11 +1732,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
bool dfrag_collapsed;
size_t psize, offset;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
ret = -EPIPE;
goto out;
}
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
*/
......@@ -1764,7 +1763,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
ret = -EFAULT;
goto out;
goto do_error;
}
/* data successfully copied into the write queue */
......@@ -1796,7 +1795,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
goto do_error;
}
if (copied)
......@@ -1804,7 +1803,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
out:
release_sock(sk);
return copied ? : ret;
return copied;
do_error:
if (copied)
goto out;
copied = sk_stream_error(sk, msg->msg_flags, ret);
goto out;
}
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
......@@ -2307,8 +2313,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
if (flags & MPTCP_CF_FASTCLOSE)
if (flags & MPTCP_CF_FASTCLOSE) {
/* be sure to force the tcp_disconnect() path,
* to generate the egress reset
*/
ssk->sk_lingertime = 0;
sock_set_flag(ssk, SOCK_LINGER);
subflow->send_fastclose = 1;
}
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
......@@ -2441,12 +2453,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
unlock_sock_fast(tcp_sk, slow);
}
/* Mirror the tcp_reset() error propagation */
switch (sk->sk_state) {
case TCP_SYN_SENT:
sk->sk_err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
sk->sk_err = EPIPE;
break;
case TCP_CLOSE:
return;
default:
sk->sk_err = ECONNRESET;
}
inet_sk_state_store(sk, TCP_CLOSE);
sk->sk_shutdown = SHUTDOWN_MASK;
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
mptcp_close_wake_up(sk);
/* the calling mptcp_worker will properly destroy the socket */
if (sock_flag(sk, SOCK_DEAD))
return;
sk->sk_state_change(sk);
sk_error_report(sk);
}
static void __mptcp_retrans(struct sock *sk)
......@@ -2552,6 +2583,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
mptcp_reset_timeout(msk, 0);
}
static void mptcp_do_fastclose(struct sock *sk)
{
struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
mptcp_for_each_subflow_safe(msk, subflow, tmp)
__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
subflow, MPTCP_CF_FASTCLOSE);
}
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
......@@ -2580,12 +2621,16 @@ static void mptcp_worker(struct work_struct *work)
* closed, but we need the msk around to reply to incoming DATA_FIN,
* even if it is orphaned and in FIN_WAIT2 state
*/
if (sock_flag(sk, SOCK_DEAD) &&
(mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) {
if (sock_flag(sk, SOCK_DEAD)) {
if (mptcp_check_close_timeout(sk)) {
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_do_fastclose(sk);
}
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
goto unlock;
}
}
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
__mptcp_close_subflow(msk);
......@@ -2825,6 +2870,18 @@ static void __mptcp_destroy_sock(struct sock *sk)
sock_put(sk);
}
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
/* Concurrent splices from sk_receive_queue into receive_queue will
* always show at least one non-empty queue when checked in this order.
*/
if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
skb_queue_empty_lockless(&msk->receive_queue))
return 0;
return EPOLLIN | EPOLLRDNORM;
}
bool __mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
......@@ -2838,8 +2895,13 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
if (mptcp_close_state(sk))
if (mptcp_check_readable(msk)) {
/* the msk has unread data pending, do the MPTCP equivalent of a TCP reset */
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_do_fastclose(sk);
} else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
}
sk_stream_wait_close(sk, timeout);
......@@ -3656,18 +3718,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
return err;
}
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
/* Concurrent splices from sk_receive_queue into receive_queue will
* always show at least one non-empty queue when checked in this order.
*/
if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
skb_queue_empty_lockless(&msk->receive_queue))
return 0;
return EPOLLIN | EPOLLRDNORM;
}
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
......@@ -3718,7 +3768,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
/* This barrier is coupled with smp_wmb() in tcp_reset() */
/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
if (sk->sk_err)
mask |= EPOLLERR;
......
......@@ -72,6 +72,8 @@ static int cfg_wait;
static uint32_t cfg_mark;
static char *cfg_input;
static int cfg_repeat = 1;
static int cfg_truncate;
static int cfg_rcv_trunc;
struct cfg_cmsg_types {
unsigned int cmsg_enabled:1;
......@@ -95,11 +97,15 @@ static struct cfg_sockopt_types cfg_sockopt_types;
static void die_usage(void)
{
fprintf(stderr, "Usage: mptcp_connect [-6] [-c cmsg] [-i file] [-I num] [-j] [-l] "
fprintf(stderr, "Usage: mptcp_connect [-6] [-c cmsg] [-f offset] [-i file] [-I num] [-j] [-l] "
"[-m mode] [-M mark] [-o option] [-p port] [-P mode] [-j] [-l] [-r num] "
"[-s MPTCP|TCP] [-S num] [-r num] [-t num] [-T num] [-u] [-w sec] connect_address\n");
fprintf(stderr, "\t-6 use ipv6\n");
fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
fprintf(stderr, "\t-f offset -- stop the I/O after receiving and sending the specified amount "
"of bytes. If there are unread bytes in the receive queue, that will cause a MPTCP "
"fastclose at close/shutdown. If offset is negative, expect the peer to close before "
"all the local data as been sent, thus toleration errors on write and EPIPE signals\n");
fprintf(stderr, "\t-i file -- read the data to send from the given file instead of stdin");
fprintf(stderr, "\t-I num -- repeat the transfer 'num' times. In listen mode accepts num "
"incoming connections, in client mode, disconnect and reconnect to the server\n");
......@@ -382,7 +388,7 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len)
bw = write(fd, buf, do_w);
if (bw < 0)
perror("write");
return bw;
/* let the join handshake complete, before going on */
if (cfg_join && first) {
......@@ -571,7 +577,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
.fd = peerfd,
.events = POLLIN | POLLOUT,
};
unsigned int woff = 0, wlen = 0;
unsigned int woff = 0, wlen = 0, total_wlen = 0, total_rlen = 0;
char wbuf[8192];
set_nonblock(peerfd, true);
......@@ -597,7 +603,16 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
}
if (fds.revents & POLLIN) {
ssize_t rb = sizeof(rbuf);
/* limit the total amount of read data to the trunc value */
if (cfg_truncate > 0) {
if (rb + total_rlen > cfg_truncate)
rb = cfg_truncate - total_rlen;
len = read(peerfd, rbuf, rb);
} else {
len = do_rnd_read(peerfd, rbuf, sizeof(rbuf));
}
if (len == 0) {
/* no more data to receive:
* peer has closed its write side
......@@ -612,10 +627,13 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
/* Else, still have data to transmit */
} else if (len < 0) {
if (cfg_rcv_trunc)
return 0;
perror("read");
return 3;
}
total_rlen += len;
do_write(outfd, rbuf, len);
}
......@@ -628,12 +646,21 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
if (wlen > 0) {
ssize_t bw;
/* limit the total amount of written data to the trunc value */
if (cfg_truncate > 0 && wlen + total_wlen > cfg_truncate)
wlen = cfg_truncate - total_wlen;
bw = do_rnd_write(peerfd, wbuf + woff, wlen);
if (bw < 0)
if (bw < 0) {
if (cfg_rcv_trunc)
return 0;
perror("write");
return 111;
}
woff += bw;
wlen -= bw;
total_wlen += bw;
} else if (wlen == 0) {
/* We have no more data to send. */
fds.events &= ~POLLOUT;
......@@ -652,10 +679,16 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
}
if (fds.revents & (POLLERR | POLLNVAL)) {
if (cfg_rcv_trunc)
return 0;
fprintf(stderr, "Unexpected revents: "
"POLLERR/POLLNVAL(%x)\n", fds.revents);
return 5;
}
if (cfg_truncate > 0 && total_wlen >= cfg_truncate &&
total_rlen >= cfg_truncate)
break;
}
/* leave some time for late join/announce */
......@@ -1160,11 +1193,13 @@ int main_loop(void)
}
/* close the client socket open only if we are not going to reconnect */
ret = copyfd_io(fd_in, fd, 1, cfg_repeat == 1);
ret = copyfd_io(fd_in, fd, 1, 0);
if (ret)
return ret;
if (--cfg_repeat > 0) {
if (cfg_truncate > 0) {
xdisconnect(fd, peer->ai_addrlen);
} else if (--cfg_repeat > 0) {
xdisconnect(fd, peer->ai_addrlen);
/* the socket could be unblocking at this point, we need the
......@@ -1176,7 +1211,10 @@ int main_loop(void)
if (cfg_input)
close(fd_in);
goto again;
} else {
close(fd);
}
return 0;
}
......@@ -1262,8 +1300,19 @@ static void parse_opts(int argc, char **argv)
{
int c;
while ((c = getopt(argc, argv, "6c:hi:I:jlm:M:o:p:P:r:R:s:S:t:T:w:")) != -1) {
while ((c = getopt(argc, argv, "6c:f:hi:I:jlm:M:o:p:P:r:R:s:S:t:T:w:")) != -1) {
switch (c) {
case 'f':
cfg_truncate = atoi(optarg);
/* when receiving a fastclose, ignore PIPE signals and
* all the I/O errors later in the code
*/
if (cfg_truncate < 0) {
cfg_rcv_trunc = true;
signal(SIGPIPE, handle_signal);
}
break;
case 'j':
cfg_join = true;
cfg_mode = CFG_MODE_POLL;
......
......@@ -346,10 +346,21 @@ check_transfer()
local in=$1
local out=$2
local what=$3
local bytes=$4
local i a b
local line
cmp -l "$in" "$out" | while read -r i a b; do
if [ -n "$bytes" ]; then
# when truncating we must check the size explicitly
local out_size=$(wc -c $out | awk '{print $1}')
if [ $out_size -ne $bytes ]; then
echo "[ FAIL ] $what output file has wrong size ($out_size, $bytes)"
fail_test
return 1
fi
bytes="--bytes=${bytes}"
fi
cmp -l "$in" "$out" ${bytes} | while read -r i a b; do
local sum=$((0${a} + 0${b}))
if [ $check_invert -eq 0 ] || [ $sum -ne $((0xff)) ]; then
echo "[ FAIL ] $what does not match (in, out):"
......@@ -707,9 +718,31 @@ do_transfer()
fi
local flags="subflow"
local extra_cl_args=""
local extra_srv_args=""
local trunc_size=""
if [[ "${addr_nr_ns2}" = "fastclose_"* ]]; then
if [ ${test_link_fail} -le 1 ]; then
echo "fastclose tests need test_link_fail argument"
fail_test
return 1
fi
# disconnect
extra_args="$extra_args -I ${addr_nr_ns2:10}"
trunc_size=${test_link_fail}
local side=${addr_nr_ns2:10}
if [ ${side} = "client" ]; then
extra_cl_args="-f ${test_link_fail}"
extra_srv_args="-f -1"
elif [ ${side} = "server" ]; then
extra_srv_args="-f ${test_link_fail}"
extra_cl_args="-f -1"
else
echo "wrong/unknown fastclose spec ${side}"
fail_test
return 1
fi
addr_nr_ns2=0
elif [[ "${addr_nr_ns2}" = "userspace_"* ]]; then
userspace_pm=1
......@@ -737,39 +770,41 @@ do_transfer()
local_addr="0.0.0.0"
fi
extra_srv_args="$extra_args $extra_srv_args"
if [ "$test_link_fail" -gt 1 ];then
timeout ${timeout_test} \
ip netns exec ${listener_ns} \
./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
$extra_args ${local_addr} < "$sinfail" > "$sout" &
$extra_srv_args ${local_addr} < "$sinfail" > "$sout" &
else
timeout ${timeout_test} \
ip netns exec ${listener_ns} \
./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
$extra_args ${local_addr} < "$sin" > "$sout" &
$extra_srv_args ${local_addr} < "$sin" > "$sout" &
fi
local spid=$!
wait_local_port_listen "${listener_ns}" "${port}"
extra_cl_args="$extra_args $extra_cl_args"
if [ "$test_link_fail" -eq 0 ];then
timeout ${timeout_test} \
ip netns exec ${connector_ns} \
./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
$extra_args $connect_addr < "$cin" > "$cout" &
$extra_cl_args $connect_addr < "$cin" > "$cout" &
elif [ "$test_link_fail" -eq 1 ] || [ "$test_link_fail" -eq 2 ];then
( cat "$cinfail" ; sleep 2; link_failure $listener_ns ; cat "$cinfail" ) | \
tee "$cinsent" | \
timeout ${timeout_test} \
ip netns exec ${connector_ns} \
./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
$extra_args $connect_addr > "$cout" &
$extra_cl_args $connect_addr > "$cout" &
else
tee "$cinsent" < "$cinfail" | \
timeout ${timeout_test} \
ip netns exec ${connector_ns} \
./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
$extra_args $connect_addr > "$cout" &
$extra_cl_args $connect_addr > "$cout" &
fi
local cpid=$!
......@@ -971,15 +1006,15 @@ do_transfer()
fi
if [ "$test_link_fail" -gt 1 ];then
check_transfer $sinfail $cout "file received by client"
check_transfer $sinfail $cout "file received by client" $trunc_size
else
check_transfer $sin $cout "file received by client"
check_transfer $sin $cout "file received by client" $trunc_size
fi
retc=$?
if [ "$test_link_fail" -eq 0 ];then
check_transfer $cin $sout "file received by server"
check_transfer $cin $sout "file received by server" $trunc_size
else
check_transfer $cinsent $sout "file received by server"
check_transfer $cinsent $sout "file received by server" $trunc_size
fi
rets=$?
......@@ -1188,12 +1223,23 @@ chk_fclose_nr()
{
local fclose_tx=$1
local fclose_rx=$2
local ns_invert=$3
local count
local dump_stats
local ns_tx=$ns2
local ns_rx=$ns1
local extra_msg=" "
if [[ $ns_invert = "invert" ]]; then
ns_tx=$ns1
ns_rx=$ns2
extra_msg=${extra_msg}"invert"
fi
printf "%-${nr_blank}s %s" " " "ctx"
count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtMPFastcloseTx | awk '{print $2}')
count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPFastcloseTx | awk '{print $2}')
[ -z "$count" ] && count=0
[ "$count" != "$fclose_tx" ] && extra_msg="$extra_msg,tx=$count"
if [ "$count" != "$fclose_tx" ]; then
echo "[fail] got $count MP_FASTCLOSE[s] TX expected $fclose_tx"
fail_test
......@@ -1203,17 +1249,20 @@ chk_fclose_nr()
fi
echo -n " - fclzrx"
count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtMPFastcloseRx | awk '{print $2}')
count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPFastcloseRx | awk '{print $2}')
[ -z "$count" ] && count=0
[ "$count" != "$fclose_rx" ] && extra_msg="$extra_msg,rx=$count"
if [ "$count" != "$fclose_rx" ]; then
echo "[fail] got $count MP_FASTCLOSE[s] RX expected $fclose_rx"
fail_test
dump_stats=1
else
echo "[ ok ]"
echo -n "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
echo "$extra_msg"
}
chk_rst_nr()
......@@ -1236,7 +1285,7 @@ chk_rst_nr()
printf "%-${nr_blank}s %s" " " "rtx"
count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPRstTx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$rst_tx" ]; then
if [ $count -lt $rst_tx ]; then
echo "[fail] got $count MP_RST[s] TX expected $rst_tx"
fail_test
dump_stats=1
......@@ -1247,7 +1296,7 @@ chk_rst_nr()
echo -n " - rstrx "
count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPRstRx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$rst_rx" ]; then
if [ "$count" -lt "$rst_rx" ]; then
echo "[fail] got $count MP_RST[s] RX expected $rst_rx"
fail_test
dump_stats=1
......@@ -2801,11 +2850,18 @@ fullmesh_tests()
fastclose_tests()
{
if reset "fastclose test"; then
run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_2
run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_client
chk_join_nr 0 0 0
chk_fclose_nr 1 1
chk_rst_nr 1 1 invert
fi
if reset "fastclose server test"; then
run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_server
chk_join_nr 0 0 0
chk_fclose_nr 1 1 invert
chk_rst_nr 1 1
fi
}
pedit_action_pkts()
......