Commit 93e61613 authored by David S. Miller

Merge branch 'bind_addr_zero'

Kuniyuki Iwashima says:

====================
Improve bind(addr, 0) behaviour.

Currently we fail to bind sockets to ephemeral ports when all of the ports
are exhausted even if all sockets have SO_REUSEADDR enabled. In this case,
we still have a chance to connect to the different remote hosts.

These patches add net.ipv4.ip_autobind_reuse option and fix the behaviour
to fully utilize all space of the local (addr, port) tuples.

Changes in v5:
  - Add more description to documents.
  - Fix sysctl option to use proc_dointvec_minmax.
  - Remove the Fixes: tag and squash two commits.

Changes in v4:
  - Add net.ipv4.ip_autobind_reuse option to not change the current behaviour.
  - Modify .gitignore for test.
  https://lore.kernel.org/netdev/20200308181615.90135-1-kuniyu@amazon.co.jp/

Changes in v3:
  - Change the title and write more specific description of the 3rd patch.
  - Add a test in tools/testing/selftests/net/ as the 4th patch.
  https://lore.kernel.org/netdev/20200229113554.78338-1-kuniyu@amazon.co.jp/

Changes in v2:
  - Change the description of the 2nd patch ('localhost' -> 'address').
  - Correct the description and the if statement of the 3rd patch.
  https://lore.kernel.org/netdev/20200226074631.67688-1-kuniyu@amazon.co.jp/

v1 with tests:
  https://lore.kernel.org/netdev/20200220152020.13056-1-kuniyu@amazon.co.jp/
====================
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents af91fd7e 7f204a7d
...@@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN ...@@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN
which can be quite useful - but may break some applications. which can be quite useful - but may break some applications.
Default: 0 Default: 0
ip_autobind_reuse - BOOLEAN
By default, bind() does not select the ports automatically even if
the new socket and all sockets bound to the port have SO_REUSEADDR.
ip_autobind_reuse allows bind() to reuse the port and this is useful
when you use bind()+connect(), but may break some applications.
The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
option should only be set by experts.
Default: 0
ip_dynaddr - BOOLEAN ip_dynaddr - BOOLEAN
If set non-zero, enables support for dynamic addresses. If set non-zero, enables support for dynamic addresses.
If set to a non-zero value larger than 1, a kernel log If set to a non-zero value larger than 1, a kernel log
......
...@@ -101,6 +101,7 @@ struct netns_ipv4 { ...@@ -101,6 +101,7 @@ struct netns_ipv4 {
int sysctl_ip_fwd_use_pmtu; int sysctl_ip_fwd_use_pmtu;
int sysctl_ip_fwd_update_priority; int sysctl_ip_fwd_update_priority;
int sysctl_ip_nonlocal_bind; int sysctl_ip_nonlocal_bind;
int sysctl_ip_autobind_reuse;
/* Shall we try to damage output packets if routing dev changes? */ /* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr; int sysctl_ip_dynaddr;
int sysctl_ip_early_demux; int sysctl_ip_early_demux;
......
...@@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk, ...@@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
{ {
struct sock *sk2; struct sock *sk2;
bool reuse = sk->sk_reuse; bool reuse = sk->sk_reuse;
bool reuseport = !!sk->sk_reuseport && reuseport_ok; bool reuseport = !!sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk); kuid_t uid = sock_i_uid((struct sock *)sk);
/* /*
...@@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk, ...@@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if || (!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
if ((!reuse || !sk2->sk_reuse || if (reuse && sk2->sk_reuse &&
sk2->sk_state == TCP_LISTEN) && sk2->sk_state != TCP_LISTEN) {
(!reuseport || !sk2->sk_reuseport || if ((!relax ||
(!reuseport_ok &&
reuseport && sk2->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
(sk2->sk_state == TCP_TIME_WAIT ||
uid_eq(uid, sock_i_uid(sk2))))) &&
inet_rcv_saddr_equal(sk, sk2, true))
break;
} else if (!reuseport_ok ||
!reuseport || !sk2->sk_reuseport ||
rcu_access_pointer(sk->sk_reuseport_cb) || rcu_access_pointer(sk->sk_reuseport_cb) ||
(sk2->sk_state != TCP_TIME_WAIT && (sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) { !uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
if (inet_rcv_saddr_equal(sk, sk2, true)) if (inet_rcv_saddr_equal(sk, sk2, true))
break; break;
} }
...@@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * ...@@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int port = 0; int port = 0;
struct inet_bind_hashbucket *head; struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
bool relax = false;
int i, low, high, attempt_half; int i, low, high, attempt_half;
struct inet_bind_bucket *tb; struct inet_bind_bucket *tb;
u32 remaining, offset; u32 remaining, offset;
int l3mdev; int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk); l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan: other_half_scan:
inet_get_local_port_range(net, &low, &high); inet_get_local_port_range(net, &low, &high);
...@@ -219,7 +225,7 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * ...@@ -219,7 +225,7 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
inet_bind_bucket_for_each(tb, &head->chain) inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) { tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, false)) if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success; goto success;
goto next_port; goto next_port;
} }
...@@ -239,6 +245,12 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * ...@@ -239,6 +245,12 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
attempt_half = 2; attempt_half = 2;
goto other_half_scan; goto other_half_scan;
} }
if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
/* We still have a chance to connect to different destinations */
relax = true;
goto ports_exhausted;
}
return NULL; return NULL;
success: success:
*port_ret = port; *port_ret = port;
......
...@@ -763,6 +763,15 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -763,6 +763,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "ip_autobind_reuse",
.data = &init_net.ipv4.sysctl_ip_autobind_reuse,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{ {
.procname = "fwmark_reflect", .procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect, .data = &init_net.ipv4.sysctl_fwmark_reflect,
......
...@@ -23,3 +23,4 @@ so_txtime ...@@ -23,3 +23,4 @@ so_txtime
tcp_fastopen_backup_key tcp_fastopen_backup_key
nettest nettest
fin_ack_lat fin_ack_lat
reuseaddr_ports_exhausted
\ No newline at end of file
...@@ -12,6 +12,7 @@ TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_a ...@@ -12,6 +12,7 @@ TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_a
TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh
TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh
TEST_PROGS += fin_ack_lat.sh TEST_PROGS += fin_ack_lat.sh
TEST_PROGS += reuseaddr_ports_exhausted.sh
TEST_PROGS_EXTENDED := in_netns.sh TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
...@@ -22,6 +23,7 @@ TEST_GEN_FILES += tcp_fastopen_backup_key ...@@ -22,6 +23,7 @@ TEST_GEN_FILES += tcp_fastopen_backup_key
TEST_GEN_FILES += fin_ack_lat TEST_GEN_FILES += fin_ack_lat
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
TEST_GEN_FILES += reuseaddr_ports_exhausted
KSFT_KHDR_INSTALL := 1 KSFT_KHDR_INSTALL := 1
include ../lib.mk include ../lib.mk
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Check if we can fully utilize 4-tuples for connect().
*
* Rules to bind sockets to the same port when all ephemeral ports are
* exhausted.
*
* 1. if there are TCP_LISTEN sockets on the port, fail to bind.
* 2. if there are sockets without SO_REUSEADDR, fail to bind.
* 3. if SO_REUSEADDR is disabled, fail to bind.
* 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled,
* succeed to bind.
* 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and
* there is no socket having both options and the same EUID,
* succeed to bind.
* 6. fail to bind.
*
* Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
*/
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include "../kselftest_harness.h"
/*
 * Socket option values for a pair of sockets that are bound one after the
 * other. Index 0 holds the values given to the first socket, index 1 the
 * values given to the second one.
 */
struct reuse_opts {
	int reuseaddr[2];
	int reuseport[2];
};

/*
 * Every combination in which at least one socket of the pair has
 * SO_REUSEADDR disabled.
 */
struct reuse_opts unreusable_opts[12] = {
	{ .reuseaddr = {0, 0}, .reuseport = {0, 0} },
	{ .reuseaddr = {0, 0}, .reuseport = {0, 1} },
	{ .reuseaddr = {0, 0}, .reuseport = {1, 0} },
	{ .reuseaddr = {0, 0}, .reuseport = {1, 1} },
	{ .reuseaddr = {0, 1}, .reuseport = {0, 0} },
	{ .reuseaddr = {0, 1}, .reuseport = {0, 1} },
	{ .reuseaddr = {0, 1}, .reuseport = {1, 0} },
	{ .reuseaddr = {0, 1}, .reuseport = {1, 1} },
	{ .reuseaddr = {1, 0}, .reuseport = {0, 0} },
	{ .reuseaddr = {1, 0}, .reuseport = {0, 1} },
	{ .reuseaddr = {1, 0}, .reuseport = {1, 0} },
	{ .reuseaddr = {1, 0}, .reuseport = {1, 1} },
};

/*
 * Every combination in which both sockets of the pair have SO_REUSEADDR
 * enabled; only the SO_REUSEPORT values vary.
 */
struct reuse_opts reusable_opts[4] = {
	{ .reuseaddr = {1, 1}, .reuseport = {0, 0} },
	{ .reuseaddr = {1, 1}, .reuseport = {0, 1} },
	{ .reuseaddr = {1, 1}, .reuseport = {1, 0} },
	{ .reuseaddr = {1, 1}, .reuseport = {1, 1} },
};
/*
 * Create a TCP socket, apply the requested SO_REUSEADDR / SO_REUSEPORT
 * values and bind it to 127.0.0.1 with port 0.
 *
 * _metadata: harness test context; not referenced directly here, presumably
 *            consumed by the ASSERT_*() macros.
 * reuseaddr: value passed to setsockopt(SO_REUSEADDR).
 * reuseport: value passed to setsockopt(SO_REUSEPORT).
 *
 * Returns the bound socket descriptor, or -1 when bind() fails (the socket
 * is closed before returning). A failing socket() or setsockopt() trips an
 * assertion instead of being reported through the return value.
 */
int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport)
{
	/*
	 * The designated initializer zeroes every member that is not named,
	 * so no indeterminate stack bytes (sin_zero) are handed to bind().
	 */
	struct sockaddr_in local_addr = {
		.sin_family = AF_INET,
		.sin_port = 0,
		.sin_addr.s_addr = inet_addr("127.0.0.1"),
	};
	socklen_t len = sizeof(local_addr);
	int fd, ret;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	ASSERT_NE(-1, fd) TH_LOG("failed to open socket.");

	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(reuseaddr));
	ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR.");

	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(reuseport));
	ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT.");

	if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) {
		/* Do not leak the descriptor when the bind is rejected. */
		close(fd);
		return -1;
	}

	return fd;
}
/*
 * For each option pair in unreusable_opts, bind two sockets one after the
 * other and check that the first bind succeeds while the second is refused.
 */
TEST(reuseaddr_ports_exhausted_unreusable)
{
	int fd[2];
	int idx, k;

	for (idx = 0; idx < 12; idx++) {
		struct reuse_opts *cur = &unreusable_opts[idx];

		/* Both binds are attempted before any result is checked. */
		fd[0] = bind_port(_metadata, cur->reuseaddr[0], cur->reuseport[0]);
		fd[1] = bind_port(_metadata, cur->reuseaddr[1], cur->reuseport[1]);

		ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
		EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind.");

		/* Release whatever was bound before trying the next pair. */
		for (k = 0; k < 2; k++) {
			if (fd[k] == -1)
				continue;
			close(fd[k]);
		}
	}
}
/*
 * For each option pair in reusable_opts, bind two sockets without changing
 * the effective uid in between. The second bind is expected to fail only
 * when both sockets have SO_REUSEPORT set, and to succeed otherwise.
 */
TEST(reuseaddr_ports_exhausted_reusable_same_euid)
{
	int fd[2];
	int idx, k;

	for (idx = 0; idx < 4; idx++) {
		struct reuse_opts *cur = &reusable_opts[idx];

		/* Both binds are attempted before any result is checked. */
		fd[0] = bind_port(_metadata, cur->reuseaddr[0], cur->reuseport[0]);
		fd[1] = bind_port(_metadata, cur->reuseaddr[1], cur->reuseport[1]);

		ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");

		if (!cur->reuseport[0] || !cur->reuseport[1]) {
			EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations.");
		} else {
			EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened.");
		}

		/* Release whatever was bound before trying the next pair. */
		for (k = 0; k < 2; k++) {
			if (fd[k] == -1)
				continue;
			close(fd[k]);
		}
	}
}
/*
 * For each option pair in reusable_opts, bind the two sockets under two
 * different effective uids (10 and 20). The second bind is expected to
 * succeed; afterwards listen() is expected to succeed on the first socket
 * and to fail on the second one.
 *
 * The test switches the effective uid with seteuid() and back to 0, and
 * asserts if any of those calls fails.
 */
TEST(reuseaddr_ports_exhausted_reusable_different_euid)
{
	struct reuse_opts *opts;
	int i, j, ret, fd[2];
	uid_t euid[2] = {10, 20};

	for (i = 0; i < 4; i++) {
		opts = &reusable_opts[i];

		for (j = 0; j < 2; j++) {
			ret = seteuid(euid[j]);
			/* uid_t is unsigned: print it with %u, not %d. */
			ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %u.", (unsigned int)euid[j]);

			fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);

			/* Restore euid 0 so the next seteuid() call is permitted. */
			ret = seteuid(0);
			ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0.");
		}

		ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
		EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid.");

		/* The listen() checks only make sense when both binds succeeded. */
		if (fd[1] != -1) {
			ret = listen(fd[0], 5);
			ASSERT_EQ(0, ret) TH_LOG("failed to listen.");

			ret = listen(fd[1], 5);
			EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN.");
		}

		for (j = 0; j < 2; j++)
			if (fd[j] != -1)
				close(fd[j]);
	}
}
TEST_HARNESS_MAIN
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Run tests when all ephemeral ports are exhausted.
#
# Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
set +x
set -e

# Name of the scratch network namespace used by this run.
# Assign first and mark readonly afterwards: in `readonly VAR="$(cmd)"` the
# exit status of cmd is replaced by that of `readonly`, so `set -e` would not
# notice a failing mktemp.
NETNS="ns-$(mktemp -u XXXXXX)"
readonly NETNS
# Create the test namespace, bring its loopback device up and, inside the
# namespace, restrict the ephemeral port range to the single port 32768 and
# enable net.ipv4.ip_autobind_reuse. sysctl output is discarded.
setup() {
	local setting

	ip netns add "$NETNS"
	ip -netns "$NETNS" link set lo up

	for setting in \
		'net.ipv4.ip_local_port_range=32768 32768' \
		'net.ipv4.ip_autobind_reuse=1'
	do
		ip netns exec "$NETNS" sysctl -w "$setting" > /dev/null 2>&1
	done
}
# Delete the network namespace named by NETNS.
cleanup()
{
	ip netns del "$NETNS"
}
# Register the teardown before building the namespace so it runs however the
# script exits.
trap cleanup EXIT
setup

# Run the selftest binary from the current directory inside the namespace.
do_test()
{
	ip netns exec "$NETNS" ./reuseaddr_ports_exhausted
}

do_test
echo "tests done"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment