Commit 358105ab authored by Jakub Kicinski

Merge branch 'tcp-dccp-refine-source-port-selection'

Eric Dumazet says:

====================
tcp/dccp: refine source port selection

This patch series leverages the IP_LOCAL_PORT_RANGE option
so that connect() no longer favors even source ports when selecting one.

This should lower the time taken by connect() for hosts having
many active connections to the same destination.
====================

Link: https://lore.kernel.org/r/20231214192939.1962891-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 758a8d5b 20718485
......@@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in
*low = range & 0xffff;
*high = range >> 16;
}
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
......
......@@ -117,16 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
/**
* inet_sk_get_local_port_range - fetch ephemeral ports range
* @sk: socket
* @low: pointer to low port
* @high: pointer to high port
*
* Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
* Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
* Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
*/
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
const struct inet_sock *inet = inet_sk(sk);
const struct net *net = sock_net(sk);
int lo, hi, sk_lo, sk_hi;
bool local_range = false;
u32 sk_range;
inet_get_local_port_range(net, &lo, &hi);
inet_get_local_port_range(sock_net(sk), &lo, &hi);
sk_range = READ_ONCE(inet->local_port_range);
sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
if (unlikely(sk_range)) {
sk_lo = sk_range & 0xffff;
sk_hi = sk_range >> 16;
......@@ -135,10 +144,12 @@ void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
lo = sk_lo;
if (lo <= sk_hi && sk_hi <= hi)
hi = sk_hi;
local_range = true;
}
*low = lo;
*high = hi;
return local_range;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);
......
......@@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
bool tb_created = false;
u32 remaining, offset;
int ret, i, low, high;
int l3mdev;
bool local_ports;
int step, l3mdev;
u32 index;
if (port) {
......@@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
l3mdev = inet_sk_bound_l3mdev(sk);
inet_sk_get_local_port_range(sk, &low, &high);
local_ports = inet_sk_get_local_port_range(sk, &low, &high);
step = local_ports ? 1 : 2;
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
if (likely(remaining > 1))
if (!local_ports && remaining > 1)
remaining &= ~1U;
get_random_sleepable_once(table_perturb,
......@@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
offset &= ~1U;
if (!local_ports)
offset &= ~1U;
other_parity_scan:
port = low + offset;
for (i = 0; i < remaining; i += 2, port += 2) {
for (i = 0; i < remaining; i += step, port += step) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
......@@ -1083,10 +1087,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
cond_resched();
}
offset++;
if ((offset & 1) && remaining > 1)
goto other_parity_scan;
if (!local_ports) {
offset++;
if ((offset & 1) && remaining > 1)
goto other_parity_scan;
}
return -EADDRNOTAVAIL;
ok:
......@@ -1109,8 +1114,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* on low contention the randomness is maximal and on high contention
* it may be inexistent.
*/
i = max_t(int, i, get_random_u32_below(8) * 2);
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
i = max_t(int, i, get_random_u32_below(8) * step);
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment