Commit e51271d4 authored by David S. Miller

Merge branch 'tcp_dccp_ports'

Eric Dumazet says:

====================
tcp/dccp: better use of ephemeral ports

Big servers have a bloated bind table, making it very hard for ephemeral
port allocations to succeed without special container/namespace tricks.

This patch series extends the strategy added in commit 07f4c900
("tcp/dccp: try to not exhaust ip_local_port_range in connect()").

Since ports used by connect() are much more likely to be shared among them,
we give a hint to both bind() and connect() to keep the crowds separated
if possible.

Of course, if on a specific host an application needs to allocate ~30000
ports using bind(), it will still be able to do so. The same holds for ~30000
connect() calls to a unique 2-tuple (dst addr, dst port).

The new implementation is also more friendly to softirqs and reschedules.

v2: rebase after TCP SO_REUSEPORT changes
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3134b9f0 ea8add2b
...@@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); ...@@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
/* Obtain a reference to a local port for the given sock, /* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port. * if snum is zero it means select any available local port.
* We try to allocate an odd port (and leave even ports for connect())
*/ */
int inet_csk_get_port(struct sock *sk, unsigned short snum) int inet_csk_get_port(struct sock *sk, unsigned short snum)
{ {
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, attempts = 5, port = snum;
int smallest_size = -1, smallest_port;
struct inet_bind_hashbucket *head; struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover; int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
kuid_t uid = sock_i_uid(sk); kuid_t uid = sock_i_uid(sk);
int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; u32 remaining, offset;
local_bh_disable(); if (port) {
if (!snum) { have_port:
int remaining, rover, low, high; head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port)
goto tb_found;
goto tb_not_found;
}
again: again:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high); inet_get_local_port_range(net, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
if (high - low < 4)
attempt_half = 0;
if (attempt_half) { if (attempt_half) {
int half = low + ((high - low) >> 1); int half = low + (((high - low) >> 2) << 1);
if (attempt_half == 1) if (attempt_half == 1)
high = half; high = half;
else else
low = half; low = half;
} }
remaining = (high - low) + 1; remaining = high - low;
smallest_rover = rover = prandom_u32() % remaining + low; if (likely(remaining > 1))
remaining &= ~1U;
offset = prandom_u32() % remaining;
/* __inet_hash_connect() favors ports having @low parity
* We do the opposite to not pollute connect() users.
*/
offset |= 1U;
smallest_size = -1; smallest_size = -1;
do { smallest_port = low; /* avoid compiler warning */
if (inet_is_local_reserved_port(net, rover))
goto next_nolock; other_parity_scan:
head = &hashinfo->bhash[inet_bhashfn(net, rover, port = low + offset;
hashinfo->bhash_size)]; for (i = 0; i < remaining; i += 2, port += 2) {
spin_lock(&head->lock); if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain) inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) { if (net_eq(ib_net(tb), net) && tb->port == port) {
if (((tb->fastreuse > 0 && if (((tb->fastreuse > 0 && reuse) ||
sk->sk_reuse &&
sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 && (tb->fastreuseport > 0 &&
sk->sk_reuseport && sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) && !rcu_access_pointer(sk->sk_reuseport_cb) &&
uid_eq(tb->fastuid, uid))) && uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) { (tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners; smallest_size = tb->num_owners;
smallest_rover = rover; smallest_port = port;
} }
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
snum = rover;
goto tb_found; goto tb_found;
goto next_port;
} }
goto next; goto tb_not_found;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
} }
break;
next:
spin_unlock(&head->lock);
next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0) {
if (smallest_size != -1) { if (smallest_size != -1) {
snum = smallest_rover; port = smallest_port;
goto have_snum; goto have_port;
} }
offset--;
if (!(offset & 1))
goto other_parity_scan;
if (attempt_half == 1) { if (attempt_half == 1) {
/* OK we now try the upper half of the range */ /* OK we now try the upper half of the range */
attempt_half = 2; attempt_half = 2;
goto again; goto other_half_scan;
}
goto fail;
}
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
} else {
have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
} }
tb = NULL; return ret;
goto tb_not_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
if (!tb)
goto fail_unlock;
tb_found: tb_found:
if (!hlist_empty(&tb->owners)) { if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE) if (sk->sk_reuse == SK_FORCE_REUSE)
goto success; goto success;
if (((tb->fastreuse > 0 && if (((tb->fastreuse > 0 && reuse) ||
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 && (tb->fastreuseport > 0 &&
sk->sk_reuseport && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
!rcu_access_pointer(sk->sk_reuseport_cb) && smallest_size == -1)
uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
goto success; goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || if ((reuse ||
(tb->fastreuseport > 0 && (tb->fastreuseport > 0 &&
sk->sk_reuseport && sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) && !rcu_access_pointer(sk->sk_reuseport_cb) &&
uid_eq(tb->fastuid, uid))) && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) { smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock); spin_unlock_bh(&head->lock);
goto again; goto again;
} }
goto fail_unlock; goto fail_unlock;
} }
} if (!reuse)
}
tb_not_found:
ret = 1;
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
else
tb->fastreuse = 0; tb->fastreuse = 0;
if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
tb->fastreuseport = 0;
} else {
tb->fastreuse = reuse;
if (sk->sk_reuseport) { if (sk->sk_reuseport) {
tb->fastreuseport = 1; tb->fastreuseport = 1;
tb->fastuid = uid; tb->fastuid = uid;
} else
tb->fastreuseport = 0;
} else { } else {
if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
if (tb->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
tb->fastreuseport = 0; tb->fastreuseport = 0;
} }
}
success: success:
if (!inet_csk(sk)->icsk_bind_hash) if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum); inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0; ret = 0;
fail_unlock: fail_unlock:
spin_unlock(&head->lock); spin_unlock_bh(&head->lock);
fail:
local_bh_enable();
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(inet_csk_get_port); EXPORT_SYMBOL_GPL(inet_csk_get_port);
......
...@@ -565,43 +565,59 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -565,43 +565,59 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *, __u16, struct inet_timewait_sock **)) struct sock *, __u16, struct inet_timewait_sock **))
{ {
struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_hashinfo *hinfo = death_row->hashinfo;
const unsigned short snum = inet_sk(sk)->inet_num; struct inet_timewait_sock *tw = NULL;
struct inet_bind_hashbucket *head; struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb; int port = inet_sk(sk)->inet_num;
int ret;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct inet_bind_bucket *tb;
if (!snum) { u32 remaining, offset;
int i, remaining, low, high, port; int ret, i, low, high;
static u32 hint; static u32 hint;
u32 offset = hint + port_offset;
struct inet_timewait_sock *tw = NULL;
inet_get_local_port_range(net, &low, &high); if (port) {
remaining = (high - low) + 1; head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
inet_ehash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
}
spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
ret = check_established(death_row, sk, port, NULL);
local_bh_enable();
return ret;
}
/* By starting with offset being an even number, inet_get_local_port_range(net, &low, &high);
* we tend to leave about 50% of ports for other uses, high++; /* [32768, 60999] -> [32768, 61000[ */
* like bind(0). remaining = high - low;
if (likely(remaining > 1))
remaining &= ~1U;
offset = (hint + port_offset) % remaining;
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/ */
offset &= ~1; offset &= ~1U;
other_parity_scan:
local_bh_disable(); port = low + offset;
for (i = 0; i < remaining; i++) { for (i = 0; i < remaining; i += 2, port += 2) {
port = low + (i + offset) % remaining; if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port)) if (inet_is_local_reserved_port(net, port))
continue; continue;
head = &hinfo->bhash[inet_bhashfn(net, port, head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)]; hinfo->bhash_size)];
spin_lock(&head->lock); spin_lock_bh(&head->lock);
/* Does not bother with rcv_saddr checks, /* Does not bother with rcv_saddr checks, because
* because the established check is already * the established check is already unique enough.
* unique enough.
*/ */
inet_bind_bucket_for_each(tb, &head->chain) { inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), net) && if (net_eq(ib_net(tb), net) && tb->port == port) {
tb->port == port) {
if (tb->fastreuse >= 0 || if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0) tb->fastreuseport >= 0)
goto next_port; goto next_port;
...@@ -616,22 +632,25 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -616,22 +632,25 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port); net, head, port);
if (!tb) { if (!tb) {
spin_unlock(&head->lock); spin_unlock_bh(&head->lock);
break; return -ENOMEM;
} }
tb->fastreuse = -1; tb->fastreuse = -1;
tb->fastreuseport = -1; tb->fastreuseport = -1;
goto ok; goto ok;
next_port:
next_port: spin_unlock_bh(&head->lock);
spin_unlock(&head->lock); cond_resched();
} }
local_bh_enable();
offset++;
if ((offset & 1) && remaining > 1)
goto other_parity_scan;
return -EADDRNOTAVAIL; return -EADDRNOTAVAIL;
ok: ok:
hint += (i + 2) & ~1; hint += i + 2;
/* Head lock still held and bh's disabled */ /* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port); inet_bind_hash(sk, tb, port);
...@@ -642,29 +661,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -642,29 +661,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
if (tw) if (tw)
inet_twsk_bind_unhash(tw, hinfo); inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock); spin_unlock(&head->lock);
if (tw) if (tw)
inet_twsk_deschedule_put(tw); inet_twsk_deschedule_put(tw);
ret = 0;
goto out;
}
head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
inet_ehash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
ret = check_established(death_row, sk, snum, NULL);
out:
local_bh_enable(); local_bh_enable();
return ret; return 0;
}
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment