Commit e51271d4 authored by David S. Miller

Merge branch 'tcp_dccp_ports'

Eric Dumazet says:

====================
tcp/dccp: better use of ephemeral ports

Big servers have a bloated bind table, which makes it very hard for ephemeral
port allocations to succeed without special container/namespace tricks.

This patch series extends the strategy added in commit 07f4c900
("tcp/dccp: try to not exhaust ip_local_port_range in connect()").

Since ports used by connect() are much more likely to be shared among sockets,
we give a hint to both bind() and connect() to keep the two crowds separated
if possible, as sketched below.

Of course, if an application on a given host needs to allocate ~30000 ports
using bind(), it will still be able to do so. The same holds for ~30000 connect()
calls to a single 2-tuple (dst addr, dst port).

The new implementation is also friendlier to softirqs and reschedules.

v2: rebase after TCP SO_REUSEPORT changes
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3134b9f0 ea8add2b
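The core of both paths is the same parity-constrained scan of the ephemeral range: bind() starts from a random odd offset, connect() from an even offset derived from a per-destination hash, so the two populations rarely collide. Below is a minimal, self-contained userspace sketch of that scan; scan_ports_with_parity() and is_port_free() are hypothetical stand-ins for illustration, not kernel APIs. The parity applies to the offset from the bottom of the range, which for the default even low of 32768 means odd ports for bind() and even ports for connect().

/* Minimal model of the parity-split scan used by bind() and connect().
 * scan_ports_with_parity() and is_port_free() are hypothetical helpers
 * for illustration only; they are not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in for the real bind-bucket lookup: pretend every port is free. */
static bool is_port_free(int port)
{
	(void)port;
	return true;
}

/* Scan [low, high) starting at a random offset of the requested parity,
 * stepping by 2 with wraparound, the way the patched kernel walks the
 * ephemeral range.
 */
static int scan_ports_with_parity(int low, int high, bool odd_offset)
{
	unsigned int remaining = high - low;
	unsigned int offset, i;
	int port;

	if (remaining > 1)
		remaining &= ~1U;	/* even length keeps the parity stable */

	offset = (unsigned int)rand() % remaining;
	if (odd_offset)
		offset |= 1U;		/* bind() flavor */
	else
		offset &= ~1U;		/* connect() flavor */

	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (port >= high)
			port -= remaining;	/* wrap back to the bottom */
		if (is_port_free(port))
			return port;
	}
	return -1;	/* every port of this parity is taken */
}

int main(void)
{
	srand((unsigned int)time(NULL));
	/* Default ip_local_port_range, upper bound made exclusive. */
	printf("bind()-style pick:    %d\n", scan_ports_with_parity(32768, 61000, true));
	printf("connect()-style pick: %d\n", scan_ports_with_parity(32768, 61000, false));
	return 0;
}

The real code additionally prefers reusable buckets, remembers the least-crowded one, and drops the bucket lock between candidates so softirqs and reschedules are not starved, but the walk itself is the same.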
net/ipv4/inet_connection_sock.c
@@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);

The reworked inet_csk_get_port():

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, attempts = 5, port = snum;
	int smallest_size = -1, smallest_port;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	kuid_t uid = sock_i_uid(sk);
	u32 remaining, offset;

	if (port) {
have_port:
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port)
				goto tb_found;

		goto tb_not_found;
	}
again:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
	if (attempt_half) {
		int half = low + (((high - low) >> 2) << 1);

		if (attempt_half == 1)
			high = half;
		else
			low = half;
	}
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = prandom_u32() % remaining;
	/* __inet_hash_connect() favors ports having @low parity
	 * We do the opposite to not pollute connect() users.
	 */
	offset |= 1U;
	smallest_size = -1;
	smallest_port = low; /* avoid compiler warning */

other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (((tb->fastreuse > 0 && reuse) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport &&
				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
				      uid_eq(tb->fastuid, uid))) &&
				    (tb->num_owners < smallest_size || smallest_size == -1)) {
					smallest_size = tb->num_owners;
					smallest_port = port;
				}
				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
					goto tb_found;
				goto next_port;
			}
		goto tb_not_found;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	if (smallest_size != -1) {
		port = smallest_port;
		goto have_port;
	}
	offset--;
	if (!(offset & 1))
		goto other_parity_scan;

	if (attempt_half == 1) {
		/* OK we now try the upper half of the range */
		attempt_half = 2;
		goto other_half_scan;
	}
	return ret;

tb_not_found:
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port);
	if (!tb)
		goto fail_unlock;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if (((tb->fastreuse > 0 && reuse) ||
		     (tb->fastreuseport > 0 &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1)
			goto success;
		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
			if ((reuse ||
			     (tb->fastreuseport > 0 &&
			      sk->sk_reuseport &&
			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
			      uid_eq(tb->fastuid, uid))) &&
			    smallest_size != -1 && --attempts >= 0) {
				spin_unlock_bh(&head->lock);
				goto again;
			}
			goto fail_unlock;
		}
		if (!reuse)
			tb->fastreuse = 0;
		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
			tb->fastreuseport = 0;
	} else {
		tb->fastreuse = reuse;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else {
			tb->fastreuseport = 0;
		}
	}
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock_bh(&head->lock);
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);
...
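One bind()-side refinement carried over from the earlier strategy: a socket bound with SO_REUSEADDR (sk_reuse == SK_CAN_REUSE) first searches only the lower half of the ephemeral range (attempt_half == 1) and falls back to the upper half (attempt_half == 2) only if that fails. The sketch below reproduces just that range-splitting arithmetic for the default [32768, 61000) range; split_local_port_range() is a hypothetical helper, not a kernel function.

/* Sketch of the attempt_half range split; split_local_port_range() is a
 * hypothetical illustration helper, not a kernel function.
 */
#include <stdio.h>

static void split_local_port_range(int low, int high, int attempt_half,
				   int *out_low, int *out_high)
{
	/* Same arithmetic as the patched inet_csk_get_port(): (half - low)
	 * stays even, so each half still contains whole odd/even pairs.
	 */
	int half = low + (((high - low) >> 2) << 1);

	*out_low = low;
	*out_high = high;
	if (high - low < 4)		/* range too small to bother splitting */
		return;
	if (attempt_half == 1)
		*out_high = half;	/* first pass: lower half only */
	else if (attempt_half == 2)
		*out_low = half;	/* fallback: upper half only */
}

int main(void)
{
	int lo, hi;

	/* Default ip_local_port_range, upper bound made exclusive. */
	split_local_port_range(32768, 61000, 1, &lo, &hi);
	printf("attempt_half=1 -> [%d, %d)\n", lo, hi);	/* [32768, 46884) */
	split_local_port_range(32768, 61000, 2, &lo, &hi);
	printf("attempt_half=2 -> [%d, %d)\n", lo, hi);	/* [46884, 61000) */
	return 0;
}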
net/ipv4/inet_hashtables.c
@@ -565,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,

The reworked __inet_hash_connect():

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	static u32 hint;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = (hint + port_offset) % remaining;
	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	hint += i + 2;

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
}

/*
...
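For connect(), the starting offset combines a per-destination port_offset (a hash over the connection's addresses and destination port) with a global hint that advances past ports consumed by earlier searches, then gets rounded down to even so it stays off bind()'s odd ports. The sketch below models only that offset arithmetic; pick_connect_offset() is a hypothetical stand-in and the constant port_offset value merely imitates the per-destination hash.

/* Sketch of how connect() chooses its starting ephemeral offset.
 * pick_connect_offset() is a hypothetical illustration helper; the real
 * kernel derives port_offset from the connection's addresses and ports.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t hint;	/* advanced after each successful allocation */

static uint32_t pick_connect_offset(uint32_t port_offset, uint32_t remaining)
{
	uint32_t offset = (hint + port_offset) % remaining;

	/* First pass uses offsets of @low parity (even ports when low is
	 * even); inet_csk_get_port() deliberately does the opposite.
	 */
	return offset & ~1U;
}

int main(void)
{
	uint32_t remaining = 61000 - 32768;	/* exclusive upper bound, as in the patch */
	uint32_t port_offset = 0x12345678;	/* stands in for the per-(dst addr, dst port) hash */
	uint32_t off;

	remaining &= ~1U;
	off = pick_connect_offset(port_offset, remaining);
	printf("first candidate port: %u\n", 32768 + off);

	/* After finding a port i steps into the scan, the kernel does
	 * hint += i + 2, so subsequent searches start further along the
	 * range instead of re-walking the same ports.
	 */
	hint += 0 + 2;
	off = pick_connect_offset(port_offset, remaining);
	printf("next candidate port:  %u\n", 32768 + off);
	return 0;
}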