Commit b8439924 authored by Alexey Kuznetsov, committed by David S. Miller

Allow binding to an already in-use local port during connect when the
connection will still have a unique identity.  This fixes local port
space exhaustion, especially in web caches.

Initial work done by Andi Kleen.
parent 9a218f37
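For context, the "unique identity" the message refers to is the usual TCP 4-tuple: two outgoing connections may share a local port as long as their (local address, local port, remote address, remote port) tuples differ. A minimal, illustrative C sketch of that rule (the struct and helper below are mine, not part of the patch):

#include <netinet/in.h>
#include <stdbool.h>

/* Illustration only: the 4-tuple that makes an IPv4 connection unique. */
struct conn_id {
        struct in_addr saddr, daddr;   /* local and remote address */
        in_port_t      sport, dport;   /* local and remote port (network order) */
};

/* Two connections may coexist, even on the same local port,
 * unless every one of these four fields matches. */
static bool same_identity(const struct conn_id *a, const struct conn_id *b)
{
        return a->saddr.s_addr == b->saddr.s_addr &&
               a->daddr.s_addr == b->daddr.s_addr &&
               a->sport == b->sport &&
               a->dport == b->dport;
}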
@@ -126,7 +126,13 @@ tcp_max_tw_buckets - INTEGER
 	if network conditions require more than default value.
 
 tcp_tw_recycle - BOOLEAN
-	Enable fast recycling TIME-WAIT sockets. Default value is 1.
+	Enable fast recycling TIME-WAIT sockets. Default value is 0.
+	It should not be changed without advice/request of technical
+	experts.
+
+tcp_tw_reuse - BOOLEAN
+	Allow to reuse TIME-WAIT sockets for new connections when it is
+	safe from protocol viewpoint. Default value is 0.
 	It should not be changed without advice/request of technical
 	experts.
......
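Since tcp_tw_reuse is exposed as an ordinary integer sysctl (see the ctl_table addition further below), it can be toggled at runtime. A minimal sketch, assuming the entry shows up as /proc/sys/net/ipv4/tcp_tw_reuse and that the caller has the required privileges:

#include <stdio.h>

/* Enable TIME-WAIT reuse for outgoing connections; equivalent to
 * writing "1" to /proc/sys/net/ipv4/tcp_tw_reuse from a shell. */
int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

        if (f == NULL) {
                perror("tcp_tw_reuse");
                return 1;
        }
        fputs("1\n", f);
        return fclose(f) == 0 ? 0 : 1;
}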
@@ -288,7 +288,8 @@ enum
 	NET_TCP_ADV_WIN_SCALE=87,
 	NET_IPV4_NONLOCAL_BIND=88,
 	NET_IPV4_ICMP_RATELIMIT=89,
-	NET_IPV4_ICMP_RATEMASK=90
+	NET_IPV4_ICMP_RATEMASK=90,
+	NET_TCP_TW_REUSE=91
 };
 
 enum {
......
@@ -78,7 +78,7 @@ struct tcp_ehash_bucket {
  */
 struct tcp_bind_bucket {
 	unsigned short port;
-	unsigned short fastreuse;
+	signed short fastreuse;
 	struct tcp_bind_bucket *next;
 	struct sock *owners;
 	struct tcp_bind_bucket **pprev;
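The switch from unsigned to signed here, read together with the "fastreuse != 0" to "fastreuse > 0" change in tcp_v6_get_port further below, suggests that fastreuse now encodes three states instead of two, with a negative value meaning that fast port sharing is ruled out; the authoritative transitions live in the collapsed part of this diff and are not shown here. A purely illustrative C rendering of such a convention, with names of my own invention:

/* Hypothetical naming for the three values a signed fastreuse can hold;
 * only values > 0 pass the fast-reuse test in tcp_v6_get_port below. */
enum fastreuse_state {
        FASTREUSE_FORBIDDEN = -1,  /* sharing ruled out, e.g. port taken at connect time */
        FASTREUSE_UNDECIDED =  0,
        FASTREUSE_OK        =  1,  /* all owners set SO_REUSEADDR and none is listening */
};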
@@ -469,6 +469,7 @@ extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
 extern int sysctl_tcp_adv_win_scale;
+extern int sysctl_tcp_tw_reuse;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -577,9 +578,7 @@ struct tcp_func {
 					struct sk_buff *skb,
 					struct open_request *req,
 					struct dst_entry *dst);
 
-	int (*hash_connecting) (struct sock *sk);
-
 	int (*remember_stamp) (struct sock *sk);
 
 	__u16 net_header_len;
@@ -781,8 +780,7 @@ extern int tcp_v4_connect(struct sock *sk,
 					struct sockaddr *uaddr,
 					int addr_len);
 
-extern int tcp_connect(struct sock *sk,
-		       struct sk_buff *skb);
+extern int tcp_connect(struct sock *sk);
 
 extern struct sk_buff * tcp_make_synack(struct sock *sk,
 					struct dst_entry *dst,
......
@@ -655,13 +655,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
 	if (sk->state != TCP_CLOSE)
 		goto out;
 
-	err = -EAGAIN;
-	if (!inet->num) {
-		if (sk->prot->get_port(sk, 0) != 0)
-			goto out;
-		inet->sport = htons(inet->num);
-	}
-
 	err = sk->prot->connect(sk, uaddr, addr_len);
 	if (err < 0)
 		goto out;
......
@@ -219,6 +219,8 @@ ctl_table ipv4_table[] = {
 	 &sysctl_icmp_ratelimit, sizeof(int), 0644, NULL, &proc_dointvec},
 	{NET_IPV4_ICMP_RATEMASK, "icmp_ratemask",
 	 &sysctl_icmp_ratemask, sizeof(int), 0644, NULL, &proc_dointvec},
+	{NET_TCP_TW_REUSE, "tcp_tw_reuse",
+	 &sysctl_tcp_tw_reuse, sizeof(int), 0644, NULL, &proc_dointvec},
 	{0}
 };
......
This diff is collapsed.
@@ -38,6 +38,7 @@
 
 #include <net/tcp.h>
 
+#include <linux/compiler.h>
 #include <linux/smp_lock.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
@@ -1156,14 +1157,14 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	return skb;
 }
 
-int tcp_connect(struct sock *sk, struct sk_buff *buff)
+/*
+ * Do all connect socket setups that can be done AF independent.
+ */
+static inline void tcp_connect_init(struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_opt *tp = tcp_sk(sk);
 
-	/* Reserve space for headers. */
-	skb_reserve(buff, MAX_TCP_HEADER);
-
 	/* We'll fix this up when we get a response from the other end.
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 	 */
@@ -1190,14 +1191,6 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
 
 	tp->rcv_ssthresh = tp->rcv_wnd;
 
-	/* Socket identity change complete, no longer
-	 * in TCP_CLOSE, so enter ourselves into the
-	 * hash tables.
-	 */
-	tcp_set_state(sk,TCP_SYN_SENT);
-	if (tp->af_specific->hash_connecting(sk))
-		goto err_out;
-
 	sk->err = 0;
 	sk->done = 0;
 	tp->snd_wnd = 0;
@@ -1211,6 +1204,24 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
 	tp->rto = TCP_TIMEOUT_INIT;
 	tp->retransmits = 0;
 	tcp_clear_retrans(tp);
+}
+
+/*
+ * Build a SYN and send it off.
+ */
+int tcp_connect(struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff *buff;
+
+	tcp_connect_init(sk);
+
+	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
+	if (unlikely(buff == NULL))
+		return -ENOBUFS;
+
+	/* Reserve space for headers. */
+	skb_reserve(buff, MAX_TCP_HEADER);
 
 	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
 	TCP_ECN_send_syn(tp, buff);
@@ -1233,11 +1244,6 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
 	/* Timer for repeating the SYN until an answer. */
 	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 	return 0;
-
-err_out:
-	tcp_set_state(sk,TCP_CLOSE);
-	kfree_skb(buff);
-	return -EADDRNOTAVAIL;
 }
 
 /* Send out a delayed ack, the caller does the policy checking
......
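Taken together with the inet_stream_connect and tcp_v6_connect hunks, the new division of labour is: the address-family-specific connect routine moves the socket to SYN_SENT and hashes it (auto-binding a local port at that point, now that the destination is known), and only then calls tcp_connect(), which allocates and sends the SYN itself. A condensed, illustrative skeleton of that ordering; tcp_vX_hash_connect() is a stand-in for the per-family helper, not a real symbol, and real code also sets up the route, MSS clamp, initial sequence number, and so on:

/* Sketch only: the connect-path ordering this patch establishes. */
static int tcp_vX_connect_tail(struct sock *sk)
{
        int err;

        tcp_set_state(sk, TCP_SYN_SENT);

        /* Pick/validate the local port with the destination known,
         * and enter the socket into the hash tables. */
        err = tcp_vX_hash_connect(sk);
        if (err)
                goto late_failure;

        /* tcp_connect() now allocates the SYN skb itself. */
        err = tcp_connect(sk);
        if (err)
                goto late_failure;

        return 0;

late_failure:
        tcp_set_state(sk, TCP_CLOSE);
        return err;
}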
@@ -136,7 +136,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 				break;
 		}
 		if (tb != NULL && tb->owners != NULL) {
-			if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
+			if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
 				goto success;
 			} else {
 				struct ipv6_pinfo *np = inet6_sk(sk);
@@ -499,11 +499,21 @@ static int tcp_v6_check_established(struct sock *sk)
 	return -EADDRNOTAVAIL;
 }
 
-static int tcp_v6_hash_connecting(struct sock *sk)
+static int tcp_v6_hash_connect(struct sock *sk)
 {
-	unsigned short snum = inet_sk(sk)->num;
-	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
-	struct tcp_bind_bucket *tb = head->chain;
+	struct tcp_bind_hashbucket *head;
+	struct tcp_bind_bucket *tb;
+
+	/* XXX */
+	if (inet_sk(sk)->num == 0) {
+		int err = tcp_v6_get_port(sk, inet_sk(sk)->num);
+		if (err)
+			return err;
+		inet_sk(sk)->sport = htons(inet_sk(sk)->num);
+	}
+
+	head = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
+	tb = head->chain;
 
 	spin_lock_bh(&head->lock);
@@ -534,7 +544,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct in6_addr saddr_buf;
 	struct flowi fl;
 	struct dst_entry *dst;
-	struct sk_buff *buff;
 	int addr_type;
 	int err;
@@ -675,17 +684,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
 	tp->mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 
-	err = -ENOBUFS;
-	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
-
-	if (buff == NULL)
-		goto failure;
-
 	inet->dport = usin->sin6_port;
 
-	/*
-	 *	Init variables
-	 */
+	tcp_set_state(sk, TCP_SYN_SENT);
+	err = tcp_v6_hash_connect(sk);
+	if (err)
+		goto late_failure;
 
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
@@ -693,10 +697,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 							     inet->sport,
 							     inet->dport);
 
-	err = tcp_connect(sk, buff);
-	if (err == 0)
-		return 0;
+	err = tcp_connect(sk);
+	if (err)
+		goto late_failure;
+
+	return 0;
 
+late_failure:
+	tcp_set_state(sk, TCP_CLOSE);
 failure:
 	__sk_dst_reset(sk);
 	inet->dport = 0;
@@ -1785,7 +1793,6 @@ static struct tcp_func ipv6_specific = {
 	tcp_v6_rebuild_header,
 	tcp_v6_conn_request,
 	tcp_v6_syn_recv_sock,
-	tcp_v6_hash_connecting,
 	tcp_v6_remember_stamp,
 	sizeof(struct ipv6hdr),
@@ -1805,7 +1812,6 @@ static struct tcp_func ipv6_mapped = {
 	tcp_v4_rebuild_header,
 	tcp_v6_conn_request,
 	tcp_v6_syn_recv_sock,
-	tcp_v4_hash_connecting,
 	tcp_v4_remember_stamp,
 	sizeof(struct iphdr),
......
@@ -364,7 +364,6 @@ EXPORT_SYMBOL(tcp_inherit_port);
 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 EXPORT_SYMBOL(tcp_v4_connect);
-EXPORT_SYMBOL(tcp_v4_hash_connecting);
 EXPORT_SYMBOL(tcp_unhash);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(tcp_prot);
......