tcp_ipv4.c 57.1 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
8
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
Linus Torvalds's avatar
Linus Torvalds committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
35 36
 *					the TCP layer, missed a check for an
 *					ACK bit.
Linus Torvalds's avatar
Linus Torvalds committed
37 38 39 40 41 42 43 44 45
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen sematics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
46 47
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
Linus Torvalds's avatar
Linus Torvalds committed
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/init.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/stddef.h>
#include <linux/ipsec.h>

extern int sysctl_ip_dynaddr;
69
extern int sysctl_ip_default_ttl;
70
int sysctl_tcp_tw_reuse;
Linus Torvalds's avatar
Linus Torvalds committed
71 72 73 74

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

75
/* Socket used for sending RSTs */
Linus Torvalds's avatar
Linus Torvalds committed
76
static struct socket *tcp_socket;
Linus Torvalds's avatar
Linus Torvalds committed
77

78
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
Linus Torvalds's avatar
Linus Torvalds committed
79 80 81
		       struct sk_buff *skb);

/* Global TCP hashing state: established hash, bind hash, listening hash,
 * and the locks protecting them.  Use ISO C99 designated initializers
 * consistently (the old GNU "field:" form was mixed in here before).
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock     = RW_LOCK_UNLOCKED,
	.__tcp_lhash_users    = ATOMIC_INIT(0),
	.__tcp_lhash_wait     =
	  __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
95
int tcp_port_rover = 1024 - 1;
Linus Torvalds's avatar
Linus Torvalds committed
96 97 98 99

/* Fold the connection 4-tuple into an index into the established hash
 * table; tcp_ehash_size is a power of two, so the final mask selects
 * the bucket.
 */
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int mix = (laddr ^ lport) ^ (faddr ^ fport);

	mix ^= mix >> 16;
	mix ^= mix >> 8;
	return mix & (tcp_ehash_size - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
108 109 110 111 112
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;
Linus Torvalds's avatar
Linus Torvalds committed
113 114 115 116 117 118 119 120 121 122

	return tcp_hashfn(laddr, lport, faddr, fport);
}

/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
123 124 125
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
Linus Torvalds's avatar
Linus Torvalds committed
126 127 128
		tb->port = snum;
		tb->fastreuse = 0;
		tb->owners = NULL;
129
		if ((tb->next = head->chain) != NULL)
Linus Torvalds's avatar
Linus Torvalds committed
130 131 132 133 134 135 136 137 138 139
			tb->next->pprev = &tb->next;
		head->chain = tb;
		tb->pprev = &head->chain;
	}
	return tb;
}

/* Make CHILD share SK's bound local port by linking the child onto the
 * tcp_bind_bucket that SK already owns.
 * Caller must disable local BH processing.
 */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	/* sk->prev doubles as the back-pointer to sk's bind bucket. */
	tb = (struct tcp_bind_bucket *)sk->prev;
	/* Insert child at the head of the bucket's owner list. */
	if ((child->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &child->bind_next;
	tb->owners	  = child;
	child->bind_pprev = &tb->owners;
	child->prev	  = (struct sock *)tb;
	spin_unlock(&head->lock);
}

/* BH-safe wrapper around __tcp_inherit_port(). */
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

161 162
/* Record that SK owns local port SNUM by linking it onto bind bucket TB.
 * Caller holds the lock for TB's hash chain.
 */
void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	/* Insert sk at the head of the bucket's owner list. */
	if ((sk->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &sk->bind_next;
	tb->owners     = sk;
	sk->bind_pprev = &tb->owners;
	/* sk->prev doubles as the back-pointer to the bind bucket. */
	sk->prev       = (struct sock *)tb;
}
Linus Torvalds's avatar
Linus Torvalds committed
171 172

static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
173
{
174
	struct inet_opt *inet = inet_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
175 176
	struct sock *sk2 = tb->owners;
	int sk_reuse = sk->reuse;
177 178 179 180

	for ( ; sk2; sk2 = sk2->bind_next) {
		if (sk != sk2 && sk->bound_dev_if == sk2->bound_dev_if) {
			if (!sk_reuse || !sk2->reuse ||
Linus Torvalds's avatar
Linus Torvalds committed
181
			    sk2->state == TCP_LISTEN) {
182
				struct inet_opt *inet2 = inet_sk(sk2);
183 184
				if (!inet2->rcv_saddr || !inet->rcv_saddr ||
				    inet2->rcv_saddr == inet->rcv_saddr)
Linus Torvalds's avatar
Linus Torvalds committed
185 186 187 188
					break;
			}
		}
	}
189 190
	return sk2 != NULL;
}
Linus Torvalds's avatar
Linus Torvalds committed
191

Linus Torvalds's avatar
Linus Torvalds committed
192 193 194 195 196 197 198 199 200 201
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * Returns 0 on success, 1 on failure (range exhausted, allocation
 * failure, or bind conflict).
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		/* Autobind: walk the ephemeral range from the rover
		 * looking for a port with no bind bucket at all.
		 */
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			for (tb = head->chain; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			/* Free port found; exit with head->lock held. */
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover;
		tb = NULL;
	} else {
		/* Explicit port: find its bucket, if any. */
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		for (tb = head->chain; tb; tb = tb->next)
			if (tb->port == snum)
				break;
	}
	if (tb && tb->owners) {
		/* Port already owned: see whether sharing is allowed. */
		if (sk->reuse > 1)
			goto success;
		if (tb->fastreuse > 0 && sk->reuse && sk->state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	/* Maintain the fastreuse hint for future sharers. */
	if (!tb->owners) {
		if (sk->reuse && sk->state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse && (!sk->reuse || sk->state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!sk->prev)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(sk->prev == (struct sock *)tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

/* Get rid of any references to a local port held by the
 * given sock.  Caller must have local BH disabled.
 */
__inline__ void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	/* sk->prev doubles as the back-pointer to the bind bucket. */
	tb = (struct tcp_bind_bucket *) sk->prev;
	/* Unlink sk from the bucket's owner list. */
	if (sk->bind_next)
		sk->bind_next->bind_pprev = sk->bind_pprev;
	*(sk->bind_pprev) = sk->bind_next;
	sk->prev  = NULL;
	inet->num = 0;
	/* Last owner gone: unlink and free the bucket itself. */
	if (!tb->owners) {
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*(tb->pprev) = tb->next;
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
	spin_unlock(&head->lock);
}

/* BH-safe wrapper around __tcp_put_port(). */
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */

/* Take the listening-hash write lock, waiting until all lockless users
 * (tracked by tcp_lhash_users) have drained.  Returns with
 * tcp_lhash_lock write-held.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			/* Drop the lock while sleeping so users can finish. */
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&tcp_lhash_wait, &wait);
	}
}

338
/* Insert sk into the listening hash (for TCP_LISTEN sockets, when
 * listen_possible is set) or into the established hash otherwise.
 * Caller must have local BH disabled.
 */
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct sock **skp;
	rwlock_t *lock;

	BUG_TRAP(!sk->pprev);
	if (listen_possible && sk->state == TCP_LISTEN) {
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		/* Takes the lhash write lock, waiting out lockless users. */
		tcp_listen_wlock();
	} else {
		/* Cache the bucket index in sk->hashent for later unhash. */
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->hashent].lock;
		write_lock(lock);
	}
	/* Head-insert into the chosen chain. */
	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
	sock_prot_inc_use(sk->prot);
	write_unlock(lock);
	/* Wake writers that slept in tcp_listen_wlock(). */
	if (listen_possible && sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Hash a non-closed socket, disabling BHs around the insertion. */
static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

/* Remove sk from whichever hash table (listening or established) it is
 * currently in.  Safe to call on an unhashed socket.
 */
void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (!sk->pprev)
		goto ende;

	if (sk->state == TCP_LISTEN) {
		/* tcp_listen_wlock() takes only the plain write lock, so
		 * disable BHs explicitly to match write_unlock_bh() below.
		 */
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	/* Re-check under the lock before unlinking. */
	if (sk->pprev) {
		if (sk->next)
			sk->next->pprev = sk->pprev;
		*sk->pprev = sk->next;
		sk->pprev = NULL;
		sock_prot_dec_use(sk->prot);
	}
	write_unlock_bh(lock);

 ende:
	/* Wake writers that slept in tcp_listen_wlock(). */
	if (sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
/* Score-based best-match scan of one listening-hash chain: an exact
 * bound address and an exact bound device each add a point; a perfect
 * score of 3 short-circuits the scan.  Returns the best match or NULL.
 */
static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL;
	int score, hiscore;

	hiscore=0;
	for (; sk; sk = sk->next) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum) {
			__u32 rcv_saddr = inet->rcv_saddr;

			/* Base score: port matched. */
			score = 1;
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score++;
			}
			if (sk->bound_dev_if) {
				if (sk->bound_dev_if != dif)
					continue;
				score++;
			}
			if (score == 3)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
/* Find a listening socket for (daddr, hnum, dif).  If the chain has a
 * single fully-wildcarded entry it is returned without the scoring
 * scan; otherwise fall back to __tcp_v4_lookup_listener().  The result,
 * if any, is returned with a reference held.
 */
__inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					       int dif)
{
	struct sock *sk;

	read_lock(&tcp_lhash_lock);
	sk = tcp_listening_hash[tcp_lhashfn(hnum)];
	if (sk) {
		struct inet_opt *inet = inet_sk(sk);

		/* Fast path: sole entry on the chain that matches. */
		if (inet->num == hnum && !sk->next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    !sk->bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */

/* Exact 4-tuple lookup in the established hash, falling through to the
 * TIME-WAIT half of the table (at head + tcp_ehash_size).  Returns the
 * socket with a reference held, or NULL.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	for (sk = head->chain; sk; sk = sk->next) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	for (sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}

/* Look up the socket for an incoming segment: established (including
 * TIME-WAIT) sockets first, then listening sockets.  Local BH must be
 * disabled.
 */
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *result;

	result = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
	if (!result)
		result = tcp_v4_lookup_listener(daddr, hnum, dif);
	return result;
}

515 516
/* BH-safe socket lookup; dport is in network byte order. */
__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				      u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

/* Pick the initial sequence number for a connection answering skb,
 * keyed on the (reversed) address/port 4-tuple of the packet.
 */
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

535 536 537
/* Check that binding sk to local port lport yields a unique established
 * 4-tuple, possibly recycling a matching TIME-WAIT bucket, and hash sk
 * into the established table on success.  On success with twp non-NULL
 * the recycled TIME-WAIT bucket (or NULL) is handed back through *twp
 * for the caller to kill; with twp NULL it is killed here.  Returns 0
 * on success or -EADDRNOTAVAIL on collision.
 * called with local bh disabled
 */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2, **skp;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	for (skp = &(head + tcp_ehash_size)->chain; (sk2 = *skp) != NULL;
	     skp = &sk2->next) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			if (tw->ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->ts_recent_stamp > 1))) {
				/* Continue the old sequence space, well
				 * past the TIME-WAIT socket's last seq.
				 */
				if ((tp->write_seq =
						tw->snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->ts_recent	    = tw->ts_recent;
				tp->ts_recent_stamp = tw->ts_recent_stamp;
				sock_hold(sk2);
				skp = &head->chain;
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	for (skp = &head->chain; (sk2 = *skp) != NULL; skp = &sk2->next) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	BUG_TRAP(!sk->pprev);
	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;

	*skp = sk;
	sk->pprev = skp;
	sk->hashent = hash;
	sock_prot_inc_use(sk->prot);
	write_unlock(&head->lock);

	if (twp) {
		/* Caller takes ownership of the recycled TW bucket. */
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		tcp_timewait_kill(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}

632 633
/*
 * Bind a port for a connect operation and hash it.
 * With no port bound yet, scans the ephemeral range for a port whose
 * resulting 4-tuple is unique (possibly recycling a TIME-WAIT entry);
 * otherwise verifies the already-bound port.  Returns 0 or a negative
 * error.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	if (!snum) {
		int rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();

		/* TODO. Actually it is not so bad idea to remove
		 * tcp_portalloc_lock before next submission to Linus.
		 * As soon as we touch this place at all it is time to think.
		 *
		 * Now it protects single _advisory_ variable tcp_port_rover,
		 * hence it is mostly useless.
		 * Code will work nicely if we just delete it, but
		 * I am afraid in contented case it will work not better or
		 * even worse: another cpu just will hit the same bucket
		 * and spin there.
		 * So some cpu salt could remove both contention and
		 * memory pingpong. Any ideas how to do this in a nice way?
		 */
		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			for (tb = head->chain; tb; tb = tb->next) {
				if (tb->port == rover) {
					BUG_TRAP(tb->owners);
					/* fastreuse >= 0 means a normally
					 * bound port; skip it.  A -1 bucket
					 * was created by a previous connect
					 * and may still be unique.
					 */
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			/* Port completely unused: claim it. */
			tb = tcp_bucket_create(head, rover);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			/* Mark as connect-allocated (see check above). */
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* All locks still held and bhs disabled */
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		tcp_bind_hash(sk, tb, rover);
		/* Hash into the established table unless
		 * __tcp_v4_check_established() already did.
		 */
		if (!sk->pprev) {
			inet_sk(sk)->sport = htons(rover);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		/* Kill the TIME-WAIT bucket whose identity we recycled. */
		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head  = &tcp_bhash[tcp_bhashfn(snum)];
	tb  = (struct tcp_bind_bucket *)sk->prev;
	spin_lock_bh(&head->lock);
	if (tb->owners == sk && !sk->bind_next) {
		/* Sole owner of the port: the 4-tuple must be unique. */
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		/* Plain unlock keeps BHs disabled (they were disabled by
		 * spin_lock_bh) for the established-table check below.
		 */
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/* This will initiate an outgoing connection.
 * Validates the sockaddr, routes to the destination, binds/hashes a
 * local port, picks an initial sequence number and sends the SYN.
 * Returns 0 or a negative errno.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	/* With strict source routing, route via the first hop instead. */
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->bound_dev_if);
	if (tmp < 0)
		return tmp;

	/* TCP cannot connect to multicast or broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	__sk_dst_set(sk, &rt->u.dst);
	sk->route_caps = rt->u.dst.dev->features;

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	/* Fill in source address from the route if unbound. */
	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		 */

		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	/* RFC 1122 minimum-MTU-derived default until MSS is negotiated. */
	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initalization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	if (err)
		goto failure;

	return 0;

failure:
	/* Back out: leave the socket closed, unrouted and portless. */
	tcp_set_state(sk, TCP_CLOSE);
	__sk_dst_reset(sk);
	sk->route_caps = 0;
	inet->dport = 0;
	return err;
}

/* Interface index the packet arrived on, taken from its route entry. */
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

/* Hash a (remote address, remote port) pair into the SYN queue table;
 * TCP_SYNQ_HSIZE is a power of two, so the mask selects the slot.
 */
static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
{
	unsigned mix = raddr ^ rport;

	mix ^= mix >> 16;
	mix ^= mix >> 8;
	return mix & (TCP_SYNQ_HSIZE - 1);
}

868
/* Find a pending open_request in the listen socket's SYN table matching
 * the given remote port and address pair.  On a hit, *prevp is set to
 * the link pointing at the request so the caller can unlink it.
 * Returns the request or NULL.
 */
static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
					      struct open_request ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct tcp_listen_opt *lopt = tp->listen_opt;
	struct open_request *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		if (req->rmt_port == rport &&
		    req->af.v4_req.rmt_addr == raddr &&
		    req->af.v4_req.loc_addr == laddr &&
		    TCP_INET_FAMILY(req->class->family)) {
			/* Requests in the SYN table have no child yet. */
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
894
	struct tcp_opt *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910
	struct tcp_listen_opt *lopt = tp->listen_opt;
	unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);

	req->expires = jiffies + TCP_TIMEOUT_INIT;
	req->retrans = 0;
	req->sk = NULL;
	req->dl_next = lopt->syn_table[h];

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;
	write_unlock(&tp->syn_wait_lock);

	tcp_synq_added(sk);
}


911
/*
Linus Torvalds's avatar
Linus Torvalds committed
912 913
 * This routine does path mtu discovery as defined in RFC1191.
 */
914 915
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     unsigned mtu)
Linus Torvalds's avatar
Linus Torvalds committed
916 917
{
	struct dst_entry *dst;
918 919
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
920 921 922 923 924 925

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * send out by Linux are always <576bytes so they should go through
	 * unfragmented).
	 */
	if (sk->state == TCP_LISTEN)
926
		return;
Linus Torvalds's avatar
Linus Torvalds committed
927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet_to_big packets
	 * are send back when pmtu discovery is not active.
     	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	ip_rt_update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
		sk->err_soft = EMSGSIZE;

945
	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
Linus Torvalds's avatar
Linus Torvalds committed
946 947 948
	    tp->pmtu_cookie > dst->pmtu) {
		tcp_sync_mss(sk, dst->pmtu);

949
		/* Resend the TCP packet because it's
Linus Torvalds's avatar
Linus Torvalds committed
950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

Linus Torvalds's avatar
Linus Torvalds committed
974
void tcp_v4_err(struct sk_buff *skb, u32 info)
Linus Torvalds's avatar
Linus Torvalds committed
975
{
976 977
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
Linus Torvalds's avatar
Linus Torvalds committed
978
	struct tcp_opt *tp;
979
	struct inet_opt *inet;
Linus Torvalds's avatar
Linus Torvalds committed
980 981 982 983 984 985
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

Linus Torvalds's avatar
Linus Torvalds committed
986
	if (skb->len < (iph->ihl << 2) + 8) {
987
		ICMP_INC_STATS_BH(IcmpInErrors);
Linus Torvalds's avatar
Linus Torvalds committed
988 989 990
		return;
	}

991 992 993
	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
Linus Torvalds's avatar
Linus Torvalds committed
994 995 996 997
		ICMP_INC_STATS_BH(IcmpInErrors);
		return;
	}
	if (sk->state == TCP_TIME_WAIT) {
998
		tcp_tw_put((struct tcp_tw_bucket *)sk);
Linus Torvalds's avatar
Linus Torvalds committed
999 1000 1001 1002 1003 1004 1005
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
1006
	if (sk->lock.users)
Linus Torvalds's avatar
Linus Torvalds committed
1007 1008 1009 1010 1011
		NET_INC_STATS_BH(LockDroppedIcmps);

	if (sk->state == TCP_CLOSE)
		goto out;

1012
	tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
1013
	seq = ntohl(th->seq);
1014 1015
	if (sk->state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
Linus Torvalds's avatar
Linus Torvalds committed
1016 1017 1018 1019 1020 1021 1022 1023 1024
		NET_INC_STATS(OutOfWindowIcmps);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* This is deprecated, but if someone generated it,
		 * we have no reasons to ignore it.
		 */
1025
		if (!sk->lock.users)
Linus Torvalds's avatar
Linus Torvalds committed
1026 1027 1028 1029
			tcp_enter_cwr(tp);
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
1030
		break;
Linus Torvalds's avatar
Linus Torvalds committed
1031 1032 1033 1034 1035
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1036
			if (!sk->lock.users)
Linus Torvalds's avatar
Linus Torvalds committed
1037
				do_pmtu_discovery(sk, iph, info);
Linus Torvalds's avatar
Linus Torvalds committed
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->state) {
		struct open_request *req, **prev;
	case TCP_LISTEN:
1053
		if (sk->lock.users)
Linus Torvalds's avatar
Linus Torvalds committed
1054 1055
			goto out;

1056 1057
		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
Linus Torvalds's avatar
Linus Torvalds committed
1058 1059 1060 1061 1062 1063
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
1064
		BUG_TRAP(!req->sk);
Linus Torvalds's avatar
Linus Torvalds committed
1065 1066 1067 1068 1069 1070

		if (seq != req->snt_isn) {
			NET_INC_STATS_BH(OutOfWindowIcmps);
			goto out;
		}

1071
		/*
Linus Torvalds's avatar
Linus Torvalds committed
1072 1073 1074
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
1075 1076
		 * errors returned from accept().
		 */
Linus Torvalds's avatar
Linus Torvalds committed
1077 1078 1079 1080 1081 1082
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
1083 1084
			     */
		if (!sk->lock.users) {
Linus Torvalds's avatar
Linus Torvalds committed
1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
			TCP_INC_STATS_BH(TcpAttemptFails);
			sk->err = err;

			sk->error_report(sk);

			tcp_done(sk);
		} else {
			sk->err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

1113
	inet = inet_sk(sk);
1114
	if (!sk->lock.users && inet->recverr) {
Linus Torvalds's avatar
Linus Torvalds committed
1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126
		sk->err = err;
		sk->error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
1127
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
Linus Torvalds's avatar
Linus Torvalds committed
1128 1129
		       struct sk_buff *skb)
{
1130 1131
	struct inet_opt *inet = inet_sk(sk);

Linus Torvalds's avatar
Linus Torvalds committed
1132
	if (skb->ip_summed == CHECKSUM_HW) {
1133
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1134 1135
		skb->csum = offsetof(struct tcphdr, check);
	} else {
1136
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1137 1138 1139
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
Linus Torvalds's avatar
Linus Torvalds committed
1140
	}
Linus Torvalds's avatar
Linus Torvalds committed
1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

1166
	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
Linus Torvalds's avatar
Linus Torvalds committed
1167 1168 1169
		return;

	/* Swap the send and the receive. */
1170 1171 1172 1173 1174
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;
Linus Torvalds's avatar
Linus Torvalds committed
1175 1176 1177 1178 1179

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
1180 1181
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
Linus Torvalds's avatar
Linus Torvalds committed
1182 1183
	}

1184 1185
	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
Linus Torvalds's avatar
Linus Torvalds committed
1186
	arg.iov[0].iov_len  = sizeof rth;
1187
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
Linus Torvalds's avatar
Linus Torvalds committed
1188
				      skb->nh.iph->saddr, /*XXX*/
1189
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1190
	arg.n_iov = 1;
1191
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
Linus Torvalds's avatar
Linus Torvalds committed
1192

1193
	inet_sk(tcp_socket->sk)->ttl = sysctl_ip_default_ttl;
Linus Torvalds's avatar
Linus Torvalds committed
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203
	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TcpOutSegs);
	TCP_INC_STATS_BH(TcpOutRsts);
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

1204 1205
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
Linus Torvalds's avatar
Linus Torvalds committed
1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

1217
	arg.iov[0].iov_base = (unsigned char *)&rep;
Linus Torvalds's avatar
Linus Torvalds committed
1218 1219 1220
	arg.iov[0].iov_len  = sizeof(rep.th);
	arg.n_iov = 1;
	if (ts) {
1221 1222 1223
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
Linus Torvalds's avatar
Linus Torvalds committed
1224 1225 1226 1227 1228 1229
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
1230 1231 1232 1233
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
Linus Torvalds's avatar
Linus Torvalds committed
1234
	rep.th.ack_seq = htonl(ack);
1235 1236
	rep.th.ack     = 1;
	rep.th.window  = htons(win);
Linus Torvalds's avatar
Linus Torvalds committed
1237

1238
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
Linus Torvalds's avatar
Linus Torvalds committed
1239
				      skb->nh.iph->saddr, /*XXX*/
1240 1241
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
Linus Torvalds's avatar
Linus Torvalds committed
1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TcpOutSegs);
}

/* ACK a segment received by a TIME-WAIT pseudo-socket, echoing the
 * bucket's recorded sequence state and scaled receive window.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
			tw->rcv_wnd >> tw->rcv_wscale, tw->ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
1260
	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds's avatar
Linus Torvalds committed
1261 1262 1263
			req->ts_recent);
}

1264 1265
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct open_request *req)
Linus Torvalds's avatar
Linus Torvalds committed
1266 1267
{
	struct rtable *rt;
1268
	struct ip_options *opt = req->af.v4_req.opt;
Linus Torvalds's avatar
Linus Torvalds committed
1269

1270 1271
	if (ip_route_output(&rt, ((opt && opt->srr) ? opt->faddr :
						      req->af.v4_req.rmt_addr),
Linus Torvalds's avatar
Linus Torvalds committed
1272
			   req->af.v4_req.loc_addr,
Linus Torvalds's avatar
Linus Torvalds committed
1273
			   RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
Linus Torvalds's avatar
Linus Torvalds committed
1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285
		IP_INC_STATS_BH(IpOutNoRoutes);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IpOutNoRoutes);
		return NULL;
	}
	return &rt->u.dst;
}

/*
1286
 *	Send a SYN-ACK after having received an ACK.
Linus Torvalds's avatar
Linus Torvalds committed
1287 1288
 *	This still operates on a open_request only, not on a big
 *	socket.
1289
 */
Linus Torvalds's avatar
Linus Torvalds committed
1290 1291 1292 1293 1294 1295 1296
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
			      struct dst_entry *dst)
{
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
1297
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
Linus Torvalds's avatar
Linus Torvalds committed
1298 1299 1300 1301 1302 1303 1304 1305
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
1306 1307 1308 1309
					 req->af.v4_req.loc_addr,
					 req->af.v4_req.rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));
Linus Torvalds's avatar
Linus Torvalds committed
1310 1311

		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1312 1313
					    req->af.v4_req.rmt_addr,
					    req->af.v4_req.opt);
Linus Torvalds's avatar
Linus Torvalds committed
1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 open_request destructor.
1325
 */
Linus Torvalds's avatar
Linus Torvalds committed
1326 1327 1328 1329 1330 1331 1332 1333 1334
static void tcp_v4_or_free(struct open_request *req)
{
	if (req->af.v4_req.opt)
		kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;
1335 1336

	if (jiffies - warntime > HZ * 60) {
Linus Torvalds's avatar
Linus Torvalds committed
1337
		warntime = jiffies;
1338 1339
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
Linus Torvalds's avatar
Linus Torvalds committed
1340 1341 1342 1343
		       ntohs(skb->h.th->dest));
	}
}

1344 1345
/*
 * Save and compile IPv4 options into the open_request if needed.
Linus Torvalds's avatar
Linus Torvalds committed
1346
 */
1347 1348
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
1349 1350
{
	struct ip_options *opt = &(IPCB(skb)->opt);
1351
	struct ip_options *dopt = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
1352 1353

	if (opt && opt->optlen) {
1354
		int opt_size = optlength(opt);
Linus Torvalds's avatar
Linus Torvalds committed
1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show, that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Further increasing requires to change hash table size.
 */
int sysctl_max_syn_backlog = 256;
Linus Torvalds's avatar
Linus Torvalds committed
1380 1381

struct or_calltable or_ipv4 = {
1382 1383 1384 1385 1386
	.family =	PF_INET,
	.rtx_syn_ack =	tcp_v4_send_synack,
	.send_ack =	tcp_v4_or_send_ack,
	.destructor =	tcp_v4_or_free,
	.send_reset =	tcp_v4_send_reset,
Linus Torvalds's avatar
Linus Torvalds committed
1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs send to broadcast or multicast */
1404 1405 1406
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;
Linus Torvalds's avatar
Linus Torvalds committed
1407 1408 1409 1410 1411 1412 1413 1414

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
1415
			want_cookie = 1;
Linus Torvalds's avatar
Linus Torvalds committed
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = tcp_openreq_alloc();
1430
	if (!req)
Linus Torvalds's avatar
Linus Torvalds committed
1431 1432 1433 1434
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
1435
	tp.user_mss  = tcp_sk(sk)->user_mss;
Linus Torvalds's avatar
Linus Torvalds committed
1436 1437 1438 1439 1440 1441 1442 1443

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

1444
	if (tp.saw_tstamp && !tp.rcv_tsval) {
Linus Torvalds's avatar
Linus Torvalds committed
1445 1446 1447 1448 1449 1450
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
1451
		tp.tstamp_ok  = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468
	}
	tp.tstamp_ok = tp.saw_tstamp;

	tcp_openreq_init(req, &tp, skb);

	req->af.v4_req.loc_addr = daddr;
	req->af.v4_req.rmt_addr = saddr;
	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
	req->class = &or_ipv4;
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1469
	} else if (!isn) {
Linus Torvalds's avatar
Linus Torvalds committed
1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1484
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
Linus Torvalds's avatar
Linus Torvalds committed
1485 1486
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1487 1488
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
Linus Torvalds's avatar
Linus Torvalds committed
1489 1490 1491 1492 1493 1494 1495
				NET_INC_STATS_BH(PAWSPassiveRejected);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
1496 1497
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
Linus Torvalds's avatar
Linus Torvalds committed
1498 1499 1500
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst->rtt)) {
			/* Without syncookies last quarter of
1501 1502
			 * backlog is filled with destinations,
			 * proven to be alive.
Linus Torvalds's avatar
Linus Torvalds committed
1503 1504 1505 1506 1507
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			NETDEBUG(if (net_ratelimit()) \
1508 1509 1510 1511 1512
					printk(KERN_DEBUG "TCP: drop open "
							  "request from %u.%u."
							  "%u.%u/%u\n", \
					       NIPQUAD(saddr),
					       ntohs(skb->h.th->source)));
Linus Torvalds's avatar
Linus Torvalds committed
1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	req->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
1525
	   	tcp_openreq_free(req);
Linus Torvalds's avatar
Linus Torvalds committed
1526 1527 1528 1529 1530 1531
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
1532
	tcp_openreq_free(req);
Linus Torvalds's avatar
Linus Torvalds committed
1533 1534 1535 1536 1537 1538
drop:
	TCP_INC_STATS_BH(TcpAttemptFails);
	return 0;
}


1539 1540 1541
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
Linus Torvalds's avatar
Linus Torvalds committed
1542
 */
1543 1544 1545
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct open_request *req,
				  struct dst_entry *dst)
Linus Torvalds's avatar
Linus Torvalds committed
1546
{
1547
	struct inet_opt *newinet;
Linus Torvalds's avatar
Linus Torvalds committed
1548 1549 1550 1551 1552 1553
	struct tcp_opt *newtp;
	struct sock *newsk;

	if (tcp_acceptq_is_full(sk))
		goto exit_overflow;

1554
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
Linus Torvalds's avatar
Linus Torvalds committed
1555 1556 1557 1558 1559 1560 1561
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->dst_cache = dst;
Linus Torvalds's avatar
Linus Torvalds committed
1562
	newsk->route_caps = dst->dev->features;
Linus Torvalds's avatar
Linus Torvalds committed
1563

1564 1565 1566 1567 1568 1569 1570 1571 1572
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	newinet->daddr	      = req->af.v4_req.rmt_addr;
	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
	newinet->saddr	      = req->af.v4_req.loc_addr;
	newinet->opt	      = req->af.v4_req.opt;
	req->af.v4_req.opt    = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
Linus Torvalds's avatar
Linus Torvalds committed
1573
	newtp->ext_header_len = 0;
1574 1575 1576
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;
Linus Torvalds's avatar
Linus Torvalds committed
1577 1578 1579 1580 1581

	tcp_sync_mss(newsk, dst->pmtu);
	newtp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(newsk);

1582
	__tcp_v4_hash(newsk, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(ListenOverflows);
exit:
	NET_INC_STATS_BH(ListenDrops);
	dst_release(dst);
	return NULL;
}

1595
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
1596 1597
{
	struct tcphdr *th = skb->h.th;
Linus Torvalds's avatar
Linus Torvalds committed
1598
	struct iphdr *iph = skb->nh.iph;
1599
	struct tcp_opt *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
1600
	struct sock *nsk;
1601
	struct open_request **prev;
Linus Torvalds's avatar
Linus Torvalds committed
1602
	/* Find possible connection requests. */
1603 1604
	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
Linus Torvalds's avatar
Linus Torvalds committed
1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
1619
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
Linus Torvalds's avatar
Linus Torvalds committed
1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
Linus Torvalds's avatar
Linus Torvalds committed
1633
		skb->ip_summed = CHECKSUM_UNNECESSARY;
1634 1635
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
Linus Torvalds's avatar
Linus Torvalds committed
1636 1637
			return 0;

1638 1639
		NETDEBUG(if (net_ratelimit())
				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
Linus Torvalds's avatar
Linus Torvalds committed
1640 1641 1642
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
1643
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
Linus Torvalds's avatar
Linus Torvalds committed
1644 1645
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
Linus Torvalds's avatar
Linus Torvalds committed
1646 1647 1648
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
1649 1650 1651
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679
	}
	return 0;
}


/* The socket must have it's spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_FILTER
	struct sk_filter *filter = sk->filter;
	if (filter && sk_filter(skb, filter))
		goto discard;
#endif /* CONFIG_FILTER */

  	IP_INC_STATS_BH(IpInDelivers);

	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
1680
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1681 1682
	}

1683
	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
Linus Torvalds's avatar
Linus Torvalds committed
1684 1685
		goto csum_err;

1686
	if (sk->state == TCP_LISTEN) {
Linus Torvalds's avatar
Linus Torvalds committed
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
1709
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
Linus Torvalds's avatar
Linus Torvalds committed
1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}

/*
 *	From tcp_input.c
 *
 *	Main IPv4 TCP receive entry point, called from the IP layer in
 *	softirq context.  Validates the header, sets up the TCP control
 *	block in the skb, looks up the owning socket and dispatches to
 *	the appropriate processing path (normal, TIME_WAIT, or no-socket
 *	RST generation).  Always returns the value to hand back to the
 *	IP layer (0 or the do_rcv result).
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	/* Only process packets addressed to this host. */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TcpInSegs);

	/* Make sure the fixed-size TCP header is in the linear area. */
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	/* doff is in 32-bit words; anything below the minimum header is bogus. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	/* Pull in the options as well. */
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is elimineted.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	/* pskb_may_pull() may have reallocated; refetch the header pointer. */
	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* end_seq accounts for SYN/FIN each consuming one sequence number. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (!ipsec_sk_policy(sk, skb))
		goto discard_and_relse;

	if (sk->state == TCP_TIME_WAIT)
		goto do_time_wait;

	skb->dev = NULL;

	/* If the socket is locked by a user context, queue to the backlog;
	 * otherwise try the prequeue, falling back to direct processing.
	 */
	bh_lock_sock(sk);
	ret = 0;
	if (!sk->lock.users) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TcpInErrs);
	} else {
		/* Valid segment, no listener: answer with a RST. */
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
  	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TcpInErrs);
		goto discard_and_relse;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		/* A new SYN hit a TIME_WAIT socket: if a listener exists,
		 * kill the timewait bucket and restart processing against
		 * the listener so the connection can be reincarnated.
		 */
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_timewait_kill((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
 *
 * Remove the socket from its hash chain and re-insert it, used after
 * its addressing identity has changed (see tcp_v4_reselect_saddr()).
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->prot->unhash(sk);
	sk->prot->hash(sk);
}

/* Re-select the source address for an established-identity socket after
 * routing has changed (ip_dynaddr support).  Queries a fresh route and,
 * if the preferred source address differs, rewrites the socket's source
 * address and rehashes it.  Returns 0 on success or a negative errno
 * from the route lookup.
 */
static int tcp_v4_reselect_saddr(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	int err;
	struct rtable *rt;
	__u32 old_saddr = inet->saddr;
	__u32 new_saddr;
	__u32 daddr = inet->daddr;

	/* With strict source routing, route towards the first hop instead. */
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	/* Query new route. */
	err = ip_route_connect(&rt, daddr, 0,
			       RT_TOS(inet->tos) | sk->localroute,
			       sk->bound_dev_if);
	if (err)
		return err;

	__sk_dst_set(sk, &rt->u.dst);
	sk->route_caps = rt->u.dst.dev->features;

	new_saddr = rt->rt_src;

	if (new_saddr == old_saddr)
		return 0;

	if (sysctl_ip_dynaddr > 1) {
		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
		       NIPQUAD(old_saddr),
		       NIPQUAD(new_saddr));
	}

	inet->saddr = new_saddr;
	inet->rcv_saddr = new_saddr;

	/* XXX The only one ugly spot where we need to
	 * XXX really change the sockets identity after
	 * XXX it has entered the hashes. -DaveM
	 *
	 * Besides that, it does not check for connection
	 * uniqueness. Wait for troubles.
	 */
	__tcp_v4_rehash(sk);
	return 0;
}

/* Revalidate or rebuild the socket's cached route before transmitting.
 * If the cached dst is still valid, nothing is done.  Otherwise reroute;
 * on routing failure, optionally try re-selecting the source address
 * (only for SYN_SENT sockets with ip_dynaddr enabled and no explicit
 * bind).  Returns 0 on success, negative errno on failure.
 */
int tcp_v4_rebuild_header(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
	u32 daddr;
	int err;

	/* Route is OK, nothing to do. */
	if (rt)
		return 0;

	/* Reroute. */
	daddr = inet->daddr;
	if (inet->opt && inet->opt->srr)
		daddr = inet->opt->faddr;

	err = ip_route_output(&rt, daddr, inet->saddr,
			      RT_CONN_FLAGS(sk), sk->bound_dev_if);
	if (!err) {
		__sk_dst_set(sk, &rt->u.dst);
		sk->route_caps = rt->u.dst.dev->features;
		return 0;
	}

	/* Routing failed... */
	sk->route_caps = 0;

	/* Note: the short-circuit chain deliberately only attempts
	 * tcp_v4_reselect_saddr() when all the preceding conditions allow
	 * it; err is overwritten with its result in that case.
	 */
	if (!sysctl_ip_dynaddr ||
	    sk->state != TCP_SYN_SENT ||
	    (sk->userlocks & SOCK_BINDADDR_LOCK) ||
	    (err = tcp_v4_reselect_saddr(sk)) != 0)
		sk->err_soft=-err;

	return err;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1935
	struct inet_opt *inet = inet_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
1936 1937

	sin->sin_family		= AF_INET;
1938 1939
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
Linus Torvalds's avatar
Linus Torvalds committed
1940 1941 1942 1943 1944 1945 1946 1947 1948 1949
}

/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 *
 * Returns 1 if a peer entry was found and (possibly) updated, 0 otherwise.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	/* Prefer the peer already bound to the cached route; fall back to a
	 * direct inet_peer lookup, which then needs an explicit put.
	 */
	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		/* Only advance the stored timestamp: newer ts_recent, or a
		 * stale entry (older than TCP_PAWS_MSL) being refreshed.
		 */
		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->ts_recent_stamp;
			peer->tcp_ts = tp->ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

/* TIME_WAIT counterpart of tcp_v4_remember_stamp(): save the timewait
 * bucket's last-seen timestamp into the destination's inet_peer entry.
 * Returns 1 if a peer entry was found, 0 otherwise.
 */
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->daddr, 1);

	if (peer) {
		/* Same update rule as tcp_v4_remember_stamp(). */
		if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->ts_recent_stamp;
			peer->tcp_ts = tw->ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

/* Address-family specific operations used by the protocol-independent
 * TCP code when the socket is IPv4.
 */
struct tcp_func ipv4_specific = {
	.queue_xmit =	ip_queue_xmit,
	.send_check =	tcp_v4_send_check,
	.rebuild_header =tcp_v4_rebuild_header,
	.conn_request =	tcp_v4_conn_request,
	.syn_recv_sock =tcp_v4_syn_recv_sock,
	.remember_stamp =tcp_v4_remember_stamp,
	.net_header_len =sizeof(struct iphdr),
	.setsockopt =	ip_setsockopt,
	.getsockopt =	ip_getsockopt,
	.addr2sockaddr =v4_addr2sockaddr,
	.sockaddr_len =	sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 *
 * Per-socket initialisation for a fresh IPv4 TCP socket: timers,
 * queues, RTO defaults, congestion-control initial values and buffer
 * sizes.  Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;	/* RFC 1122 default MSS */

	tp->reordering = sysctl_tcp_reordering;

	sk->state = TCP_CLOSE;

	sk->write_space = tcp_write_space;
	sk->use_write_queue = 1;

	tp->af_specific = &ipv4_specific;

	sk->sndbuf = sysctl_tcp_wmem[1];
	sk->rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

/* Tear down per-socket TCP state: stop timers, purge all queues,
 * release the bind bucket and the cached sendmsg page.
 * Always returns 0.
 */
static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	/* Cleanup up the write buffer. */
  	tcp_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
  	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket.
	 * (sk->prev holds the bind bucket reference here.)
	 */
	if (sk->prev)
		tcp_put_port(sk);

	/* If sendmsg cached page exists, toss it. */
	if (tp->sndmsg_page)
		__free_page(tp->sndmsg_page);

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

/* Proc filesystem TCP sock list dumping. */

/* Format one pending open_request (embryonic connection) of listener sk
 * as a /proc/net/tcp line into tmpbuf.  i is the row number, uid the
 * listener's owner.
 */
static void get_openreq(struct sock *sk, struct open_request *req,
			char *tmpbuf, int i, int uid)
{
	/* Remaining lifetime of the request, in jiffies. */
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
		i,
		req->af.v4_req.loc_addr,
		ntohs(inet_sk(sk)->sport),
		req->af.v4_req.rmt_addr,
		ntohs(req->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		ttd,
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->refcnt),
		req);
}

/* Format one full TCP socket as a /proc/net/tcp line into tmpbuf.
 * i is the row number.  The timer_active code reflects which timer is
 * pending: 1 retransmit, 4 zero-window probe, 2 keepalive/other, 0 none.
 */
static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_opt *tp = tcp_sk(sp);
	struct inet_opt *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->timer)) {
		timer_active	= 2;
		timer_expires	= sp->timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active, timer_expires - jiffies,
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

/* Format one TIME_WAIT bucket as a /proc/net/tcp line into tmpbuf.
 * i is the row number; the remaining time-wait lifetime is clamped to 0.
 */
static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	int ttd = tw->ttd - jiffies;
	unsigned int src = tw->rcv_saddr;
	unsigned int dest = tw->daddr;
	__u16 srcp = ntohs(tw->sport);
	__u16 destp = ntohs(tw->dport);

	if (ttd < 0)
		ttd = 0;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->substate, 0, 0,
		3, ttd, 0, 0, 0, 0,
		atomic_read(&tw->refcnt), tw);
}

/* Fixed width of each /proc/net/tcp output line (excluding '\n'). */
#define TMPSZ 150

/* Generate /proc/net/tcp contents into buffer, honouring the classic
 * procfs read protocol (offset/length windowing via *start).
 * Walks, in order: the listening hash (including each listener's SYN
 * queue) and the established hash (including the TIME_WAIT half).
 * Returns the number of bytes produced for this window.
 */
int tcp_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len = 0, num = 0, i;
	off_t begin, pos = 0;
	char tmpbuf[TMPSZ + 1];

	if (offset < TMPSZ)
		len += sprintf(buffer, "%-*s\n", TMPSZ - 1,
			       "  sl  local_address rem_address   st tx_queue "
			       "rx_queue tr tm->when retrnsmt   uid  timeout "
			       "inode");

	pos = TMPSZ;

	/* First, walk listening socket table. */
	tcp_listen_lock();
	for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
		struct sock *sk;
		struct tcp_listen_opt *lopt;
		int k;

		for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
			struct open_request *req;
			int uid;
			struct tcp_opt *tp = tcp_sk(sk);

			if (!TCP_INET_FAMILY(sk->family))
				goto skip_listen;

			pos += TMPSZ;
			if (pos >= offset) {
				get_tcp_sock(sk, tmpbuf, num);
				len += sprintf(buffer + len, "%-*s\n",
					       TMPSZ - 1, tmpbuf);
				if (pos >= offset + length) {
					tcp_listen_unlock();
					goto out_no_bh;
				}
			}

skip_listen:
			/* Dump the listener's pending (SYN_RECV) requests. */
			uid = sock_i_uid(sk);
			read_lock_bh(&tp->syn_wait_lock);
			lopt = tp->listen_opt;
			if (lopt && lopt->qlen) {
				for (k = 0; k < TCP_SYNQ_HSIZE; k++) {
					for (req = lopt->syn_table[k];
					     req; req = req->dl_next, num++) {
						if (!TCP_INET_FAMILY(req->class->family))
							continue;

						pos += TMPSZ;
						if (pos <= offset)
							continue;
						get_openreq(sk, req, tmpbuf,
							    num, uid);
						len += sprintf(buffer + len,
							       "%-*s\n",
							       TMPSZ - 1,
							       tmpbuf);
						if (pos >= offset + length) {
							read_unlock_bh(&tp->syn_wait_lock);
							tcp_listen_unlock();
							goto out_no_bh;
						}
					}
				}
			}
			read_unlock_bh(&tp->syn_wait_lock);

			/* Completed requests are in normal socket hash table */
		}
	}
	tcp_listen_unlock();

	local_bh_disable();

	/* Next, walk established hash chain. */
	for (i = 0; i < tcp_ehash_size; i++) {
		struct tcp_ehash_bucket *head = &tcp_ehash[i];
		struct sock *sk;
		struct tcp_tw_bucket *tw;

		read_lock(&head->lock);
		for (sk = head->chain; sk; sk = sk->next, num++) {
			if (!TCP_INET_FAMILY(sk->family))
				continue;
			pos += TMPSZ;
			if (pos <= offset)
				continue;
			get_tcp_sock(sk, tmpbuf, num);
			len += sprintf(buffer + len, "%-*s\n",
				       TMPSZ - 1, tmpbuf);
			if (pos >= offset + length) {
				read_unlock(&head->lock);
				goto out;
			}
		}
		/* TIME_WAIT sockets live in the upper half of the ehash. */
		for (tw = (struct tcp_tw_bucket *)tcp_ehash[i +
							  tcp_ehash_size].chain;
		     tw;
		     tw = (struct tcp_tw_bucket *)tw->next, num++) {
			if (!TCP_INET_FAMILY(tw->family))
				continue;
			pos += TMPSZ;
			if (pos <= offset)
				continue;
			get_timewait_sock(tw, tmpbuf, num);
			len += sprintf(buffer + len, "%-*s\n",
				       TMPSZ - 1, tmpbuf);
			if (pos >= offset + length) {
				read_unlock(&head->lock);
				goto out;
			}
		}
		read_unlock(&head->lock);
	}

out:
	local_bh_enable();
out_no_bh:

	/* Translate the (pos, offset) window into the *start/len return
	 * expected by the procfs read protocol.
	 */
	begin = len - (pos - offset);
	*start = buffer + begin;
	len -= begin;
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}

/* The TCP protocol operations registered with the inet layer. */
struct proto tcp_prot = {
	.name =		"TCP",
	.close =	tcp_close,
	.connect =	tcp_v4_connect,
	.disconnect =	tcp_disconnect,
	.accept =	tcp_accept,
	.ioctl =	tcp_ioctl,
	.init =		tcp_v4_init_sock,
	.destroy =	tcp_v4_destroy_sock,
	.shutdown =	tcp_shutdown,
	.setsockopt =	tcp_setsockopt,
	.getsockopt =	tcp_getsockopt,
	.sendmsg =	tcp_sendmsg,
	.recvmsg =	tcp_recvmsg,
	.backlog_rcv =	tcp_v4_do_rcv,
	.hash =		tcp_v4_hash,
	.unhash =	tcp_unhash,
	.get_port =	tcp_v4_get_port,
};



void __init tcp_v4_init(struct net_proto_family *ops)
{
Linus Torvalds's avatar
Linus Torvalds committed
2331 2332
	int err = sock_create(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
Linus Torvalds's avatar
Linus Torvalds committed
2333
		panic("Failed to create the TCP control socket.\n");
2334
	tcp_socket->sk->allocation   = GFP_ATOMIC;
2335
	inet_sk(tcp_socket->sk)->ttl = MAXTTL;
Linus Torvalds's avatar
Linus Torvalds committed
2336 2337 2338 2339 2340 2341 2342

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->prot->unhash(tcp_socket->sk);
}