Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
8e33ba49
Commit
8e33ba49
authored
Nov 07, 2005
by
Linus Torvalds
Browse files
Options
Browse Files
Download
Plain Diff
Merge master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6
parents
8cde0776
2d43f112
Changes
24
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
24 changed files
with
1066 additions
and
886 deletions
+1066
-886
include/linux/pkt_sched.h
include/linux/pkt_sched.h
+24
-26
include/linux/skbuff.h
include/linux/skbuff.h
+30
-8
include/net/inet_ecn.h
include/net/inet_ecn.h
+24
-4
include/net/inet_hashtables.h
include/net/inet_hashtables.h
+0
-2
include/net/red.h
include/net/red.h
+325
-0
net/core/stream.c
net/core/stream.c
+6
-6
net/dccp/ipv4.c
net/dccp/ipv4.c
+3
-29
net/ipv4/inet_connection_sock.c
net/ipv4/inet_connection_sock.c
+3
-11
net/ipv4/netfilter/ip_conntrack_helper_pptp.c
net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+0
-4
net/ipv4/netfilter/ip_conntrack_netlink.c
net/ipv4/netfilter/ip_conntrack_netlink.c
+7
-12
net/ipv4/netfilter/ip_nat_core.c
net/ipv4/netfilter/ip_nat_core.c
+2
-4
net/ipv4/netfilter/ip_nat_helper_pptp.c
net/ipv4/netfilter/ip_nat_helper_pptp.c
+2
-0
net/ipv4/netfilter/ip_nat_proto_gre.c
net/ipv4/netfilter/ip_nat_proto_gre.c
+2
-2
net/ipv4/netfilter/ip_nat_proto_unknown.c
net/ipv4/netfilter/ip_nat_proto_unknown.c
+1
-1
net/ipv4/netfilter/ipt_CONNMARK.c
net/ipv4/netfilter/ipt_CONNMARK.c
+1
-0
net/ipv4/tcp.c
net/ipv4/tcp.c
+0
-1
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_ipv4.c
+0
-2
net/ipv6/tcp_ipv6.c
net/ipv6/tcp_ipv6.c
+4
-11
net/netfilter/nf_queue.c
net/netfilter/nf_queue.c
+1
-1
net/netfilter/nfnetlink_log.c
net/netfilter/nfnetlink_log.c
+2
-4
net/netfilter/nfnetlink_queue.c
net/netfilter/nfnetlink_queue.c
+2
-4
net/sched/sch_gred.c
net/sched/sch_gred.c
+411
-430
net/sched/sch_netem.c
net/sched/sch_netem.c
+109
-13
net/sched/sch_red.c
net/sched/sch_red.c
+107
-311
No files found.
include/linux/pkt_sched.h
View file @
8e33ba49
...
...
@@ -93,6 +93,7 @@ struct tc_fifo_qopt
/* PRIO section */
#define TCQ_PRIO_BANDS 16
#define TCQ_MIN_PRIO_BANDS 2
struct
tc_prio_qopt
{
...
...
@@ -169,6 +170,7 @@ struct tc_red_qopt
unsigned
char
Scell_log
;
/* cell size for idle damping */
unsigned
char
flags
;
#define TC_RED_ECN 1
#define TC_RED_HARDDROP 2
};
struct
tc_red_xstats
...
...
@@ -194,15 +196,11 @@ enum
#define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
#define TCA_SET_OFF TCA_GRED_PARMS
struct
tc_gred_qopt
{
__u32
limit
;
/* HARD maximal queue length (bytes)
*/
__u32
qth_min
;
/* Min average length threshold (bytes)
*/
__u32
qth_max
;
/* Max average length threshold (bytes)
*/
__u32
limit
;
/* HARD maximal queue length (bytes) */
__u32
qth_min
;
/* Min average length threshold (bytes) */
__u32
qth_max
;
/* Max average length threshold (bytes) */
__u32
DP
;
/* upto 2^32 DPs */
__u32
backlog
;
__u32
qave
;
...
...
@@ -210,22 +208,22 @@ struct tc_gred_qopt
__u32
early
;
__u32
other
;
__u32
pdrop
;
unsigned
char
Wlog
;
/* log(W) */
unsigned
char
Plog
;
/* log(P_max/(qth_max-qth_min)) */
unsigned
char
Scell_log
;
/* cell size for idle damping */
__u8
Wlog
;
/* log(W) */
__u8
Plog
;
/* log(P_max/(qth_max-qth_min)) */
__u8
Scell_log
;
/* cell size for idle damping */
__u8
prio
;
/* prio of this VQ */
__u32
packets
;
__u32
bytesin
;
};
/* gred setup */
struct
tc_gred_sopt
{
__u32
DPs
;
__u32
def_DP
;
__u8
grio
;
__u8
pad1
;
__u16
pad2
;
__u8
flags
;
__u16
pad1
;
};
/* HTB section */
...
...
include/linux/skbuff.h
View file @
8e33ba49
...
...
@@ -603,29 +603,46 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
*/
/**
* __skb_queue_
head
- queue a buffer at the list head
* __skb_queue_
after
 - queue a buffer after the given buffer
* @list: list to use
* @prev: place after this buffer
* @newsk: buffer to queue
*
* Queue a buffer
at the start
of a list. This function takes no locks
* Queue a buffer
in the middle
of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
extern
void
skb_queue_head
(
struct
sk_buff_head
*
list
,
struct
sk_buff
*
newsk
);
static
inline
void
__skb_queue_head
(
struct
sk_buff_head
*
list
,
static
inline
void
__skb_queue_after
(
struct
sk_buff_head
*
list
,
struct
sk_buff
*
prev
,
struct
sk_buff
*
newsk
)
{
struct
sk_buff
*
prev
,
*
next
;
struct
sk_buff
*
next
;
list
->
qlen
++
;
prev
=
(
struct
sk_buff
*
)
list
;
next
=
prev
->
next
;
newsk
->
next
=
next
;
newsk
->
prev
=
prev
;
next
->
prev
=
prev
->
next
=
newsk
;
}
/**
* __skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
*
* Queue a buffer at the start of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
extern
void
skb_queue_head
(
struct
sk_buff_head
*
list
,
struct
sk_buff
*
newsk
);
/*
 * Lock-free variant of skb_queue_head(): link @newsk in as the first
 * buffer of @list.  The list head itself, viewed as an sk_buff, is used
 * as the "previous" element, so the new buffer lands directly behind it.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
				    struct sk_buff *newsk)
{
	struct sk_buff *head = (struct sk_buff *)list;

	__skb_queue_after(list, head, newsk);
}
/**
* __skb_queue_tail - queue a buffer at the list tail
* @list: list to use
...
...
@@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr)
prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \
skb = skb->next)
/*
 * Walk @queue backwards: @skb starts at (queue)->prev and follows ->prev
 * until it equals the queue head cast to an sk_buff.  Each step prefetches
 * the next element to visit.  After a full walk without break, @skb is the
 * queue head cast to struct sk_buff *.
 */
#define skb_queue_reverse_walk(queue, skb) \
for (skb = (queue)->prev; \
prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \
skb = skb->prev)
extern
struct
sk_buff
*
skb_recv_datagram
(
struct
sock
*
sk
,
unsigned
flags
,
int
noblock
,
int
*
err
);
...
...
include/net/inet_ecn.h
View file @
8e33ba49
...
...
@@ -2,6 +2,7 @@
#define _INET_ECN_H_
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <net/dsfield.h>
enum
{
...
...
@@ -48,7 +49,7 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
(label) |= __constant_htons(INET_ECN_ECT_0 << 4); \
} while (0)
static
inline
void
IP_ECN_set_ce
(
struct
iphdr
*
iph
)
static
inline
int
IP_ECN_set_ce
(
struct
iphdr
*
iph
)
{
u32
check
=
iph
->
check
;
u32
ecn
=
(
iph
->
tos
+
1
)
&
INET_ECN_MASK
;
...
...
@@ -61,7 +62,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph)
* INET_ECN_CE => 00
*/
if
(
!
(
ecn
&
2
))
return
;
return
!
ecn
;
/*
* The following gives us:
...
...
@@ -72,6 +73,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph)
iph
->
check
=
check
+
(
check
>=
0xFFFF
);
iph
->
tos
|=
INET_ECN_CE
;
return
1
;
}
static
inline
void
IP_ECN_clear
(
struct
iphdr
*
iph
)
...
...
@@ -87,11 +89,12 @@ static inline void ipv4_copy_dscp(struct iphdr *outer, struct iphdr *inner)
struct
ipv6hdr
;
static
inline
void
IP6_ECN_set_ce
(
struct
ipv6hdr
*
iph
)
static
inline
int
IP6_ECN_set_ce
(
struct
ipv6hdr
*
iph
)
{
if
(
INET_ECN_is_not_ect
(
ipv6_get_dsfield
(
iph
)))
return
;
return
0
;
*
(
u32
*
)
iph
|=
htonl
(
INET_ECN_CE
<<
20
);
return
1
;
}
static
inline
void
IP6_ECN_clear
(
struct
ipv6hdr
*
iph
)
...
...
@@ -105,4 +108,21 @@ static inline void ipv6_copy_dscp(struct ipv6hdr *outer, struct ipv6hdr *inner)
ipv6_change_dsfield
(
inner
,
INET_ECN_MASK
,
dscp
);
}
/*
 * Set the ECN "congestion experienced" codepoint on @skb, dispatching on
 * skb->protocol to the IPv4 or IPv6 helper.  The helper is only called
 * when a complete network header fits between skb->nh.raw and skb->tail.
 *
 * Returns the helper's result, or 0 when the protocol is neither IPv4
 * nor IPv6 or the header does not fit.
 */
static inline int INET_ECN_set_ce(struct sk_buff *skb)
{
	if (skb->protocol == __constant_htons(ETH_P_IP)) {
		if (skb->nh.raw + sizeof(struct iphdr) <= skb->tail)
			return IP_ECN_set_ce(skb->nh.iph);
	} else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
		if (skb->nh.raw + sizeof(struct ipv6hdr) <= skb->tail)
			return IP6_ECN_set_ce(skb->nh.ipv6h);
	}

	return 0;
}
#endif
include/net/inet_hashtables.h
View file @
8e33ba49
...
...
@@ -125,9 +125,7 @@ struct inet_hashinfo {
rwlock_t
lhash_lock
____cacheline_aligned
;
atomic_t
lhash_users
;
wait_queue_head_t
lhash_wait
;
spinlock_t
portalloc_lock
;
kmem_cache_t
*
bind_bucket_cachep
;
int
port_rover
;
};
static
inline
unsigned
int
inet_ehashfn
(
const
__u32
laddr
,
const
__u16
lport
,
...
...
include/net/red.h
0 → 100644
View file @
8e33ba49
#ifndef __NET_SCHED_RED_H
#define __NET_SCHED_RED_H
#include <linux/config.h>
#include <linux/types.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/dsfield.h>
/* Random Early Detection (RED) algorithm.
=======================================
Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
This file codes a "divisionless" version of RED algorithm
as written down in Fig.17 of the paper.
Short description.
------------------
When a new packet arrives we calculate the average queue length:
avg = (1-W)*avg + W*current_queue_len,
W is the filter time constant (chosen as 2^(-Wlog)), it controls
the inertia of the algorithm. To allow larger bursts, W should be
decreased.
if (avg > th_max) -> packet marked (dropped).
if (avg < th_min) -> packet passes.
if (th_min < avg < th_max) we calculate probability:
Pb = max_P * (avg - th_min)/(th_max-th_min)
and mark (drop) packet with this probability.
Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
max_P should be small (not 1), usually 0.01..0.02 is good value.
max_P is chosen as a number, so that max_P/(th_max-th_min)
is a negative power of two, so that the arithmetic contains
only shifts.
Parameters, settable by user:
-----------------------------
qth_min - bytes (should be < qth_max/2)
qth_max - bytes (should be at least 2*qth_min and less than limit)
Wlog - bits (<32) log(1/W).
Plog - bits (<32)
Plog is related to max_P by formula:
max_P = (qth_max-qth_min)/2^Plog;
F.e. if qth_max=128K and qth_min=32K, then Plog=22
corresponds to max_P=0.02
Scell_log
Stab
Lookup table for log((1-W)^(t/t_ave)).
NOTES:
Upper bound on W.
-----------------
If you want to allow bursts of L packets of size S,
you should choose W:
L + 1 - th_min/S < (1-(1-W)^L)/W
th_min/S = 32 th_min/S = 4
log(W) L
-1 33
-2 35
-3 39
-4 46
-5 57
-6 75
-7 101
-8 135
-9 190
etc.
*/
#define RED_STAB_SIZE 256
#define RED_STAB_MASK (RED_STAB_SIZE - 1)
/* RED drop and mark counters. */
struct red_stats
{
	u32	prob_drop;	/* early drops decided by probability */
	u32	prob_mark;	/* early marks decided by probability */
	u32	forced_drop;	/* forced drops (qavg > max_thresh) */
	u32	forced_mark;	/* forced marks (qavg > max_thresh) */
	u32	pdrop;		/* drops caused by queue limits */
	u32	other;		/* drops caused by drop() calls */
	u32	backlog;
};
/* RED configuration plus the running state of the averaging algorithm. */
struct red_parms
{
	/* Configuration, filled in by red_set_parms() */
	u32		qth_min;	/* min average-length threshold,
					 * stored shifted left by Wlog */
	u32		qth_max;	/* max average-length threshold,
					 * stored shifted left by Wlog */
	u32		Scell_max;	/* 255 << Scell_log */
	u32		Rmask;		/* cached random mask, red_rmask(Plog) */
	u8		Scell_log;
	u8		Wlog;		/* log(W) */
	u8		Plog;		/* number of random bits */
	u8		Stab[RED_STAB_SIZE];

	/* Running state */
	int		qcount;		/* packets seen since the last random
					 * number was drawn */
	u32		qR;		/* cached random number */

	unsigned long	qavg;		/* average queue length, same scaling
					 * as the thresholds */
	psched_time_t	qidlestart;	/* start of the current idle period */
};
/*
 * red_rmask - build the mask applied to random numbers for a given Plog.
 * @Plog: number of random bits wanted
 *
 * Returns a mask with the low @Plog bits set, or all 32 bits set when
 * @Plog is 32 or more.
 *
 * The shift is done on a u32: with a plain int constant, Plog == 31 would
 * shift into the sign bit, which is undefined behaviour in C.
 */
static inline u32 red_rmask(u8 Plog)
{
	return Plog < 32 ? (((u32) 1 << Plog) - 1) : ~(u32) 0;
}
/*
 * red_set_parms - load a new RED configuration into @p.
 * @p:         parameter/state block to initialise
 * @qth_min:   minimum average-length threshold (unscaled)
 * @qth_max:   maximum average-length threshold (unscaled)
 * @Wlog:      log(W); both thresholds are stored shifted left by this
 * @Plog:      number of random bits, also used to derive p->Rmask
 * @Scell_log: cell size shift used for idle damping
 * @stab:      table of sizeof(p->Stab) bytes, copied into p->Stab
 *
 * No range checking is done here; the shifts are only well defined when
 * Wlog and Scell_log are below 32.
 */
static inline void red_set_parms(struct red_parms *p,
				 u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog,
				 u8 Scell_log, u8 *stab)
{
	/* Reset average queue length, the value is strictly bound
	 * to the parameters below, resetting hurts a bit but leaving
	 * it might result in an unreasonable qavg for a while. --TGR
	 */
	p->qavg		= 0;

	p->qcount	= -1;
	p->qth_min	= qth_min << Wlog;
	p->qth_max	= qth_max << Wlog;
	p->Wlog		= Wlog;
	p->Plog		= Plog;
	p->Rmask	= red_rmask(Plog);
	p->Scell_log	= Scell_log;
	/* Shift an unsigned constant: a plain int 255 overflows (undefined
	 * behaviour) as soon as Scell_log reaches 24.
	 */
	p->Scell_max	= ((u32) 255 << Scell_log);

	memcpy(p->Stab, stab, sizeof(p->Stab));
}
/*
 * Returns 1 while an idle period is in progress, i.e. while
 * p->qidlestart does not carry the "past perfect" marker that
 * red_end_of_idle_period() stores, and 0 otherwise.
 */
static inline int red_is_idling(struct red_parms *p)
{
	if (PSCHED_IS_PASTPERFECT(p->qidlestart))
		return 0;

	return 1;
}
/*
 * Mark the beginning of an idle period by applying PSCHED_GET_TIME to
 * p->qidlestart (presumably storing the current scheduler clock value;
 * the macro is defined outside this file).
 */
static
inline
void
red_start_of_idle_period
(
struct
red_parms
*
p
)
{
PSCHED_GET_TIME
(
p
->
qidlestart
);
}
/*
 * End the current idle period by setting p->qidlestart to the "past
 * perfect" marker; red_is_idling() tests for that marker.
 */
static
inline
void
red_end_of_idle_period
(
struct
red_parms
*
p
)
{
PSCHED_SET_PASTPERFECT
(
p
->
qidlestart
);
}
/*
 * Return the algorithm to its initial state: no idle period in progress,
 * zero average queue length, and qcount back at -1.
 */
static inline void red_restart(struct red_parms *p)
{
	red_end_of_idle_period(p);

	p->qcount = -1;
	p->qavg = 0;
}
/*
 * Compute the decayed average queue length after an idle period.
 *
 * The idle time is measured from p->qidlestart (bounded by p->Scell_max)
 * and used to look up a shift in p->Stab.  Returns the new average;
 * p->qavg itself is not modified.
 */
static inline unsigned long red_calc_qavg_from_idle_time(struct red_parms *p)
{
	psched_time_t t_now;
	long idle_us;
	int decay_shift;

	PSCHED_GET_TIME(t_now);
	idle_us = PSCHED_TDIFF_SAFE(t_now, p->qidlestart, p->Scell_max);

	/*
	 * The problem: ideally, the average queue length would be
	 * recalculated at constant clock intervals. That is too expensive,
	 * so the calculation is driven by outgoing packets instead.
	 * While the queue is idle we have to model this clock by hand.
	 *
	 * SF+VJ proposed to "generate":
	 *
	 *	m = idletime / (average_pkt_size / bandwidth)
	 *
	 * dummy packets as a burst after the idle time, i.e.
	 *
	 *	p->qavg *= (1-W)^m
	 *
	 * This is an apparently overcomplicated solution (f.e. we have to
	 * precompute a table to make this calculation in reasonable time).
	 * A simpler model may be usable here, but that is a field for
	 * experiments.
	 */
	decay_shift = p->Stab[(idle_us >> p->Scell_log) & RED_STAB_MASK];
	if (decay_shift != 0)
		return p->qavg >> decay_shift;

	/* Table entry is zero: approximate the initial part of the
	 * exponent with a linear function:
	 *
	 *	(1-W)^m ~= 1-mW + ...
	 *
	 * This works around the too coarse tabulation of the exponent.
	 * The result is never allowed to drop below half of qavg.
	 */
	idle_us = (p->qavg * idle_us) >> p->Scell_log;

	return (idle_us < (p->qavg >> 1)) ? p->qavg - idle_us : p->qavg >> 1;
}
/*
 * Compute the new average queue length for a busy (non-idle) queue.
 *
 * NOTE: p->qavg is a fixed point number with the point at Wlog.
 * The expression below is equivalent to the floating point version:
 *
 *	qavg = qavg*(1-W) + backlog*W;
 *
 * --ANK (980924)
 *
 * Returns the new average; p->qavg itself is not modified.
 */
static inline unsigned long red_calc_qavg_no_idle_time(struct red_parms *p,
						       unsigned int backlog)
{
	unsigned long decay = p->qavg >> p->Wlog;

	return p->qavg + (backlog - decay);
}
/*
 * Compute the new average queue length, choosing the idle-time model
 * while an idle period is in progress and the backlog-driven update
 * otherwise.  @backlog is ignored in the idle case.
 */
static inline unsigned long red_calc_qavg(struct red_parms *p,
					  unsigned int backlog)
{
	if (red_is_idling(p))
		return red_calc_qavg_from_idle_time(p);

	return red_calc_qavg_no_idle_time(p, backlog);
}
/* Draw a random number from net_random() limited to the bits in p->Rmask. */
static inline u32 red_random(struct red_parms *p)
{
	u32 rnd = net_random();

	return rnd & p->Rmask;
}
/*
 * Decide whether the current packet should be marked while the average
 * lies between the thresholds.  Returns non-zero to mark.
 *
 * The formula used below causes questions.
 *
 * OK. qR is a random number in the interval 0..Rmask,
 * i.e. 0..(2^Plog). If we used floating point arithmetic,
 * it would be: (2^Plog)*rnd_num, where rnd_num is less than 1.
 *
 * Taking into account that qavg has its fixed point at Wlog,
 * and that Plog is related to max_P by
 * max_P = (qth_max-qth_min)/2^Plog, the comparison below has
 * the following floating point equivalent:
 *
 *	max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount
 *
 * Any questions? --ANK (980924)
 */
static inline int red_mark_probability(struct red_parms *p, unsigned long qavg)
{
	unsigned long scaled = ((qavg - p->qth_min) >> p->Wlog) * p->qcount;

	return scaled >= p->qR;
}
/* Position of the average queue length relative to the two thresholds,
 * as reported by red_cmp_thresh().
 */
enum {
	RED_BELOW_MIN_THRESH	= 0,
	RED_BETWEEN_TRESH	= 1,
	RED_ABOVE_MAX_TRESH	= 2,
};
/*
 * Classify @qavg against the stored thresholds: below p->qth_min, at or
 * above p->qth_max, or in between.  The minimum is tested first.
 */
static inline int red_cmp_thresh(struct red_parms *p, unsigned long qavg)
{
	if (qavg < p->qth_min)
		return RED_BELOW_MIN_THRESH;

	if (qavg >= p->qth_max)
		return RED_ABOVE_MAX_TRESH;

	return RED_BETWEEN_TRESH;
}
/* Verdicts returned by red_action(). */
enum {
	RED_DONT_MARK	= 0,
	RED_PROB_MARK	= 1,
	RED_HARD_MARK	= 2,
};
/*
 * Decide what to do with a packet given the average queue length @qavg.
 *
 * Below the minimum threshold: never mark, reset qcount to -1.
 * Between the thresholds: count the packet; when the count wraps to 0
 * draw a fresh random number, otherwise mark with the probability from
 * red_mark_probability() and, on a mark, restart the count and draw a
 * new random number.
 * At or above the maximum threshold: always mark, reset qcount to -1.
 *
 * Returns RED_DONT_MARK, RED_PROB_MARK or RED_HARD_MARK.
 */
static inline int red_action(struct red_parms *p, unsigned long qavg)
{
	switch (red_cmp_thresh(p, qavg)) {
	case RED_BELOW_MIN_THRESH:
		p->qcount = -1;
		return RED_DONT_MARK;

	case RED_BETWEEN_TRESH:
		if (++p->qcount == 0) {
			/* First packet of a run: just seed the random number */
			p->qR = red_random(p);
			return RED_DONT_MARK;
		}

		if (!red_mark_probability(p, qavg))
			return RED_DONT_MARK;

		p->qcount = 0;
		p->qR = red_random(p);
		return RED_PROB_MARK;

	case RED_ABOVE_MAX_TRESH:
		p->qcount = -1;
		return RED_HARD_MARK;
	}

	/* red_cmp_thresh() has no other return value */
	BUG();
	return RED_DONT_MARK;
}
#endif
net/core/stream.c
View file @
8e33ba49
...
...
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
struct
task_struct
*
tsk
=
current
;
DEFINE_WAIT
(
wait
);
int
done
;
while
(
1
)
{
do
{
if
(
sk
->
sk_err
)
return
sock_error
(
sk
);
if
((
1
<<
sk
->
sk_state
)
&
~
(
TCPF_SYN_SENT
|
TCPF_SYN_RECV
))
...
...
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
prepare_to_wait
(
sk
->
sk_sleep
,
&
wait
,
TASK_INTERRUPTIBLE
);
sk
->
sk_write_pending
++
;
if
(
sk_wait_event
(
sk
,
timeo_p
,
done
=
sk_wait_event
(
sk
,
timeo_p
,
!
((
1
<<
sk
->
sk_state
)
&
~
(
TCPF_ESTABLISHED
|
TCPF_CLOSE_WAIT
))))
break
;
~
(
TCPF_ESTABLISHED
|
TCPF_CLOSE_WAIT
)));
finish_wait
(
sk
->
sk_sleep
,
&
wait
);
sk
->
sk_write_pending
--
;
}
}
while
(
!
done
);
return
0
;
}
...
...
net/dccp/ipv4.c
View file @
8e33ba49
...
...
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
.
lhash_lock
=
RW_LOCK_UNLOCKED
,
.
lhash_users
=
ATOMIC_INIT
(
0
),
.
lhash_wait
=
__WAIT_QUEUE_HEAD_INITIALIZER
(
dccp_hashinfo
.
lhash_wait
),
.
portalloc_lock
=
SPIN_LOCK_UNLOCKED
,
.
port_rover
=
1024
-
1
,
};
EXPORT_SYMBOL_GPL
(
dccp_hashinfo
);
...
...
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
int
ret
;
if
(
snum
==
0
)
{
int
rover
;
int
low
=
sysctl_local_port_range
[
0
];
int
high
=
sysctl_local_port_range
[
1
];
int
remaining
=
(
high
-
low
)
+
1
;
int
rover
=
net_random
()
%
(
high
-
low
)
+
low
;
struct
hlist_node
*
node
;
struct
inet_timewait_sock
*
tw
=
NULL
;
local_bh_disable
();
/* TODO. Actually it is not so bad idea to remove
* dccp_hashinfo.portalloc_lock before next submission to
* Linus.
* As soon as we touch this place at all it is time to think.
*
* Now it protects single _advisory_ variable
* dccp_hashinfo.port_rover, hence it is mostly useless.
* Code will work nicely if we just delete it, but
* I am afraid in contented case it will work not better or
* even worse: another cpu just will hit the same bucket
* and spin there.
* So some cpu salt could remove both contention and
* memory pingpong. Any ideas how to do this in a nice way?
*/
spin_lock
(
&
dccp_hashinfo
.
portalloc_lock
);
rover
=
dccp_hashinfo
.
port_rover
;
do
{
rover
++
;
if
((
rover
<
low
)
||
(
rover
>
high
))
rover
=
low
;
head
=
&
dccp_hashinfo
.
bhash
[
inet_bhashfn
(
rover
,
dccp_hashinfo
.
bhash_size
)];
spin_lock
(
&
head
->
lock
);
...
...
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
next_port:
spin_unlock
(
&
head
->
lock
);
if
(
++
rover
>
high
)
rover
=
low
;
}
while
(
--
remaining
>
0
);
dccp_hashinfo
.
port_rover
=
rover
;
spin_unlock
(
&
dccp_hashinfo
.
portalloc_lock
);
local_bh_enable
();
...
...
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
ok:
/* All locks still held and bhs disabled */
dccp_hashinfo
.
port_rover
=
rover
;
spin_unlock
(
&
dccp_hashinfo
.
portalloc_lock
);
inet_bind_hash
(
sk
,
tb
,
rover
);
if
(
sk_unhashed
(
sk
))
{
inet_sk
(
sk
)
->
sport
=
htons
(
rover
);
...
...
net/ipv4/inet_connection_sock.c
View file @
8e33ba49
...
...
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
int
low
=
sysctl_local_port_range
[
0
];
int
high
=
sysctl_local_port_range
[
1
];
int
remaining
=
(
high
-
low
)
+
1
;
int
rover
;
int
rover
=
net_random
()
%
(
high
-
low
)
+
low
;
spin_lock
(
&
hashinfo
->
portalloc_lock
);
if
(
hashinfo
->
port_rover
<
low
)
rover
=
low
;
else
rover
=
hashinfo
->
port_rover
;
do
{
rover
++
;
if
(
rover
>
high
)
rover
=
low
;
head
=
&
hashinfo
->
bhash
[
inet_bhashfn
(
rover
,
hashinfo
->
bhash_size
)];
spin_lock
(
&
head
->
lock
);
inet_bind_bucket_for_each
(
tb
,
node
,
&
head
->
chain
)
...
...
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
break
;
next:
spin_unlock
(
&
head
->
lock
);
if
(
++
rover
>
high
)
rover
=
low
;
}
while
(
--
remaining
>
0
);
hashinfo
->
port_rover
=
rover
;
spin_unlock
(
&
hashinfo
->
portalloc_lock
);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
...
...
net/ipv4/netfilter/ip_conntrack_helper_pptp.c
View file @
8e33ba49
...
...
@@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master,
exp_orig
->
expectfn
=
pptp_expectfn
;
exp_orig
->
flags
=
0
;
exp_orig
->
dir
=
IP_CT_DIR_ORIGINAL
;
/* both expectations are identical apart from tuple */
memcpy
(
exp_reply
,
exp_orig
,
sizeof
(
*
exp_reply
));
memcpy
(
&
exp_reply
->
tuple
,
&
exp_tuples
[
1
],
sizeof
(
exp_reply
->
tuple
));
exp_reply
->
dir
=
!
exp_orig
->
dir
;
if
(
ip_nat_pptp_hook_exp_gre
)
ret
=
ip_nat_pptp_hook_exp_gre
(
exp_orig
,
exp_reply
);
else
{
...
...
net/ipv4/netfilter/ip_conntrack_netlink.c
View file @
8e33ba49
...
...
@@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
IPCTNL_MSG_CT_NEW
,
1
,
ct
);
ip_conntrack_put
(
ct
);
if
(
err
<=
0
)
goto
out
;
goto
free
;
err
=
netlink_unicast
(
ctnl
,
skb2
,
NETLINK_CB
(
skb
).
pid
,
MSG_DONTWAIT
);
if
(
err
<
0
)
...
...
@@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP
(
"leaving
\n
"
);
return
0
;
out:
if
(
skb2
)
free:
kfree_skb
(
skb2
);
out:
return
-
1
;
}
...
...
@@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
nlh
->
nlmsg_seq
,
IPCTNL_MSG_EXP_NEW
,
1
,
exp
);
if
(
err
<=
0
)
goto
out
;
goto
free
;
ip_conntrack_expect_put
(
exp
);
err
=
netlink_unicast
(
ctnl
,
skb2
,
NETLINK_CB
(
skb
).
pid
,
MSG_DONTWAIT
);
if
(
err
<
0
)
goto
free
;
return
err
;
return
netlink_unicast
(
ctnl
,
skb2
,
NETLINK_CB
(
skb
).
pid
,
MSG_DONTWAIT
);
out:
ip_conntrack_expect_put
(
exp
);
free:
if
(
skb2
)
kfree_skb
(
skb2
);
out:
ip_conntrack_expect_put
(
exp
);
return
err
;
}
...
...
net/ipv4/netfilter/ip_nat_core.c
View file @
8e33ba49
...
...
@@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum)
* removed until we've grabbed the reference */
preempt_disable
();
p
=
__ip_nat_proto_find
(
protonum
);
if
(
p
)
{
if
(
!
try_module_get
(
p
->
me
))
p
=
&
ip_nat_unknown_protocol
;
}
preempt_enable
();
return
p
;
...
...
net/ipv4/netfilter/ip_nat_helper_pptp.c
View file @
8e33ba49
...
...
@@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
expect_orig
->
saved_proto
.
gre
.
key
=
htons
(
nat_pptp_info
->
pac_call_id
);
expect_orig
->
tuple
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pns_call_id
);
expect_orig
->
tuple
.
dst
.
u
.
gre
.
key
=
htons
(
ct_pptp_info
->
pac_call_id
);
expect_orig
->
dir
=
IP_CT_DIR_ORIGINAL
;
inv_t
.
src
.
ip
=
reply_t
->
src
.
ip
;
inv_t
.
dst
.
ip
=
reply_t
->
dst
.
ip
;
inv_t
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pac_call_id
);
...
...
@@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
expect_reply
->
saved_proto
.
gre
.
key
=
htons
(
nat_pptp_info
->
pns_call_id
);
expect_reply
->
tuple
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pac_call_id
);
expect_reply
->
tuple
.
dst
.
u
.
gre
.
key
=
htons
(
ct_pptp_info
->
pns_call_id
);
expect_reply
->
dir
=
IP_CT_DIR_REPLY
;
inv_t
.
src
.
ip
=
orig_t
->
src
.
ip
;
inv_t
.
dst
.
ip
=
orig_t
->
dst
.
ip
;
inv_t
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pns_call_id
);
...
...
net/ipv4/netfilter/ip_nat_proto_gre.c
View file @
8e33ba49
...
...
@@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb,
break
;
case
GRE_VERSION_PPTP
:
DEBUGP
(
"call_id -> 0x%04x
\n
"
,
ntoh
l
(
tuple
->
dst
.
u
.
gre
.
key
));
pgreh
->
call_id
=
htons
(
ntohl
(
tuple
->
dst
.
u
.
gre
.
key
))
;
ntoh
s
(
tuple
->
dst
.
u
.
gre
.
key
));
pgreh
->
call_id
=
tuple
->
dst
.
u
.
gre
.
key
;
break
;
default:
DEBUGP
(
"can't nat unknown GRE version
\n
"
);
...
...
net/ipv4/netfilter/ip_nat_proto_unknown.c
View file @
8e33ba49
...
...
@@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
struct
ip_nat_protocol
ip_nat_unknown_protocol
=
{
.
name
=
"unknown"
,
.
me
=
THIS_MODULE
,
/* .me isn't set: getting a ref to this cannot fail. */
.
manip_pkt
=
unknown_manip_pkt
,
.
in_range
=
unknown_in_range
,
.
unique_tuple
=
unknown_unique_tuple
,
...
...
net/ipv4/netfilter/ipt_CONNMARK.c
View file @
8e33ba49
...
...
@@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = {
static
int
__init
init
(
void
)
{
need_ip_conntrack
();
return
ipt_register_target
(
&
ipt_connmark_reg
);
}
...
...
net/ipv4/tcp.c
View file @
8e33ba49
...
...
@@ -2112,7 +2112,6 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans
>>=
(
3
-
order
);
sysctl_max_syn_backlog
=
128
;
}
tcp_hashinfo
.
port_rover
=
sysctl_local_port_range
[
0
]
-
1
;
sysctl_tcp_mem
[
0
]
=
768
<<
order
;
sysctl_tcp_mem
[
1
]
=
1024
<<
order
;
...
...
net/ipv4/tcp_ipv4.c
View file @
8e33ba49
...
...
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.
lhash_lock
=
RW_LOCK_UNLOCKED
,
.
lhash_users
=
ATOMIC_INIT
(
0
),
.
lhash_wait
=
__WAIT_QUEUE_HEAD_INITIALIZER
(
tcp_hashinfo
.
lhash_wait
),
.
portalloc_lock
=
SPIN_LOCK_UNLOCKED
,
.
port_rover
=
1024
-
1
,
};
static
int
tcp_v4_get_port
(
struct
sock
*
sk
,
unsigned
short
snum
)
...
...
net/ipv6/tcp_ipv6.c
View file @
8e33ba49
...
...
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
int
low
=
sysctl_local_port_range
[
0
];
int
high
=
sysctl_local_port_range
[
1
];
int
remaining
=
(
high
-
low
)
+
1
;
int
rover
;
int
rover
=
net_random
()
%
(
high
-
low
)
+
low
;
spin_lock
(
&
tcp_hashinfo
.
portalloc_lock
);
if
(
tcp_hashinfo
.
port_rover
<
low
)
rover
=
low
;
else
rover
=
tcp_hashinfo
.
port_rover
;
do
{
rover
++
;
if
(
rover
>
high
)
rover
=
low
;
do
{
head
=
&
tcp_hashinfo
.
bhash
[
inet_bhashfn
(
rover
,
tcp_hashinfo
.
bhash_size
)];
spin_lock
(
&
head
->
lock
);
inet_bind_bucket_for_each
(
tb
,
node
,
&
head
->
chain
)
...
...
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
break
;
next:
spin_unlock
(
&
head
->
lock
);
if
(
++
rover
>
high
)
rover
=
low
;
}
while
(
--
remaining
>
0
);
tcp_hashinfo
.
port_rover
=
rover
;
spin_unlock
(
&
tcp_hashinfo
.
portalloc_lock
);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
...
...
net/netfilter/nf_queue.c
View file @
8e33ba49
...
...
@@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb,
/* QUEUE == DROP if noone is waiting, to be safe. */
read_lock
(
&
queue_handler_lock
);
if
(
!
queue_handler
[
pf
]
->
outfn
)
{
if
(
!
queue_handler
[
pf
]
||
!
queue_handler
[
pf
]
->
outfn
)
{
read_unlock
(
&
queue_handler_lock
);
kfree_skb
(
*
skb
);
return
1
;
...
...
net/netfilter/nfnetlink_log.c
View file @
8e33ba49
...
...
@@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid)
goto
out_unlock
;
}
inst
=
k
m
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
inst
=
k
z
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
if
(
!
inst
)
goto
out_unlock
;
memset
(
inst
,
0
,
sizeof
(
*
inst
));
INIT_HLIST_NODE
(
&
inst
->
hlist
);
inst
->
lock
=
SPIN_LOCK_UNLOCKED
;
/* needs to be two, since we _put() after creation */
...
...
@@ -962,10 +961,9 @@ static int nful_open(struct inode *inode, struct file *file)
struct
iter_state
*
is
;
int
ret
;
is
=
k
m
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
is
=
k
z
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
if
(
!
is
)
return
-
ENOMEM
;
memset
(
is
,
0
,
sizeof
(
*
is
));
ret
=
seq_open
(
file
,
&
nful_seq_ops
);
if
(
ret
<
0
)
goto
out_free
;
...
...
net/netfilter/nfnetlink_queue.c
View file @
8e33ba49
...
...
@@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid)
goto
out_unlock
;
}
inst
=
k
m
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
inst
=
k
z
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
if
(
!
inst
)
goto
out_unlock
;
memset
(
inst
,
0
,
sizeof
(
*
inst
));
inst
->
queue_num
=
queue_num
;
inst
->
peer_pid
=
pid
;
inst
->
queue_maxlen
=
NFQNL_QMAX_DEFAULT
;
...
...
@@ -1036,10 +1035,9 @@ static int nfqnl_open(struct inode *inode, struct file *file)
struct
iter_state
*
is
;
int
ret
;
is
=
k
m
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
is
=
k
z
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
if
(
!
is
)
return
-
ENOMEM
;
memset
(
is
,
0
,
sizeof
(
*
is
));
ret
=
seq_open
(
file
,
&
nfqnl_seq_ops
);
if
(
ret
<
0
)
goto
out_free
;
...
...
net/sched/sch_gred.c
View file @
8e33ba49
This diff is collapsed.
Click to expand it.
net/sched/sch_netem.c
View file @
8e33ba49
...
...
@@ -25,6 +25,8 @@
#include <net/pkt_sched.h>
#define VERSION "1.1"
/* Network Emulation Queuing algorithm.
====================================
...
...
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
||
q
->
counter
<
q
->
gap
/* inside last reordering gap */
||
q
->
reorder
<
get_crandom
(
&
q
->
reorder_cor
))
{
psched_time_t
now
;
psched_tdiff_t
delay
;
delay
=
tabledist
(
q
->
latency
,
q
->
jitter
,
&
q
->
delay_cor
,
q
->
delay_dist
);
PSCHED_GET_TIME
(
now
);
PSCHED_TADD2
(
now
,
tabledist
(
q
->
latency
,
q
->
jitter
,
&
q
->
delay_cor
,
q
->
delay_dist
),
cb
->
time_to_send
);
PSCHED_TADD2
(
now
,
delay
,
cb
->
time_to_send
);
++
q
->
counter
;
ret
=
q
->
qdisc
->
enqueue
(
skb
,
q
->
qdisc
);
}
else
{
...
...
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
const
struct
netem_skb_cb
*
cb
=
(
const
struct
netem_skb_cb
*
)
skb
->
cb
;
psched_time_t
now
;
long
delay
;
/* if more time remaining? */
PSCHED_GET_TIME
(
now
);
delay
=
PSCHED_US2JIFFIE
(
PSCHED_TDIFF
(
cb
->
time_to_send
,
now
));
pr_debug
(
"netem_run: skb=%p delay=%ld
\n
"
,
skb
,
delay
);
if
(
delay
<=
0
)
{
if
(
PSCHED_TLESS
(
cb
->
time_to_send
,
now
))
{
pr_debug
(
"netem_dequeue: return skb=%p
\n
"
,
skb
);
sch
->
q
.
qlen
--
;
sch
->
flags
&=
~
TCQ_F_THROTTLED
;
return
skb
;
}
else
{
psched_tdiff_t
delay
=
PSCHED_TDIFF
(
cb
->
time_to_send
,
now
);
if
(
q
->
qdisc
->
ops
->
requeue
(
skb
,
q
->
qdisc
)
!=
NET_XMIT_SUCCESS
)
{
sch
->
qstats
.
drops
++
;
/* After this qlen is confused */
printk
(
KERN_ERR
"netem: queue discpline %s could not requeue
\n
"
,
q
->
qdisc
->
ops
->
id
);
sch
->
q
.
qlen
--
;
}
mod_timer
(
&
q
->
timer
,
jiffies
+
delay
);
mod_timer
(
&
q
->
timer
,
jiffies
+
PSCHED_US2JIFFIE
(
delay
)
);
sch
->
flags
|=
TCQ_F_THROTTLED
;
if
(
q
->
qdisc
->
ops
->
requeue
(
skb
,
q
->
qdisc
)
!=
0
)
sch
->
qstats
.
drops
++
;
}
}
return
NULL
;
...
...
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch)
del_timer_sync
(
&
q
->
timer
);
}
/* Pass size change message down to embedded FIFO */
static
int
set_fifo_limit
(
struct
Qdisc
*
q
,
int
limit
)
{
struct
rtattr
*
rta
;
int
ret
=
-
ENOMEM
;
/* Hack to avoid sending change message to non-FIFO */
if
(
strncmp
(
q
->
ops
->
id
+
1
,
"fifo"
,
4
)
!=
0
)
return
0
;
rta
=
kmalloc
(
RTA_LENGTH
(
sizeof
(
struct
tc_fifo_qopt
)),
GFP_KERNEL
);
if
(
rta
)
{
rta
->
rta_type
=
RTM_NEWQDISC
;
...
...
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
return
0
;
}
/*
* Special-case version of the FIFO queue for use by netem.
* It queues packets in order of the timestamps (time_to_send) stored
* in each skb's control block.
*/
struct
fifo_sched_data
{
u32
limit
;
/* queue length at which tfifo_enqueue() starts dropping packets */
};
/*
 * tfifo_enqueue - insert @nskb into the queue ordered by time_to_send.
 * @nskb: packet to queue; its cb holds a struct netem_skb_cb
 * @sch:  the tfifo qdisc
 *
 * The queue is scanned from the tail towards the head for the last
 * packet that is not due later than @nskb, and @nskb is linked in
 * right behind it.  If no such packet exists the walk ends at the list
 * head and @nskb becomes the first packet.
 *
 * Returns NET_XMIT_SUCCESS, or the result of qdisc_drop() when the
 * queue already holds q->limit packets.
 */
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	const struct netem_skb_cb *ncb
		= (const struct netem_skb_cb *)nskb->cb;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb
				= (const struct netem_skb_cb *)skb->cb;

			/* Stop at the first packet that is due no later
			 * than the new one.  Testing "cb < ncb" instead
			 * would walk past packets with an equal timestamp
			 * and queue the new packet in front of them, i.e.
			 * reorder packets that were given the same delay.
			 * (Assumes PSCHED_TLESS is a strict less-than.)
			 */
			if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += nskb->len;
		sch->bstats.bytes += nskb->len;
		sch->bstats.packets++;

		return NET_XMIT_SUCCESS;
	}

	return qdisc_drop(nskb, sch);
}
/*
 * tfifo_init - set the queue limit (also installed as the change handler).
 * @sch: the tfifo qdisc
 * @opt: optional attribute carrying a struct tc_fifo_qopt
 *
 * Without @opt the limit defaults to the device's tx_queue_len, but at
 * least 1.  Returns 0, or -EINVAL when @opt is too short.
 */
static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt *ctl;

	if (!opt) {
		q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
		return 0;
	}

	ctl = RTA_DATA(opt);
	if (RTA_PAYLOAD(opt) < sizeof(*ctl))
		return -EINVAL;

	q->limit = ctl->limit;
	return 0;
}
static
int
tfifo_dump
(
struct
Qdisc
*
sch
,
struct
sk_buff
*
skb
)
{
struct
fifo_sched_data
*
q
=
qdisc_priv
(
sch
);
struct
tc_fifo_qopt
opt
=
{
.
limit
=
q
->
limit
};
RTA_PUT
(
skb
,
TCA_OPTIONS
,
sizeof
(
opt
),
&
opt
);
return
skb
->
len
;
rtattr_failure:
return
-
1
;
}
/*
 * Operations table for the time-ordered FIFO used inside netem.  Only
 * enqueue, init/change and dump are tfifo specific; the remaining
 * handlers are the generic qdisc queue helpers.
 */
static struct Qdisc_ops tfifo_qdisc_ops = {
	.id		= "tfifo",
	.priv_size	= sizeof(struct fifo_sched_data),

	/* tfifo-specific handlers */
	.enqueue	= tfifo_enqueue,
	.init		= tfifo_init,
	.change		= tfifo_init,
	.dump		= tfifo_dump,

	/* generic queue helpers */
	.dequeue	= qdisc_dequeue_head,
	.requeue	= qdisc_requeue,
	.drop		= qdisc_queue_drop,
	.reset		= qdisc_reset_queue,
};
static
int
netem_init
(
struct
Qdisc
*
sch
,
struct
rtattr
*
opt
)
{
struct
netem_sched_data
*
q
=
qdisc_priv
(
sch
);
...
...
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
q
->
timer
.
function
=
netem_watchdog
;
q
->
timer
.
data
=
(
unsigned
long
)
sch
;
q
->
qdisc
=
qdisc_create_dflt
(
sch
->
dev
,
&
p
fifo_qdisc_ops
);
q
->
qdisc
=
qdisc_create_dflt
(
sch
->
dev
,
&
t
fifo_qdisc_ops
);
if
(
!
q
->
qdisc
)
{
pr_debug
(
"netem: qdisc create failed
\n
"
);
return
-
ENOMEM
;
...
...
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
static
int
__init
netem_module_init
(
void
)
{
pr_info
(
"netem: version "
VERSION
"
\n
"
);
return
register_qdisc
(
&
netem_qdisc_ops
);
}
static
void
__exit
netem_module_exit
(
void
)
...
...
net/sched/sch_red.c
View file @
8e33ba49
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment