Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
8e33ba49
Commit
8e33ba49
authored
Nov 07, 2005
by
Linus Torvalds
Browse files
Options
Browse Files
Download
Plain Diff
Merge master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6
parents
8cde0776
2d43f112
Changes
24
Show whitespace changes
Inline
Side-by-side
Showing
24 changed files
with
1066 additions
and
886 deletions
+1066
-886
include/linux/pkt_sched.h
include/linux/pkt_sched.h
+24
-26
include/linux/skbuff.h
include/linux/skbuff.h
+30
-8
include/net/inet_ecn.h
include/net/inet_ecn.h
+24
-4
include/net/inet_hashtables.h
include/net/inet_hashtables.h
+0
-2
include/net/red.h
include/net/red.h
+325
-0
net/core/stream.c
net/core/stream.c
+6
-6
net/dccp/ipv4.c
net/dccp/ipv4.c
+3
-29
net/ipv4/inet_connection_sock.c
net/ipv4/inet_connection_sock.c
+3
-11
net/ipv4/netfilter/ip_conntrack_helper_pptp.c
net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+0
-4
net/ipv4/netfilter/ip_conntrack_netlink.c
net/ipv4/netfilter/ip_conntrack_netlink.c
+7
-12
net/ipv4/netfilter/ip_nat_core.c
net/ipv4/netfilter/ip_nat_core.c
+2
-4
net/ipv4/netfilter/ip_nat_helper_pptp.c
net/ipv4/netfilter/ip_nat_helper_pptp.c
+2
-0
net/ipv4/netfilter/ip_nat_proto_gre.c
net/ipv4/netfilter/ip_nat_proto_gre.c
+2
-2
net/ipv4/netfilter/ip_nat_proto_unknown.c
net/ipv4/netfilter/ip_nat_proto_unknown.c
+1
-1
net/ipv4/netfilter/ipt_CONNMARK.c
net/ipv4/netfilter/ipt_CONNMARK.c
+1
-0
net/ipv4/tcp.c
net/ipv4/tcp.c
+0
-1
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_ipv4.c
+0
-2
net/ipv6/tcp_ipv6.c
net/ipv6/tcp_ipv6.c
+4
-11
net/netfilter/nf_queue.c
net/netfilter/nf_queue.c
+1
-1
net/netfilter/nfnetlink_log.c
net/netfilter/nfnetlink_log.c
+2
-4
net/netfilter/nfnetlink_queue.c
net/netfilter/nfnetlink_queue.c
+2
-4
net/sched/sch_gred.c
net/sched/sch_gred.c
+411
-430
net/sched/sch_netem.c
net/sched/sch_netem.c
+109
-13
net/sched/sch_red.c
net/sched/sch_red.c
+107
-311
No files found.
include/linux/pkt_sched.h
View file @
8e33ba49
...
...
@@ -93,6 +93,7 @@ struct tc_fifo_qopt
/* PRIO section */
#define TCQ_PRIO_BANDS 16
#define TCQ_MIN_PRIO_BANDS 2
struct
tc_prio_qopt
{
...
...
@@ -169,6 +170,7 @@ struct tc_red_qopt
unsigned
char
Scell_log
;
/* cell size for idle damping */
unsigned
char
flags
;
#define TC_RED_ECN 1
#define TC_RED_HARDDROP 2
};
struct
tc_red_xstats
...
...
@@ -194,15 +196,11 @@ enum
#define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
#define TCA_SET_OFF TCA_GRED_PARMS
struct
tc_gred_qopt
{
__u32
limit
;
/* HARD maximal queue length (bytes)
*/
__u32
qth_min
;
/* Min average length threshold (bytes)
*/
__u32
qth_max
;
/* Max average length threshold (bytes)
*/
__u32
limit
;
/* HARD maximal queue length (bytes) */
__u32
qth_min
;
/* Min average length threshold (bytes) */
__u32
qth_max
;
/* Max average length threshold (bytes) */
__u32
DP
;
/* upto 2^32 DPs */
__u32
backlog
;
__u32
qave
;
...
...
@@ -210,22 +208,22 @@ struct tc_gred_qopt
__u32
early
;
__u32
other
;
__u32
pdrop
;
unsigned
char
Wlog
;
/* log(W) */
unsigned
char
Plog
;
/* log(P_max/(qth_max-qth_min)) */
unsigned
char
Scell_log
;
/* cell size for idle damping */
__u8
Wlog
;
/* log(W) */
__u8
Plog
;
/* log(P_max/(qth_max-qth_min)) */
__u8
Scell_log
;
/* cell size for idle damping */
__u8
prio
;
/* prio of this VQ */
__u32
packets
;
__u32
bytesin
;
};
/* gred setup */
struct
tc_gred_sopt
{
__u32
DPs
;
__u32
def_DP
;
__u8
grio
;
__u8
pad1
;
__u16
pad2
;
__u8
flags
;
__u16
pad1
;
};
/* HTB section */
...
...
include/linux/skbuff.h
View file @
8e33ba49
...
...
@@ -603,29 +603,46 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
*/
/**
* __skb_queue_
head
- queue a buffer at the list head
* __skb_queue_
after
- queue a buffer at the list head
* @list: list to use
* @prev: place after this buffer
* @newsk: buffer to queue
*
* Queue a buffer
at the start
of a list. This function takes no locks
* Queue a buffer
in the middle
of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
extern
void
skb_queue_head
(
struct
sk_buff_head
*
list
,
struct
sk_buff
*
newsk
);
static
inline
void
__skb_queue_head
(
struct
sk_buff_head
*
list
,
static
inline
void
__skb_queue_after
(
struct
sk_buff_head
*
list
,
struct
sk_buff
*
prev
,
struct
sk_buff
*
newsk
)
{
struct
sk_buff
*
prev
,
*
next
;
struct
sk_buff
*
next
;
list
->
qlen
++
;
prev
=
(
struct
sk_buff
*
)
list
;
next
=
prev
->
next
;
newsk
->
next
=
next
;
newsk
->
prev
=
prev
;
next
->
prev
=
prev
->
next
=
newsk
;
}
/**
* __skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
*
* Queue a buffer at the start of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
extern
void
skb_queue_head
(
struct
sk_buff_head
*
list
,
struct
sk_buff
*
newsk
);
/*
 * Unlocked insert at the front of @list: the list head itself, viewed as
 * a struct sk_buff, is handed to __skb_queue_after() as the buffer to
 * insert after.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
				    struct sk_buff *newsk)
{
	struct sk_buff *head = (struct sk_buff *)list;

	__skb_queue_after(list, head, newsk);
}
/**
* __skb_queue_tail - queue a buffer at the list tail
* @list: list to use
...
...
@@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr)
prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \
skb = skb->next)
/*
 * Iterate over @queue backwards: start at (queue)->prev and follow ->prev
 * links until the cursor @skb equals the queue head itself (cast to
 * struct sk_buff *).  skb->prev is prefetched on every iteration before
 * the termination test.
 */
#define skb_queue_reverse_walk(queue, skb) \
for (skb = (queue)->prev; \
prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \
skb = skb->prev)
extern
struct
sk_buff
*
skb_recv_datagram
(
struct
sock
*
sk
,
unsigned
flags
,
int
noblock
,
int
*
err
);
...
...
include/net/inet_ecn.h
View file @
8e33ba49
...
...
@@ -2,6 +2,7 @@
#define _INET_ECN_H_
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <net/dsfield.h>
enum
{
...
...
@@ -48,7 +49,7 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
(label) |= __constant_htons(INET_ECN_ECT_0 << 4); \
} while (0)
static
inline
void
IP_ECN_set_ce
(
struct
iphdr
*
iph
)
static
inline
int
IP_ECN_set_ce
(
struct
iphdr
*
iph
)
{
u32
check
=
iph
->
check
;
u32
ecn
=
(
iph
->
tos
+
1
)
&
INET_ECN_MASK
;
...
...
@@ -61,7 +62,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph)
* INET_ECN_CE => 00
*/
if
(
!
(
ecn
&
2
))
return
;
return
!
ecn
;
/*
* The following gives us:
...
...
@@ -72,6 +73,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph)
iph
->
check
=
check
+
(
check
>=
0xFFFF
);
iph
->
tos
|=
INET_ECN_CE
;
return
1
;
}
static
inline
void
IP_ECN_clear
(
struct
iphdr
*
iph
)
...
...
@@ -87,11 +89,12 @@ static inline void ipv4_copy_dscp(struct iphdr *outer, struct iphdr *inner)
struct
ipv6hdr
;
static
inline
void
IP6_ECN_set_ce
(
struct
ipv6hdr
*
iph
)
static
inline
int
IP6_ECN_set_ce
(
struct
ipv6hdr
*
iph
)
{
if
(
INET_ECN_is_not_ect
(
ipv6_get_dsfield
(
iph
)))
return
;
return
0
;
*
(
u32
*
)
iph
|=
htonl
(
INET_ECN_CE
<<
20
);
return
1
;
}
static
inline
void
IP6_ECN_clear
(
struct
ipv6hdr
*
iph
)
...
...
@@ -105,4 +108,21 @@ static inline void ipv6_copy_dscp(struct ipv6hdr *outer, struct ipv6hdr *inner)
ipv6_change_dsfield
(
inner
,
INET_ECN_MASK
,
dscp
);
}
/*
 * Dispatch on skb->protocol to the IPv4 or IPv6 CE-setting helper.
 *
 * The helper is only called when a complete network header
 * (struct iphdr / struct ipv6hdr) starting at skb->nh.raw ends at or
 * before skb->tail.  Returns the helper's result, or 0 when the protocol
 * is neither ETH_P_IP nor ETH_P_IPV6 or the header does not fit.
 */
static inline int INET_ECN_set_ce(struct sk_buff *skb)
{
	if (skb->protocol == __constant_htons(ETH_P_IP)) {
		if (skb->nh.raw + sizeof(struct iphdr) <= skb->tail)
			return IP_ECN_set_ce(skb->nh.iph);
	} else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
		if (skb->nh.raw + sizeof(struct ipv6hdr) <= skb->tail)
			return IP6_ECN_set_ce(skb->nh.ipv6h);
	}

	return 0;
}
#endif
include/net/inet_hashtables.h
View file @
8e33ba49
...
...
@@ -125,9 +125,7 @@ struct inet_hashinfo {
rwlock_t
lhash_lock
____cacheline_aligned
;
atomic_t
lhash_users
;
wait_queue_head_t
lhash_wait
;
spinlock_t
portalloc_lock
;
kmem_cache_t
*
bind_bucket_cachep
;
int
port_rover
;
};
static
inline
unsigned
int
inet_ehashfn
(
const
__u32
laddr
,
const
__u16
lport
,
...
...
include/net/red.h
0 → 100644
View file @
8e33ba49
#ifndef __NET_SCHED_RED_H
#define __NET_SCHED_RED_H
#include <linux/config.h>
#include <linux/types.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/dsfield.h>
/* Random Early Detection (RED) algorithm.
=======================================
Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
This file codes a "divisionless" version of RED algorithm
as written down in Fig.17 of the paper.
Short description.
------------------
When a new packet arrives we calculate the average queue length:
avg = (1-W)*avg + W*current_queue_len,
W is the filter time constant (chosen as 2^(-Wlog)), it controls
the inertia of the algorithm. To allow larger bursts, W should be
decreased.
if (avg > th_max) -> packet marked (dropped).
if (avg < th_min) -> packet passes.
if (th_min < avg < th_max) we calculate probability:
Pb = max_P * (avg - th_min)/(th_max-th_min)
and mark (drop) packet with this probability.
Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
max_P should be small (not 1), usually 0.01..0.02 is good value.
max_P is chosen so that max_P/(th_max-th_min)
is a negative power of two, so that the arithmetic contains
only shifts.
Parameters, settable by user:
-----------------------------
qth_min - bytes (should be < qth_max/2)
qth_max - bytes (should be at least 2*qth_min and less than limit)
Wlog - bits (<32) log(1/W).
Plog - bits (<32)
Plog is related to max_P by formula:
max_P = (qth_max-qth_min)/2^Plog;
F.e. if qth_max=128K and qth_min=32K, then Plog=22
corresponds to max_P=0.02
Scell_log
Stab
Lookup table for log((1-W)^(t/t_ave)).
NOTES:
Upper bound on W.
-----------------
If you want to allow bursts of L packets of size S,
you should choose W:
L + 1 - th_min/S < (1-(1-W)^L)/W
th_min/S = 32 th_min/S = 4
log(W) L
-1 33
-2 35
-3 39
-4 46
-5 57
-6 75
-7 101
-8 135
-9 190
etc.
*/
#define RED_STAB_SIZE 256
#define RED_STAB_MASK (RED_STAB_SIZE - 1)
/* Counters kept per RED instance. */
struct red_stats {
	u32		prob_drop;	/* Early probability drops */
	u32		prob_mark;	/* Early probability marks */
	u32		forced_drop;	/* Forced drops, qavg > max_thresh */
	u32		forced_mark;	/* Forced marks, qavg > max_thresh */
	u32		pdrop;		/* Drops due to queue limits */
	u32		other;		/* Drops due to drop() calls */
	u32		backlog;
};
/*
 * Configuration and running state of one RED instance.
 * The configuration half is filled in by red_set_parms().
 */
struct red_parms {
	/* Parameters */
	u32		qth_min;	/* Min avg length threshold, stored
					   shifted left by Wlog */
	u32		qth_max;	/* Max avg length threshold, stored
					   shifted left by Wlog */
	u32		Scell_max;
	u32		Rmask;		/* Cached random mask, see red_rmask */
	u8		Scell_log;
	u8		Wlog;		/* log(W) */
	u8		Plog;		/* random number bits */
	u8		Stab[RED_STAB_SIZE];

	/* Variables */
	int		qcount;		/* Number of packets since last random
					   number generation */
	u32		qR;		/* Cached random number */

	unsigned long	qavg;		/* Average queue length, fixed point
					   with the point at Wlog */
	psched_time_t	qidlestart;	/* Start of current idle period */
};
/*
 * red_rmask - build the mask applied to random numbers for a given Plog
 * @Plog: number of low-order bits to keep
 *
 * Returns a value with the low Plog bits set, or with all 32 bits set
 * when Plog is 32 or larger.
 *
 * The shift is performed on an unsigned constant so that Plog == 31 does
 * not overflow a signed int, and the saturated value is spelled as a
 * 32-bit constant so it matches the u32 return type without truncation.
 */
static inline u32 red_rmask(u8 Plog)
{
	return Plog < 32 ? ((1U << Plog) - 1) : ~0U;
}
/*
 * red_set_parms - load a new configuration into a RED parameter block
 * @p:		parameter block to (re)initialise
 * @qth_min:	minimum threshold; stored shifted left by @Wlog
 * @qth_max:	maximum threshold; stored shifted left by @Wlog
 * @Wlog:	stored as p->Wlog and used to scale both thresholds
 * @Plog:	stored as p->Plog; p->Rmask is derived from it via red_rmask()
 * @Scell_log:	stored as p->Scell_log; p->Scell_max is 255 << Scell_log
 * @stab:	source table; sizeof(p->Stab) bytes are copied into p->Stab
 *
 * Also resets the running average (qavg = 0) and the packet counter
 * (qcount = -1).
 *
 * NOTE(review): @Wlog and @Scell_log are used as shift counts without a
 * range check here; presumably callers validate them - confirm.
 */
static inline void red_set_parms(struct red_parms *p,
				 u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog,
				 u8 Scell_log, u8 *stab)
{
	/* Reset average queue length, the value is strictly bound
	 * to the parameters below, resetting hurts a bit but leaving
	 * it might result in an unreasonable qavg for a while. --TGR
	 */
	p->qavg		= 0;

	p->qcount	= -1;
	p->qth_min	= qth_min << Wlog;
	p->qth_max	= qth_max << Wlog;
	p->Wlog		= Wlog;
	p->Plog		= Plog;
	p->Rmask	= red_rmask(Plog);
	p->Scell_log	= Scell_log;
	/* Unsigned constant: a signed 255 overflows int once Scell_log >= 24. */
	p->Scell_max	= (255U << Scell_log);

	memcpy(p->Stab, stab, sizeof(p->Stab));
}
/*
 * Returns 1 while p->qidlestart is not in the "past perfect" state,
 * 0 otherwise.
 */
static inline int red_is_idling(struct red_parms *p)
{
	if (PSCHED_IS_PASTPERFECT(p->qidlestart))
		return 0;

	return 1;
}
/* Record the current scheduler time in p->qidlestart. */
static inline void red_start_of_idle_period(struct red_parms *p)
{
	PSCHED_GET_TIME(p->qidlestart);
}
/* Put p->qidlestart back into the "past perfect" state. */
static inline void red_end_of_idle_period(struct red_parms *p)
{
	PSCHED_SET_PASTPERFECT(p->qidlestart);
}
/*
 * Bring the running state back to its initial values: zero average,
 * qcount of -1, and no idle period in progress.
 */
static inline void red_restart(struct red_parms *p)
{
	p->qavg = 0;
	p->qcount = -1;
	red_end_of_idle_period(p);
}
/*
 * Compute the decayed average queue length after an idle period.
 *
 * The idle time is taken as the difference between the current time and
 * p->qidlestart, obtained through PSCHED_TDIFF_SAFE() with p->Scell_max
 * as its third argument.  p->qavg itself is not modified; the new value
 * is returned.
 */
static inline unsigned long red_calc_qavg_from_idle_time(struct red_parms *p)
{
	psched_time_t stamp;
	long idle;
	int decay_shift;

	PSCHED_GET_TIME(stamp);
	idle = PSCHED_TDIFF_SAFE(stamp, p->qidlestart, p->Scell_max);

	/*
	 * The problem: ideally, average queue length recalculation should
	 * be done over constant clock intervals. This is too expensive, so
	 * the calculation is driven by outgoing packets.
	 * When the queue is idle we have to model this clock by hand.
	 *
	 * SF+VJ proposed to "generate":
	 *
	 *	m = idletime / (average_pkt_size / bandwidth)
	 *
	 * dummy packets as a burst after idle time, i.e.
	 *
	 *	p->qavg *= (1-W)^m
	 *
	 * This is an apparently overcomplicated solution (f.e. we have to
	 * precompute a table to make this calculation in reasonable time).
	 * A simpler model may be usable here, but that is a field for
	 * experiments.
	 */
	decay_shift = p->Stab[(idle >> p->Scell_log) & RED_STAB_MASK];
	if (decay_shift != 0)
		return p->qavg >> decay_shift;

	/*
	 * Table entry is zero: approximate the initial part of the
	 * exponent with a linear function,
	 *
	 *	(1-W)^m ~= 1-mW + ...
	 *
	 * which copes with the exponent table being too coarse here.
	 * The result never drops below half of the current average.
	 */
	idle = (p->qavg * idle) >> p->Scell_log;
	if (idle >= (p->qavg >> 1))
		return p->qavg >> 1;

	return p->qavg - idle;
}
/*
 * Fold the current backlog into the running average.
 *
 * NOTE: p->qavg is a fixed point number with the point at Wlog.
 * The expression below is equivalent to the floating point version:
 *
 *	qavg = qavg*(1-W) + backlog*W;
 *
 * --ANK (980924)
 */
static inline unsigned long red_calc_qavg_no_idle_time(struct red_parms *p,
							unsigned int backlog)
{
	unsigned long weighted_avg = p->qavg >> p->Wlog;

	return p->qavg + (backlog - weighted_avg);
}
/*
 * Return the new average queue length: derived from the idle time when
 * an idle period is in progress, from @backlog otherwise.
 */
static inline unsigned long red_calc_qavg(struct red_parms *p,
					  unsigned int backlog)
{
	if (red_is_idling(p))
		return red_calc_qavg_from_idle_time(p);

	return red_calc_qavg_no_idle_time(p, backlog);
}
/* Draw a value from net_random() and keep only the bits set in p->Rmask. */
static inline u32 red_random(struct red_parms *p)
{
	return p->Rmask & net_random();
}
/*
 * Decide whether the current packet is hit by the probabilistic mark.
 *
 * Returns 1 when ((qavg - qth_min) >> Wlog) * qcount is not below the
 * cached random number p->qR, 0 otherwise.
 *
 * Rationale as given by ANK (980924): qR is a random number in the
 * interval 0..Rmask.  Taking into account that qavg has its fixed point
 * at Wlog, and that Plog is related to max_P by
 *
 *	max_P = (qth_max-qth_min)/2^Plog
 *
 * the integer test corresponds to the floating point comparison
 *
 *	max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount
 */
static inline int red_mark_probability(struct red_parms *p, unsigned long qavg)
{
	unsigned long excess = (qavg - p->qth_min) >> p->Wlog;
	unsigned long weighted = excess * p->qcount;

	return weighted >= p->qR;
}
/* Result of red_cmp_thresh(): where qavg lies relative to the thresholds. */
enum {
	RED_BELOW_MIN_THRESH	= 0,	/* qavg < qth_min */
	RED_BETWEEN_TRESH	= 1,	/* qth_min <= qavg < qth_max */
	RED_ABOVE_MAX_TRESH	= 2,	/* qavg >= qth_max */
};
/*
 * Classify @qavg against the configured thresholds.
 *
 * Returns RED_BELOW_MIN_THRESH when qavg < qth_min,
 * RED_ABOVE_MAX_TRESH when qavg >= qth_max, and RED_BETWEEN_TRESH in
 * every other case.  The lower threshold is tested first.
 */
static inline int red_cmp_thresh(struct red_parms *p, unsigned long qavg)
{
	if (qavg < p->qth_min)
		return RED_BELOW_MIN_THRESH;

	if (qavg >= p->qth_max)
		return RED_ABOVE_MAX_TRESH;

	return RED_BETWEEN_TRESH;
}
/* Verdict returned by red_action(). */
enum {
	RED_DONT_MARK	= 0,
	RED_PROB_MARK	= 1,
	RED_HARD_MARK	= 2,
};
/*
 * Decide what to do with a packet given the average queue length @qavg.
 *
 *  - below qth_min:  reset qcount to -1, RED_DONT_MARK
 *  - at/above qth_max: reset qcount to -1, RED_HARD_MARK
 *  - in between: bump qcount; if it became zero just draw a fresh
 *    random number, otherwise ask red_mark_probability() and, on a hit,
 *    zero qcount, draw a fresh random number and return RED_PROB_MARK.
 *
 * Any other classification from red_cmp_thresh() triggers BUG().
 */
static inline int red_action(struct red_parms *p, unsigned long qavg)
{
	int zone = red_cmp_thresh(p, qavg);

	if (zone == RED_BELOW_MIN_THRESH) {
		p->qcount = -1;
		return RED_DONT_MARK;
	}

	if (zone == RED_ABOVE_MAX_TRESH) {
		p->qcount = -1;
		return RED_HARD_MARK;
	}

	if (zone != RED_BETWEEN_TRESH) {
		BUG();
		return RED_DONT_MARK;
	}

	/* First packet since the counter was reset: only seed qR. */
	if (++p->qcount == 0) {
		p->qR = red_random(p);
		return RED_DONT_MARK;
	}

	if (!red_mark_probability(p, qavg))
		return RED_DONT_MARK;

	p->qcount = 0;
	p->qR = red_random(p);
	return RED_PROB_MARK;
}
#endif
net/core/stream.c
View file @
8e33ba49
...
...
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
struct
task_struct
*
tsk
=
current
;
DEFINE_WAIT
(
wait
);
int
done
;
while
(
1
)
{
do
{
if
(
sk
->
sk_err
)
return
sock_error
(
sk
);
if
((
1
<<
sk
->
sk_state
)
&
~
(
TCPF_SYN_SENT
|
TCPF_SYN_RECV
))
...
...
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
prepare_to_wait
(
sk
->
sk_sleep
,
&
wait
,
TASK_INTERRUPTIBLE
);
sk
->
sk_write_pending
++
;
if
(
sk_wait_event
(
sk
,
timeo_p
,
done
=
sk_wait_event
(
sk
,
timeo_p
,
!
((
1
<<
sk
->
sk_state
)
&
~
(
TCPF_ESTABLISHED
|
TCPF_CLOSE_WAIT
))))
break
;
~
(
TCPF_ESTABLISHED
|
TCPF_CLOSE_WAIT
)));
finish_wait
(
sk
->
sk_sleep
,
&
wait
);
sk
->
sk_write_pending
--
;
}
}
while
(
!
done
);
return
0
;
}
...
...
net/dccp/ipv4.c
View file @
8e33ba49
...
...
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
.
lhash_lock
=
RW_LOCK_UNLOCKED
,
.
lhash_users
=
ATOMIC_INIT
(
0
),
.
lhash_wait
=
__WAIT_QUEUE_HEAD_INITIALIZER
(
dccp_hashinfo
.
lhash_wait
),
.
portalloc_lock
=
SPIN_LOCK_UNLOCKED
,
.
port_rover
=
1024
-
1
,
};
EXPORT_SYMBOL_GPL
(
dccp_hashinfo
);
...
...
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
int
ret
;
if
(
snum
==
0
)
{
int
rover
;
int
low
=
sysctl_local_port_range
[
0
];
int
high
=
sysctl_local_port_range
[
1
];
int
remaining
=
(
high
-
low
)
+
1
;
int
rover
=
net_random
()
%
(
high
-
low
)
+
low
;
struct
hlist_node
*
node
;
struct
inet_timewait_sock
*
tw
=
NULL
;
local_bh_disable
();
/* TODO. Actually it is not so bad idea to remove
* dccp_hashinfo.portalloc_lock before next submission to
* Linus.
* As soon as we touch this place at all it is time to think.
*
* Now it protects single _advisory_ variable
* dccp_hashinfo.port_rover, hence it is mostly useless.
* Code will work nicely if we just delete it, but
* I am afraid in contented case it will work not better or
* even worse: another cpu just will hit the same bucket
* and spin there.
* So some cpu salt could remove both contention and
* memory pingpong. Any ideas how to do this in a nice way?
*/
spin_lock
(
&
dccp_hashinfo
.
portalloc_lock
);
rover
=
dccp_hashinfo
.
port_rover
;
do
{
rover
++
;
if
((
rover
<
low
)
||
(
rover
>
high
))
rover
=
low
;
head
=
&
dccp_hashinfo
.
bhash
[
inet_bhashfn
(
rover
,
dccp_hashinfo
.
bhash_size
)];
spin_lock
(
&
head
->
lock
);
...
...
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
next_port:
spin_unlock
(
&
head
->
lock
);
if
(
++
rover
>
high
)
rover
=
low
;
}
while
(
--
remaining
>
0
);
dccp_hashinfo
.
port_rover
=
rover
;
spin_unlock
(
&
dccp_hashinfo
.
portalloc_lock
);
local_bh_enable
();
...
...
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
ok:
/* All locks still held and bhs disabled */
dccp_hashinfo
.
port_rover
=
rover
;
spin_unlock
(
&
dccp_hashinfo
.
portalloc_lock
);
inet_bind_hash
(
sk
,
tb
,
rover
);
if
(
sk_unhashed
(
sk
))
{
inet_sk
(
sk
)
->
sport
=
htons
(
rover
);
...
...
net/ipv4/inet_connection_sock.c
View file @
8e33ba49
...
...
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
int
low
=
sysctl_local_port_range
[
0
];
int
high
=
sysctl_local_port_range
[
1
];
int
remaining
=
(
high
-
low
)
+
1
;
int
rover
;
int
rover
=
net_random
()
%
(
high
-
low
)
+
low
;
spin_lock
(
&
hashinfo
->
portalloc_lock
);
if
(
hashinfo
->
port_rover
<
low
)
rover
=
low
;
else
rover
=
hashinfo
->
port_rover
;
do
{
rover
++
;
if
(
rover
>
high
)
rover
=
low
;
head
=
&
hashinfo
->
bhash
[
inet_bhashfn
(
rover
,
hashinfo
->
bhash_size
)];
spin_lock
(
&
head
->
lock
);
inet_bind_bucket_for_each
(
tb
,
node
,
&
head
->
chain
)
...
...
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
break
;
next:
spin_unlock
(
&
head
->
lock
);
if
(
++
rover
>
high
)
rover
=
low
;
}
while
(
--
remaining
>
0
);
hashinfo
->
port_rover
=
rover
;
spin_unlock
(
&
hashinfo
->
portalloc_lock
);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
...
...
net/ipv4/netfilter/ip_conntrack_helper_pptp.c
View file @
8e33ba49
...
...
@@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master,
exp_orig
->
expectfn
=
pptp_expectfn
;
exp_orig
->
flags
=
0
;
exp_orig
->
dir
=
IP_CT_DIR_ORIGINAL
;
/* both expectations are identical apart from tuple */
memcpy
(
exp_reply
,
exp_orig
,
sizeof
(
*
exp_reply
));
memcpy
(
&
exp_reply
->
tuple
,
&
exp_tuples
[
1
],
sizeof
(
exp_reply
->
tuple
));
exp_reply
->
dir
=
!
exp_orig
->
dir
;
if
(
ip_nat_pptp_hook_exp_gre
)
ret
=
ip_nat_pptp_hook_exp_gre
(
exp_orig
,
exp_reply
);
else
{
...
...
net/ipv4/netfilter/ip_conntrack_netlink.c
View file @
8e33ba49
...
...
@@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
IPCTNL_MSG_CT_NEW
,
1
,
ct
);
ip_conntrack_put
(
ct
);
if
(
err
<=
0
)
goto
out
;
goto
free
;
err
=
netlink_unicast
(
ctnl
,
skb2
,
NETLINK_CB
(
skb
).
pid
,
MSG_DONTWAIT
);
if
(
err
<
0
)
...
...
@@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP
(
"leaving
\n
"
);
return
0
;
out:
if
(
skb2
)
free:
kfree_skb
(
skb2
);
out:
return
-
1
;
}
...
...
@@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
nlh
->
nlmsg_seq
,
IPCTNL_MSG_EXP_NEW
,
1
,
exp
);
if
(
err
<=
0
)
goto
out
;
goto
free
;
ip_conntrack_expect_put
(
exp
);
err
=
netlink_unicast
(
ctnl
,
skb2
,
NETLINK_CB
(
skb
).
pid
,
MSG_DONTWAIT
);
if
(
err
<
0
)
goto
free
;
return
err
;
return
netlink_unicast
(
ctnl
,
skb2
,
NETLINK_CB
(
skb
).
pid
,
MSG_DONTWAIT
);
out:
ip_conntrack_expect_put
(
exp
);
free:
if
(
skb2
)
kfree_skb
(
skb2
);
out:
ip_conntrack_expect_put
(
exp
);
return
err
;
}
...
...
net/ipv4/netfilter/ip_nat_core.c
View file @
8e33ba49
...
...
@@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum)
* removed until we've grabbed the reference */
preempt_disable
();
p
=
__ip_nat_proto_find
(
protonum
);
if
(
p
)
{
if
(
!
try_module_get
(
p
->
me
))
p
=
&
ip_nat_unknown_protocol
;
}
preempt_enable
();
return
p
;
...
...
net/ipv4/netfilter/ip_nat_helper_pptp.c
View file @
8e33ba49
...
...
@@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
expect_orig
->
saved_proto
.
gre
.
key
=
htons
(
nat_pptp_info
->
pac_call_id
);
expect_orig
->
tuple
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pns_call_id
);
expect_orig
->
tuple
.
dst
.
u
.
gre
.
key
=
htons
(
ct_pptp_info
->
pac_call_id
);
expect_orig
->
dir
=
IP_CT_DIR_ORIGINAL
;
inv_t
.
src
.
ip
=
reply_t
->
src
.
ip
;
inv_t
.
dst
.
ip
=
reply_t
->
dst
.
ip
;
inv_t
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pac_call_id
);
...
...
@@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
expect_reply
->
saved_proto
.
gre
.
key
=
htons
(
nat_pptp_info
->
pns_call_id
);
expect_reply
->
tuple
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pac_call_id
);
expect_reply
->
tuple
.
dst
.
u
.
gre
.
key
=
htons
(
ct_pptp_info
->
pns_call_id
);
expect_reply
->
dir
=
IP_CT_DIR_REPLY
;
inv_t
.
src
.
ip
=
orig_t
->
src
.
ip
;
inv_t
.
dst
.
ip
=
orig_t
->
dst
.
ip
;
inv_t
.
src
.
u
.
gre
.
key
=
htons
(
nat_pptp_info
->
pns_call_id
);
...
...
net/ipv4/netfilter/ip_nat_proto_gre.c
View file @
8e33ba49
...
...
@@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb,
break
;
case
GRE_VERSION_PPTP
:
DEBUGP
(
"call_id -> 0x%04x
\n
"
,
ntoh
l
(
tuple
->
dst
.
u
.
gre
.
key
));
pgreh
->
call_id
=
htons
(
ntohl
(
tuple
->
dst
.
u
.
gre
.
key
))
;
ntoh
s
(
tuple
->
dst
.
u
.
gre
.
key
));
pgreh
->
call_id
=
tuple
->
dst
.
u
.
gre
.
key
;
break
;
default:
DEBUGP
(
"can't nat unknown GRE version
\n
"
);
...
...
net/ipv4/netfilter/ip_nat_proto_unknown.c
View file @
8e33ba49
...
...
@@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
struct
ip_nat_protocol
ip_nat_unknown_protocol
=
{
.
name
=
"unknown"
,
.
me
=
THIS_MODULE
,
/* .me isn't set: getting a ref to this cannot fail. */
.
manip_pkt
=
unknown_manip_pkt
,
.
in_range
=
unknown_in_range
,
.
unique_tuple
=
unknown_unique_tuple
,
...
...
net/ipv4/netfilter/ipt_CONNMARK.c
View file @
8e33ba49
...
...
@@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = {
/*
 * Module entry point: calls need_ip_conntrack() and then registers
 * ipt_connmark_reg through ipt_register_target(), returning that
 * function's result.
 */
static int __init init(void)
{
	int rc;

	need_ip_conntrack();
	rc = ipt_register_target(&ipt_connmark_reg);

	return rc;
}
...
...
net/ipv4/tcp.c
View file @
8e33ba49
...
...
@@ -2112,7 +2112,6 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans
>>=
(
3
-
order
);
sysctl_max_syn_backlog
=
128
;
}
tcp_hashinfo
.
port_rover
=
sysctl_local_port_range
[
0
]
-
1
;
sysctl_tcp_mem
[
0
]
=
768
<<
order
;
sysctl_tcp_mem
[
1
]
=
1024
<<
order
;
...
...
net/ipv4/tcp_ipv4.c
View file @
8e33ba49
...
...
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.
lhash_lock
=
RW_LOCK_UNLOCKED
,
.
lhash_users
=
ATOMIC_INIT
(
0
),
.
lhash_wait
=
__WAIT_QUEUE_HEAD_INITIALIZER
(
tcp_hashinfo
.
lhash_wait
),
.
portalloc_lock
=
SPIN_LOCK_UNLOCKED
,
.
port_rover
=
1024
-
1
,
};
static
int
tcp_v4_get_port
(
struct
sock
*
sk
,
unsigned
short
snum
)
...
...
net/ipv6/tcp_ipv6.c
View file @
8e33ba49
...
...
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
int
low
=
sysctl_local_port_range
[
0
];
int
high
=
sysctl_local_port_range
[
1
];
int
remaining
=
(
high
-
low
)
+
1
;
int
rover
;
int
rover
=
net_random
()
%
(
high
-
low
)
+
low
;
spin_lock
(
&
tcp_hashinfo
.
portalloc_lock
);
if
(
tcp_hashinfo
.
port_rover
<
low
)
rover
=
low
;
else
rover
=
tcp_hashinfo
.
port_rover
;
do
{
rover
++
;
if
(
rover
>
high
)
rover
=
low
;
do
{
head
=
&
tcp_hashinfo
.
bhash
[
inet_bhashfn
(
rover
,
tcp_hashinfo
.
bhash_size
)];
spin_lock
(
&
head
->
lock
);
inet_bind_bucket_for_each
(
tb
,
node
,
&
head
->
chain
)
...
...
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
break
;
next:
spin_unlock
(
&
head
->
lock
);
if
(
++
rover
>
high
)
rover
=
low
;
}
while
(
--
remaining
>
0
);
tcp_hashinfo
.
port_rover
=
rover
;
spin_unlock
(
&
tcp_hashinfo
.
portalloc_lock
);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
...
...
net/netfilter/nf_queue.c
View file @
8e33ba49
...
...
@@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb,
/* QUEUE == DROP if noone is waiting, to be safe. */
read_lock
(
&
queue_handler_lock
);
if
(
!
queue_handler
[
pf
]
->
outfn
)
{
if
(
!
queue_handler
[
pf
]
||
!
queue_handler
[
pf
]
->
outfn
)
{
read_unlock
(
&
queue_handler_lock
);
kfree_skb
(
*
skb
);
return
1
;
...
...
net/netfilter/nfnetlink_log.c
View file @
8e33ba49
...
...
@@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid)
goto
out_unlock
;
}
inst
=
k
m
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
inst
=
k
z
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
if
(
!
inst
)
goto
out_unlock
;
memset
(
inst
,
0
,
sizeof
(
*
inst
));
INIT_HLIST_NODE
(
&
inst
->
hlist
);
inst
->
lock
=
SPIN_LOCK_UNLOCKED
;
/* needs to be two, since we _put() after creation */
...
...
@@ -962,10 +961,9 @@ static int nful_open(struct inode *inode, struct file *file)
struct
iter_state
*
is
;
int
ret
;
is
=
k
m
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
is
=
k
z
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
if
(
!
is
)
return
-
ENOMEM
;
memset
(
is
,
0
,
sizeof
(
*
is
));
ret
=
seq_open
(
file
,
&
nful_seq_ops
);
if
(
ret
<
0
)
goto
out_free
;
...
...
net/netfilter/nfnetlink_queue.c
View file @
8e33ba49
...
...
@@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid)
goto
out_unlock
;
}
inst
=
k
m
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
inst
=
k
z
alloc
(
sizeof
(
*
inst
),
GFP_ATOMIC
);
if
(
!
inst
)
goto
out_unlock
;
memset
(
inst
,
0
,
sizeof
(
*
inst
));
inst
->
queue_num
=
queue_num
;
inst
->
peer_pid
=
pid
;
inst
->
queue_maxlen
=
NFQNL_QMAX_DEFAULT
;
...
...
@@ -1036,10 +1035,9 @@ static int nfqnl_open(struct inode *inode, struct file *file)
struct
iter_state
*
is
;
int
ret
;
is
=
k
m
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
is
=
k
z
alloc
(
sizeof
(
*
is
),
GFP_KERNEL
);
if
(
!
is
)
return
-
ENOMEM
;
memset
(
is
,
0
,
sizeof
(
*
is
));
ret
=
seq_open
(
file
,
&
nfqnl_seq_ops
);
if
(
ret
<
0
)
goto
out_free
;
...
...
net/sched/sch_gred.c
View file @
8e33ba49
...
...
@@ -15,247 +15,281 @@
* from Ren Liu
* - More error checks
*
*
*
* For all the glorious comments look at Alexey's sch_red.c
* For all the glorious comments look at include/net/red.h
*/
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/red.h>
#if 1
/* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif
#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif
#define GRED_DEF_PRIO (MAX_DPs / 2)
#define GRED_VQ_MASK (MAX_DPs - 1)
struct
gred_sched_data
;
struct
gred_sched
;
struct
gred_sched_data
{
/* Parameters */
u32
limit
;
/* HARD maximal queue length */
u32
qth_min
;
/* Min average length threshold: A scaled */
u32
qth_max
;
/* Max average length threshold: A scaled */
u32
DP
;
/* the drop parameters */
char
Wlog
;
/* log(W) */
char
Plog
;
/* random number bits */
u32
Scell_max
;
u32
Rmask
;
u32
bytesin
;
/* bytes seen on virtualQ so far*/
u32
packetsin
;
/* packets seen on virtualQ so far*/
u32
backlog
;
/* bytes on the virtualQ */
u32
forced
;
/* packets dropped for exceeding limits */
u32
early
;
/* packets dropped as a warning */
u32
other
;
/* packets dropped by invoking drop() */
u32
pdrop
;
/* packets dropped because we exceeded physical queue limits */
char
Scell_log
;
u8
Stab
[
256
];
u8
prio
;
/* the prio of this vq */
/* Variables */
unsigned
long
qave
;
/* Average queue length: A scaled */
int
qcount
;
/* Packets since last random number generation */
u32
qR
;
/* Cached random number */
struct
red_parms
parms
;
struct
red_stats
stats
;
};
psched_time_t
qidlestart
;
/* Start of idle period */
/*
 * Bit numbers used with test_bit()/__set_bit()/__clear_bit() on
 * gred_sched.flags to record the operating mode of the qdisc.
 */
enum {
	GRED_WRED_MODE = 1,	/* bit 1 of gred_sched.flags */
	GRED_RIO_MODE  = 2,	/* bit 2 of gred_sched.flags */
};
struct
gred_sched
{
struct
gred_sched_data
*
tab
[
MAX_DPs
];
unsigned
long
flags
;
u32
red_flags
;
u32
DPs
;
u32
def
;
u8
initd
;
u8
grio
;
u8
eqp
;
struct
red_parms
wred_set
;
};
static
int
gred_enqueue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
static
inline
int
gred_wred_mode
(
struct
gred_sched
*
table
)
{
psched_time_t
now
;
struct
gred_sched_data
*
q
=
NULL
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
unsigned
long
qave
=
0
;
int
i
=
0
;
return
test_bit
(
GRED_WRED_MODE
,
&
table
->
flags
);
}
/* Set the GRED_WRED_MODE bit in the table's mode flags. */
static inline void gred_enable_wred_mode(struct gred_sched *table)
{
	unsigned long *mode_bits = &table->flags;

	__set_bit(GRED_WRED_MODE, mode_bits);
}
/* Clear the GRED_WRED_MODE bit in the table's mode flags. */
static inline void gred_disable_wred_mode(struct gred_sched *table)
{
	unsigned long *mode_bits = &table->flags;

	__clear_bit(GRED_WRED_MODE, mode_bits);
}
if
(
!
t
->
initd
&&
skb_queue_len
(
&
sch
->
q
)
<
(
sch
->
dev
->
tx_queue_len
?
:
1
))
{
D2PRINTK
(
"NO GRED Queues setup yet! Enqueued anyway
\n
"
);
goto
do_enqueue
;
/*
 * Query the GRED_RIO_MODE bit of the table's mode flags.
 * Returns the result of test_bit(): non-zero when the bit is set.
 */
static inline int gred_rio_mode(struct gred_sched *table)
{
	unsigned long *mode_bits = &table->flags;

	return test_bit(GRED_RIO_MODE, mode_bits);
}
/* Set the GRED_RIO_MODE bit in the table's mode flags. */
static inline void gred_enable_rio_mode(struct gred_sched *table)
{
	unsigned long *mode_bits = &table->flags;

	__set_bit(GRED_RIO_MODE, mode_bits);
}
/* Clear the GRED_RIO_MODE bit in the table's mode flags. */
static inline void gred_disable_rio_mode(struct gred_sched *table)
{
	unsigned long *mode_bits = &table->flags;

	__clear_bit(GRED_RIO_MODE, mode_bits);
}
/*
 * Report whether two distinct configured virtual queues share the same
 * priority.
 *
 * Only the first table->DPs slots of table->tab[] are examined; empty
 * slots are ignored.
 *
 * Returns 1 if at least one such pair exists, 0 otherwise.
 */
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int outer;

	/* Pairwise scan, O(n^2); not expected to run frequently. */
	for (outer = 0; outer < table->DPs; outer++) {
		struct gred_sched_data *vq = table->tab[outer];
		int inner;

		if (vq == NULL)
			continue;

		for (inner = 0; inner < table->DPs; inner++) {
			struct gred_sched_data *other = table->tab[inner];

			/* Skip empty slots and the queue itself. */
			if (!other || other == vq)
				continue;

			if (other->prio == vq->prio)
				return 1;
		}
	}

	return 0;
}
/*
 * Select the backlog figure for virtual queue @q: the qdisc-wide
 * sch->qstats.backlog when the table is in WRED mode, otherwise the
 * virtual queue's own q->backlog.
 */
static inline unsigned int gred_backlog(struct gred_sched *table,
					struct gred_sched_data *q,
					struct Qdisc *sch)
{
	return gred_wred_mode(table) ? sch->qstats.backlog : q->backlog;
}
/*
 * Derive the virtual queue index from a packet by masking its tc_index
 * with GRED_VQ_MASK.
 */
static inline u16 tc_index_to_dp(struct sk_buff *skb)
{
	u16 dp = skb->tc_index & GRED_VQ_MASK;

	return dp;
}
static
inline
void
gred_load_wred_set
(
struct
gred_sched
*
table
,
struct
gred_sched_data
*
q
)
{
q
->
parms
.
qavg
=
table
->
wred_set
.
qavg
;
q
->
parms
.
qidlestart
=
table
->
wred_set
.
qidlestart
;
}
static
inline
void
gred_store_wred_set
(
struct
gred_sched
*
table
,
struct
gred_sched_data
*
q
)
{
table
->
wred_set
.
qavg
=
q
->
parms
.
qavg
;
}
static
inline
int
gred_use_ecn
(
struct
gred_sched
*
t
)
{
return
t
->
red_flags
&
TC_RED_ECN
;
}
static
inline
int
gred_use_harddrop
(
struct
gred_sched
*
t
)
{
return
t
->
red_flags
&
TC_RED_HARDDROP
;
}
static
int
gred_enqueue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
{
struct
gred_sched_data
*
q
=
NULL
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
unsigned
long
qavg
=
0
;
u16
dp
=
tc_index_to_dp
(
skb
);
if
(
dp
>=
t
->
DPs
||
(
q
=
t
->
tab
[
dp
])
==
NULL
)
{
dp
=
t
->
def
;
if
(
((
skb
->
tc_index
&
0xf
)
>
(
t
->
DPs
-
1
))
||
!
(
q
=
t
->
tab
[
skb
->
tc_index
&
0xf
]))
{
printk
(
"GRED: setting to default (%d)
\n
"
,
t
->
def
);
if
(
!
(
q
=
t
->
tab
[
t
->
def
]))
{
DPRINTK
(
"GRED: setting to default FAILED! dropping!! "
"(%d)
\n
"
,
t
->
def
);
if
((
q
=
t
->
tab
[
dp
])
==
NULL
)
{
/* Pass through packets not assigned to a DP
* if no default DP has been configured. This
* allows for DP flows to be left untouched.
*/
if
(
skb_queue_len
(
&
sch
->
q
)
<
sch
->
dev
->
tx_queue_len
)
return
qdisc_enqueue_tail
(
skb
,
sch
);
else
goto
drop
;
}
/* fix tc_index? --could be controversial but needed for
requeueing */
skb
->
tc_index
=
(
skb
->
tc_index
&
0xfffffff0
)
|
t
->
def
;
skb
->
tc_index
=
(
skb
->
tc_index
&
~
GRED_VQ_MASK
)
|
dp
;
}
D2PRINTK
(
"gred_enqueue virtualQ 0x%x classid %x backlog %d "
"general backlog %d
\n
"
,
skb
->
tc_index
&
0xf
,
sch
->
handle
,
q
->
backlog
,
sch
->
qstats
.
backlog
);
/* sum up all the qaves of prios <= to ours to get the new qave*/
if
(
!
t
->
eqp
&&
t
->
grio
)
{
for
(
i
=
0
;
i
<
t
->
DPs
;
i
++
)
{
if
((
!
t
->
tab
[
i
])
||
(
i
==
q
->
DP
))
continue
;
/* sum up all the qaves of prios <= to ours to get the new qave */
if
(
!
gred_wred_mode
(
t
)
&&
gred_rio_mode
(
t
))
{
int
i
;
if
((
t
->
tab
[
i
]
->
prio
<
q
->
prio
)
&&
(
PSCHED_IS_PASTPERFECT
(
t
->
tab
[
i
]
->
qidlestart
)))
qave
+=
t
->
tab
[
i
]
->
qave
;
for
(
i
=
0
;
i
<
t
->
DPs
;
i
++
)
{
if
(
t
->
tab
[
i
]
&&
t
->
tab
[
i
]
->
prio
<
q
->
prio
&&
!
red_is_idling
(
&
t
->
tab
[
i
]
->
parms
))
qavg
+=
t
->
tab
[
i
]
->
parms
.
qavg
;
}
}
q
->
packetsin
++
;
q
->
bytesin
+=
skb
->
len
;
q
->
bytesin
+=
skb
->
len
;
if
(
t
->
eqp
&&
t
->
grio
)
{
qave
=
0
;
q
->
qave
=
t
->
tab
[
t
->
def
]
->
qave
;
q
->
qidlestart
=
t
->
tab
[
t
->
def
]
->
qidlestart
;
}
if
(
gred_wred_mode
(
t
))
gred_load_wred_set
(
t
,
q
);
if
(
!
PSCHED_IS_PASTPERFECT
(
q
->
qidlestart
))
{
long
us_idle
;
PSCHED_GET_TIME
(
now
);
us_idle
=
PSCHED_TDIFF_SAFE
(
now
,
q
->
qidlestart
,
q
->
Scell_max
);
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
q
->
parms
.
qavg
=
red_calc_qavg
(
&
q
->
parms
,
gred_backlog
(
t
,
q
,
sch
));
q
->
qave
>>=
q
->
Stab
[(
us_idle
>>
q
->
Scell_log
)
&
0xFF
];
}
else
{
if
(
t
->
eqp
)
{
q
->
qave
+=
sch
->
qstats
.
backlog
-
(
q
->
qave
>>
q
->
Wlog
);
}
else
{
q
->
qave
+=
q
->
backlog
-
(
q
->
qave
>>
q
->
Wlog
);
}
if
(
red_is_idling
(
&
q
->
parms
))
red_end_of_idle_period
(
&
q
->
parms
);
if
(
gred_wred_mode
(
t
))
gred_store_wred_set
(
t
,
q
);
switch
(
red_action
(
&
q
->
parms
,
q
->
parms
.
qavg
+
qavg
))
{
case
RED_DONT_MARK
:
break
;
case
RED_PROB_MARK
:
sch
->
qstats
.
overlimits
++
;
if
(
!
gred_use_ecn
(
t
)
||
!
INET_ECN_set_ce
(
skb
))
{
q
->
stats
.
prob_drop
++
;
goto
congestion_drop
;
}
q
->
stats
.
prob_mark
++
;
break
;
if
(
t
->
eqp
&&
t
->
grio
)
t
->
tab
[
t
->
def
]
->
qave
=
q
->
qave
;
case
RED_HARD_MARK
:
sch
->
qstats
.
overlimits
++
;
if
(
gred_use_harddrop
(
t
)
||
!
gred_use_ecn
(
t
)
||
!
INET_ECN_set_ce
(
skb
))
{
q
->
stats
.
forced_drop
++
;
goto
congestion_drop
;
}
q
->
stats
.
forced_mark
++
;
break
;
}
if
((
q
->
qave
+
qave
)
<
q
->
qth_min
)
{
q
->
qcount
=
-
1
;
enqueue:
if
(
q
->
backlog
+
skb
->
len
<=
q
->
limit
)
{
q
->
backlog
+=
skb
->
len
;
do_enqueue:
__skb_queue_tail
(
&
sch
->
q
,
skb
);
sch
->
qstats
.
backlog
+=
skb
->
len
;
sch
->
bstats
.
bytes
+=
skb
->
len
;
sch
->
bstats
.
packets
++
;
return
0
;
}
else
{
q
->
pdrop
++
;
return
qdisc_enqueue_tail
(
skb
,
sch
);
}
q
->
stats
.
pdrop
++
;
drop:
kfree_skb
(
skb
);
sch
->
qstats
.
drops
++
;
return
NET_XMIT_DROP
;
}
if
((
q
->
qave
+
qave
)
>=
q
->
qth_max
)
{
q
->
qcount
=
-
1
;
sch
->
qstats
.
overlimits
++
;
q
->
forced
++
;
goto
drop
;
}
if
(
++
q
->
qcount
)
{
if
((((
qave
+
q
->
qave
)
-
q
->
qth_min
)
>>
q
->
Wlog
)
*
q
->
qcount
<
q
->
qR
)
goto
enqueue
;
q
->
qcount
=
0
;
q
->
qR
=
net_random
()
&
q
->
Rmask
;
sch
->
qstats
.
overlimits
++
;
q
->
early
++
;
goto
drop
;
}
q
->
qR
=
net_random
()
&
q
->
Rmask
;
goto
enqueue
;
return
qdisc_drop
(
skb
,
sch
);
congestion_drop:
qdisc_drop
(
skb
,
sch
);
return
NET_XMIT_CN
;
}
static
int
gred_requeue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
static
int
gred_requeue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
{
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
struct
gred_sched_data
*
q
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
q
=
t
->
tab
[(
skb
->
tc_index
&
0xf
)];
/* error checking here -- probably unnecessary */
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
u16
dp
=
tc_index_to_dp
(
skb
);
__skb_queue_head
(
&
sch
->
q
,
skb
);
sch
->
qstats
.
backlog
+=
skb
->
len
;
sch
->
qstats
.
requeues
++
;
if
(
dp
>=
t
->
DPs
||
(
q
=
t
->
tab
[
dp
])
==
NULL
)
{
if
(
net_ratelimit
())
printk
(
KERN_WARNING
"GRED: Unable to relocate VQ 0x%x "
"for requeue, screwing up backlog.
\n
"
,
tc_index_to_dp
(
skb
));
}
else
{
if
(
red_is_idling
(
&
q
->
parms
))
red_end_of_idle_period
(
&
q
->
parms
);
q
->
backlog
+=
skb
->
len
;
return
0
;
}
return
qdisc_requeue
(
skb
,
sch
);
}
static
struct
sk_buff
*
gred_dequeue
(
struct
Qdisc
*
sch
)
static
struct
sk_buff
*
gred_dequeue
(
struct
Qdisc
*
sch
)
{
struct
sk_buff
*
skb
;
struct
gred_sched_data
*
q
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
skb
=
qdisc_dequeue_head
(
sch
);
skb
=
__skb_dequeue
(
&
sch
->
q
);
if
(
skb
)
{
sch
->
qstats
.
backlog
-=
skb
->
len
;
q
=
t
->
tab
[(
skb
->
tc_index
&
0xf
)];
if
(
q
)
{
q
->
backlog
-=
skb
->
len
;
if
(
!
q
->
backlog
&&
!
t
->
eqp
)
PSCHED_GET_TIME
(
q
->
qidlestart
);
struct
gred_sched_data
*
q
;
u16
dp
=
tc_index_to_dp
(
skb
);
if
(
dp
>=
t
->
DPs
||
(
q
=
t
->
tab
[
dp
])
==
NULL
)
{
if
(
net_ratelimit
())
printk
(
KERN_WARNING
"GRED: Unable to relocate "
"VQ 0x%x after dequeue, screwing up "
"backlog.
\n
"
,
tc_index_to_dp
(
skb
));
}
else
{
D2PRINTK
(
"gred_dequeue: skb has bad tcindex %x
\n
"
,
skb
->
tc_index
&
0xf
);
q
->
backlog
-=
skb
->
len
;
if
(
!
q
->
backlog
&&
!
gred_wred_mode
(
t
))
red_start_of_idle_period
(
&
q
->
parms
);
}
return
skb
;
}
if
(
t
->
eqp
)
{
q
=
t
->
tab
[
t
->
def
];
if
(
!
q
)
D2PRINTK
(
"no default VQ set: Results will be "
"screwed up
\n
"
);
else
PSCHED_GET_TIME
(
q
->
qidlestart
);
}
if
(
gred_wred_mode
(
t
)
&&
!
red_is_idling
(
&
t
->
wred_set
))
red_start_of_idle_period
(
&
t
->
wred_set
);
return
NULL
;
}
...
...
@@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch)
static
unsigned
int
gred_drop
(
struct
Qdisc
*
sch
)
{
struct
sk_buff
*
skb
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
struct
gred_sched_data
*
q
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
skb
=
__skb_dequeue_tail
(
&
sch
->
q
);
skb
=
qdisc_dequeue_tail
(
sch
);
if
(
skb
)
{
unsigned
int
len
=
skb
->
len
;
s
ch
->
qstats
.
backlog
-=
len
;
sch
->
qstats
.
drops
++
;
q
=
t
->
tab
[(
skb
->
tc_index
&
0xf
)];
if
(
q
)
{
q
->
backlog
-=
len
;
q
->
other
++
;
if
(
!
q
->
backlog
&&
!
t
->
eqp
)
PSCHED_GET_TIME
(
q
->
qidlestart
);
s
truct
gred_sched_data
*
q
;
u16
dp
=
tc_index_to_dp
(
skb
)
;
if
(
dp
>=
t
->
DPs
||
(
q
=
t
->
tab
[
dp
])
==
NULL
)
{
if
(
net_ratelimit
())
printk
(
KERN_WARNING
"GRED: Unable to relocate "
"VQ 0x%x while dropping, screwing up "
"backlog.
\n
"
,
tc_index_to_dp
(
skb
)
);
}
else
{
D2PRINTK
(
"gred_dequeue: skb has bad tcindex %x
\n
"
,
skb
->
tc_index
&
0xf
);
q
->
backlog
-=
len
;
q
->
stats
.
other
++
;
if
(
!
q
->
backlog
&&
!
gred_wred_mode
(
t
))
red_start_of_idle_period
(
&
q
->
parms
);
}
kfree_skb
(
skb
);
qdisc_drop
(
skb
,
sch
);
return
len
;
}
q
=
t
->
tab
[
t
->
def
];
if
(
!
q
)
{
D2PRINTK
(
"no default VQ set: Results might be screwed up
\n
"
);
return
0
;
}
if
(
gred_wred_mode
(
t
)
&&
!
red_is_idling
(
&
t
->
wred_set
))
red_start_of_idle_period
(
&
t
->
wred_set
);
PSCHED_GET_TIME
(
q
->
qidlestart
);
return
0
;
}
...
...
@@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch)
static
void
gred_reset
(
struct
Qdisc
*
sch
)
{
int
i
;
struct
gred_sched_data
*
q
;
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
struct
gred_sched
*
t
=
qdisc_priv
(
sch
);
__skb_queue_purge
(
&
sch
->
q
);
qdisc_reset_queue
(
sch
);
sch
->
qstats
.
backlog
=
0
;
for
(
i
=
0
;
i
<
t
->
DPs
;
i
++
)
{
struct
gred_sched_data
*
q
=
t
->
tab
[
i
];
for
(
i
=
0
;
i
<
t
->
DPs
;
i
++
)
{
q
=
t
->
tab
[
i
];
if
(
!
q
)
continue
;
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
q
->
qave
=
0
;
q
->
qcount
=
-
1
;
red_restart
(
&
q
->
parms
);
q
->
backlog
=
0
;
q
->
other
=
0
;
q
->
forced
=
0
;
q
->
pdrop
=
0
;
q
->
early
=
0
;
}
}
static
int
gred_change
(
struct
Qdisc
*
sch
,
struct
rtattr
*
opt
)
/* Release the memory of a single virtual queue by passing it to kfree(). */
static inline void gred_destroy_vq(struct gred_sched_data *q)
{
	struct gred_sched_data *vq = q;

	kfree(vq);
}
static
inline
int
gred_change_table_def
(
struct
Qdisc
*
sch
,
struct
rtattr
*
dps
)
{
struct
gred_sched
*
table
=
qdisc_priv
(
sch
);
struct
gred_sched_data
*
q
;
struct
tc_gred_qopt
*
ctl
;
struct
tc_gred_sopt
*
sopt
;
struct
rtattr
*
tb
[
TCA_GRED_STAB
];
struct
rtattr
*
tb2
[
TCA_GRED_DPS
];
int
i
;
if
(
opt
==
NULL
||
rtattr_parse_nested
(
tb
,
TCA_GRED_STAB
,
opt
))
if
(
dps
==
NULL
||
RTA_PAYLOAD
(
dps
)
<
sizeof
(
*
s
opt
))
return
-
EINVAL
;
if
(
tb
[
TCA_GRED_PARMS
-
1
]
==
0
&&
tb
[
TCA_GRED_STAB
-
1
]
==
0
)
{
rtattr_parse_nested
(
tb2
,
TCA_GRED_DPS
,
opt
);
sopt
=
RTA_DATA
(
dps
);
if
(
tb2
[
TCA_GRED_DPS
-
1
]
==
0
)
if
(
sopt
->
DPs
>
MAX_DPs
||
sopt
->
DPs
==
0
||
sopt
->
def_DP
>=
sopt
->
DPs
)
return
-
EINVAL
;
sopt
=
RTA_DATA
(
tb2
[
TCA_GRED_DPS
-
1
]);
table
->
DPs
=
sopt
->
DPs
;
table
->
def
=
sopt
->
def_DP
;
table
->
grio
=
sopt
->
grio
;
table
->
initd
=
0
;
/* probably need to clear all the table DP entries as well */
sch_tree_lock
(
sch
);
table
->
DPs
=
sopt
->
DPs
;
table
->
def
=
sopt
->
def_DP
;
table
->
red_flags
=
sopt
->
flags
;
/*
* Every entry point to GRED is synchronized with the above code
* and the DP is checked against DPs, i.e. shadowed VQs can no
* longer be found so we can unlock right here.
*/
sch_tree_unlock
(
sch
);
if
(
sopt
->
grio
)
{
gred_enable_rio_mode
(
table
);
gred_disable_wred_mode
(
table
);
if
(
gred_wred_mode_check
(
sch
))
gred_enable_wred_mode
(
table
);
}
else
{
gred_disable_rio_mode
(
table
);
gred_disable_wred_mode
(
table
);
}
for
(
i
=
table
->
DPs
;
i
<
MAX_DPs
;
i
++
)
{
if
(
table
->
tab
[
i
])
{
printk
(
KERN_WARNING
"GRED: Warning: Destroying "
"shadowed VQ 0x%x
\n
"
,
i
);
gred_destroy_vq
(
table
->
tab
[
i
]);
table
->
tab
[
i
]
=
NULL
;
}
}
return
0
;
}
/*
 * Create or reconfigure the virtual queue stored in slot @dp.
 *
 * @sch:  qdisc whose private data holds the GRED table
 * @dp:   index into table->tab[] (not range-checked here)
 * @ctl:  user supplied parameters (limit, thresholds, Wlog/Plog/Scell_log)
 * @prio: priority to assign to the virtual queue
 * @stab: table handed on to red_set_parms()
 *
 * A missing virtual queue is allocated zero-filled. gred_change() calls
 * this function between sch_tree_lock() and sch_tree_unlock(), so the
 * allocation must not sleep: GFP_ATOMIC is used instead of GFP_KERNEL.
 * NOTE(review): relies on sch_tree_lock() taking a non-sleepable lock,
 * confirm against its definition.
 *
 * Returns 0 on success or -ENOMEM if the virtual queue cannot be
 * allocated.
 */
static inline int gred_change_vq(struct Qdisc *sch, int dp,
				 struct tc_gred_qopt *ctl, int prio, u8 *stab)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct gred_sched_data *q = table->tab[dp];

	if (q == NULL) {
		/* kzalloc() replaces the former kmalloc() + memset() pair. */
		q = kzalloc(sizeof(*q), GFP_ATOMIC);
		if (q == NULL)
			return -ENOMEM;
		table->tab[dp] = q;
	}

	q->DP = dp;
	q->prio = prio;
	q->limit = ctl->limit;

	/* Nothing queued on this VQ: end any idle period before re-parameterising. */
	if (q->backlog == 0)
		red_end_of_idle_period(&q->parms);

	red_set_parms(&q->parms,
		      ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
		      ctl->Scell_log, stab);

	return 0;
}
static
int
gred_change
(
struct
Qdisc
*
sch
,
struct
rtattr
*
opt
)
{
struct
gred_sched
*
table
=
qdisc_priv
(
sch
);
struct
tc_gred_qopt
*
ctl
;
struct
rtattr
*
tb
[
TCA_GRED_MAX
];
int
err
=
-
EINVAL
,
prio
=
GRED_DEF_PRIO
;
u8
*
stab
;
if
(
opt
==
NULL
||
rtattr_parse_nested
(
tb
,
TCA_GRED_MAX
,
opt
))
return
-
EINVAL
;
if
(
tb
[
TCA_GRED_PARMS
-
1
]
==
NULL
&&
tb
[
TCA_GRED_STAB
-
1
]
==
NULL
)
return
gred_change_table_def
(
sch
,
opt
);
if
(
!
table
->
DPs
||
tb
[
TCA_GRED_PARMS
-
1
]
==
0
||
tb
[
TCA_GRED_STAB
-
1
]
==
0
||
if
(
tb
[
TCA_GRED_PARMS
-
1
]
==
NULL
||
RTA_PAYLOAD
(
tb
[
TCA_GRED_PARMS
-
1
])
<
sizeof
(
*
ctl
)
||
tb
[
TCA_GRED_STAB
-
1
]
==
NULL
||
RTA_PAYLOAD
(
tb
[
TCA_GRED_STAB
-
1
])
<
256
)
return
-
EINVAL
;
ctl
=
RTA_DATA
(
tb
[
TCA_GRED_PARMS
-
1
]);
if
(
ctl
->
DP
>
MAX_DPs
-
1
)
{
/* misbehaving is punished! Put in the default drop probability */
DPRINTK
(
"
\n
GRED: DP %u not in the proper range fixed. New DP "
"set to default at %d
\n
"
,
ctl
->
DP
,
table
->
def
);
ctl
->
DP
=
table
->
def
;
}
stab
=
RTA_DATA
(
tb
[
TCA_GRED_STAB
-
1
]);
if
(
table
->
tab
[
ctl
->
DP
]
==
NULL
)
{
table
->
tab
[
ctl
->
DP
]
=
kmalloc
(
sizeof
(
struct
gred_sched_data
),
GFP_KERNEL
);
if
(
NULL
==
table
->
tab
[
ctl
->
DP
])
return
-
ENOMEM
;
memset
(
table
->
tab
[
ctl
->
DP
],
0
,
(
sizeof
(
struct
gred_sched_data
)));
}
q
=
table
->
tab
[
ctl
->
DP
];
if
(
table
->
grio
)
{
if
(
ctl
->
prio
<=
0
)
{
if
(
table
->
def
&&
table
->
tab
[
table
->
def
])
{
DPRINTK
(
"
\n
GRED: DP %u does not have a prio"
"setting default to %d
\n
"
,
ctl
->
DP
,
table
->
tab
[
table
->
def
]
->
prio
);
q
->
prio
=
table
->
tab
[
table
->
def
]
->
prio
;
}
else
{
DPRINTK
(
"
\n
GRED: DP %u does not have a prio"
" setting default to 8
\n
"
,
ctl
->
DP
);
q
->
prio
=
8
;
}
}
else
{
q
->
prio
=
ctl
->
prio
;
}
}
else
{
q
->
prio
=
8
;
}
if
(
ctl
->
DP
>=
table
->
DPs
)
goto
errout
;
if
(
gred_rio_mode
(
table
))
{
if
(
ctl
->
prio
==
0
)
{
int
def_prio
=
GRED_DEF_PRIO
;
q
->
DP
=
ctl
->
DP
;
q
->
Wlog
=
ctl
->
Wlog
;
q
->
Plog
=
ctl
->
Plog
;
q
->
limit
=
ctl
->
limit
;
q
->
Scell_log
=
ctl
->
Scell_log
;
q
->
Rmask
=
ctl
->
Plog
<
32
?
((
1
<<
ctl
->
Plog
)
-
1
)
:
~
0UL
;
q
->
Scell_max
=
(
255
<<
q
->
Scell_log
);
q
->
qth_min
=
ctl
->
qth_min
<<
ctl
->
Wlog
;
q
->
qth_max
=
ctl
->
qth_max
<<
ctl
->
Wlog
;
q
->
qave
=
0
;
q
->
backlog
=
0
;
q
->
qcount
=
-
1
;
q
->
other
=
0
;
q
->
forced
=
0
;
q
->
pdrop
=
0
;
q
->
early
=
0
;
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
memcpy
(
q
->
Stab
,
RTA_DATA
(
tb
[
TCA_GRED_STAB
-
1
]),
256
);
if
(
table
->
initd
&&
table
->
grio
)
{
/* this looks ugly but it's not in the fast path */
for
(
i
=
0
;
i
<
table
->
DPs
;
i
++
)
{
if
((
!
table
->
tab
[
i
])
||
(
i
==
q
->
DP
)
)
continue
;
if
(
table
->
tab
[
i
]
->
prio
==
q
->
prio
){
/* WRED mode detected */
table
->
eqp
=
1
;
break
;
}
}
if
(
table
->
tab
[
table
->
def
])
def_prio
=
table
->
tab
[
table
->
def
]
->
prio
;
printk
(
KERN_DEBUG
"GRED: DP %u does not have a prio "
"setting default to %d
\n
"
,
ctl
->
DP
,
def_prio
);
prio
=
def_prio
;
}
else
prio
=
ctl
->
prio
;
}
if
(
!
table
->
initd
)
{
table
->
initd
=
1
;
/*
the first entry also goes into the default until
over-written
*/
sch_tree_lock
(
sch
);
if
(
table
->
tab
[
table
->
def
]
==
NULL
)
{
table
->
tab
[
table
->
def
]
=
kmalloc
(
sizeof
(
struct
gred_sched_data
),
GFP_KERNEL
);
if
(
NULL
==
table
->
tab
[
table
->
def
])
return
-
ENOMEM
;
err
=
gred_change_vq
(
sch
,
ctl
->
DP
,
ctl
,
prio
,
stab
);
if
(
err
<
0
)
goto
errout_locked
;
memset
(
table
->
tab
[
table
->
def
],
0
,
(
sizeof
(
struct
gred_sched_data
)));
if
(
gred_rio_mode
(
table
))
{
gred_disable_wred_mode
(
table
);
if
(
gred_wred_mode_check
(
sch
))
gred_enable_wred_mode
(
table
);
}
q
=
table
->
tab
[
table
->
def
];
q
->
DP
=
table
->
def
;
q
->
Wlog
=
ctl
->
Wlog
;
q
->
Plog
=
ctl
->
Plog
;
q
->
limit
=
ctl
->
limit
;
q
->
Scell_log
=
ctl
->
Scell_log
;
q
->
Rmask
=
ctl
->
Plog
<
32
?
((
1
<<
ctl
->
Plog
)
-
1
)
:
~
0UL
;
q
->
Scell_max
=
(
255
<<
q
->
Scell_log
);
q
->
qth_min
=
ctl
->
qth_min
<<
ctl
->
Wlog
;
q
->
qth_max
=
ctl
->
qth_max
<<
ctl
->
Wlog
;
if
(
table
->
grio
)
q
->
prio
=
table
->
tab
[
ctl
->
DP
]
->
prio
;
else
q
->
prio
=
8
;
q
->
qcount
=
-
1
;
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
memcpy
(
q
->
Stab
,
RTA_DATA
(
tb
[
TCA_GRED_STAB
-
1
]),
256
);
}
return
0
;
err
=
0
;
errout_locked:
sch_tree_unlock
(
sch
);
errout:
return
err
;
}
static
int
gred_init
(
struct
Qdisc
*
sch
,
struct
rtattr
*
opt
)
{
struct
gred_sched
*
table
=
qdisc_priv
(
sch
);
struct
tc_gred_sopt
*
sopt
;
struct
rtattr
*
tb
[
TCA_GRED_STAB
];
struct
rtattr
*
tb2
[
TCA_GRED_DPS
];
struct
rtattr
*
tb
[
TCA_GRED_MAX
];
if
(
opt
==
NULL
||
rtattr_parse_nested
(
tb
,
TCA_GRED_
STAB
,
opt
))
if
(
opt
==
NULL
||
rtattr_parse_nested
(
tb
,
TCA_GRED_
MAX
,
opt
))
return
-
EINVAL
;
if
(
tb
[
TCA_GRED_PARMS
-
1
]
==
0
&&
tb
[
TCA_GRED_STAB
-
1
]
==
0
)
{
rtattr_parse_nested
(
tb2
,
TCA_GRED_DPS
,
opt
);
if
(
tb2
[
TCA_GRED_DPS
-
1
]
==
0
)
if
(
tb
[
TCA_GRED_PARMS
-
1
]
||
tb
[
TCA_GRED_STAB
-
1
])
return
-
EINVAL
;
sopt
=
RTA_DATA
(
tb2
[
TCA_GRED_DPS
-
1
]);
table
->
DPs
=
sopt
->
DPs
;
table
->
def
=
sopt
->
def_DP
;
table
->
grio
=
sopt
->
grio
;
table
->
initd
=
0
;
return
0
;
}
DPRINTK
(
"
\n
GRED_INIT error!
\n
"
);
return
-
EINVAL
;
return
gred_change_table_def
(
sch
,
tb
[
TCA_GRED_DPS
-
1
]);
}
static
int
gred_dump
(
struct
Qdisc
*
sch
,
struct
sk_buff
*
skb
)
{
unsigned
long
qave
;
struct
rtattr
*
rta
;
struct
tc_gred_qopt
*
opt
=
NULL
;
struct
tc_gred_qopt
*
dst
;
struct
gred_sched
*
table
=
qdisc_priv
(
sch
);
struct
gred_sched_data
*
q
;
struct
rtattr
*
parms
,
*
opts
=
NULL
;
int
i
;
unsigned
char
*
b
=
skb
->
tail
;
rta
=
(
struct
rtattr
*
)
b
;
RTA_PUT
(
skb
,
TCA_OPTIONS
,
0
,
NULL
);
opt
=
kmalloc
(
sizeof
(
struct
tc_gred_qopt
)
*
MAX_DPs
,
GFP_KERNEL
)
;
struct
tc_gred_sopt
sopt
=
{
.
DPs
=
table
->
DPs
,
.
def_DP
=
table
->
def
,
.
grio
=
gred_rio_mode
(
table
),
.
flags
=
table
->
red_flags
,
}
;
if
(
opt
==
NULL
)
{
DPRINTK
(
"gred_dump:failed to malloc for %Zd
\n
"
,
sizeof
(
struct
tc_gred_qopt
)
*
MAX_DPs
);
goto
rtattr_failure
;
}
memset
(
opt
,
0
,
(
sizeof
(
struct
tc_gred_qopt
))
*
table
->
DPs
);
opts
=
RTA_NEST
(
skb
,
TCA_OPTIONS
);
RTA_PUT
(
skb
,
TCA_GRED_DPS
,
sizeof
(
sopt
),
&
sopt
);
parms
=
RTA_NEST
(
skb
,
TCA_GRED_PARMS
);
if
(
!
table
->
initd
)
{
DPRINTK
(
"NO GRED Queues setup!
\n
"
)
;
}
for
(
i
=
0
;
i
<
MAX_DPs
;
i
++
)
{
struct
gred_sched_data
*
q
=
table
->
tab
[
i
]
;
struct
tc_gred_qopt
opt
;
for
(
i
=
0
;
i
<
MAX_DPs
;
i
++
)
{
dst
=
&
opt
[
i
];
q
=
table
->
tab
[
i
];
memset
(
&
opt
,
0
,
sizeof
(
opt
));
if
(
!
q
)
{
/* hack -- fix at some point with proper message
This is how we indicate to tc that there is no VQ
at this DP */
dst
->
DP
=
MAX_DPs
+
i
;
continue
;
opt
.
DP
=
MAX_DPs
+
i
;
goto
append_opt
;
}
dst
->
limit
=
q
->
limit
;
dst
->
qth_min
=
q
->
qth_min
>>
q
->
Wlog
;
dst
->
qth_max
=
q
->
qth_max
>>
q
->
Wlog
;
dst
->
DP
=
q
->
DP
;
dst
->
backlog
=
q
->
backlog
;
if
(
q
->
qave
)
{
if
(
table
->
eqp
&&
table
->
grio
)
{
q
->
qidlestart
=
table
->
tab
[
table
->
def
]
->
qidlestart
;
q
->
qave
=
table
->
tab
[
table
->
def
]
->
qave
;
}
if
(
!
PSCHED_IS_PASTPERFECT
(
q
->
qidlestart
))
{
long
idle
;
psched_time_t
now
;
PSCHED_GET_TIME
(
now
);
idle
=
PSCHED_TDIFF_SAFE
(
now
,
q
->
qidlestart
,
q
->
Scell_max
);
qave
=
q
->
qave
>>
q
->
Stab
[(
idle
>>
q
->
Scell_log
)
&
0xFF
];
dst
->
qave
=
qave
>>
q
->
Wlog
;
opt
.
limit
=
q
->
limit
;
opt
.
DP
=
q
->
DP
;
opt
.
backlog
=
q
->
backlog
;
opt
.
prio
=
q
->
prio
;
opt
.
qth_min
=
q
->
parms
.
qth_min
>>
q
->
parms
.
Wlog
;
opt
.
qth_max
=
q
->
parms
.
qth_max
>>
q
->
parms
.
Wlog
;
opt
.
Wlog
=
q
->
parms
.
Wlog
;
opt
.
Plog
=
q
->
parms
.
Plog
;
opt
.
Scell_log
=
q
->
parms
.
Scell_log
;
opt
.
other
=
q
->
stats
.
other
;
opt
.
early
=
q
->
stats
.
prob_drop
;
opt
.
forced
=
q
->
stats
.
forced_drop
;
opt
.
pdrop
=
q
->
stats
.
pdrop
;
opt
.
packets
=
q
->
packetsin
;
opt
.
bytesin
=
q
->
bytesin
;
}
else
{
dst
->
qave
=
q
->
qave
>>
q
->
Wlog
;
}
}
else
{
dst
->
qave
=
0
;
if
(
gred_wred_mode
(
table
))
{
q
->
parms
.
qidlestart
=
table
->
tab
[
table
->
def
]
->
parms
.
qidlestart
;
q
->
parms
.
qavg
=
table
->
tab
[
table
->
def
]
->
parms
.
qavg
;
}
opt
.
qave
=
red_calc_qavg
(
&
q
->
parms
,
q
->
parms
.
qavg
);
dst
->
Wlog
=
q
->
Wlog
;
dst
->
Plog
=
q
->
Plog
;
dst
->
Scell_log
=
q
->
Scell_log
;
dst
->
other
=
q
->
other
;
dst
->
forced
=
q
->
forced
;
dst
->
early
=
q
->
early
;
dst
->
pdrop
=
q
->
pdrop
;
dst
->
prio
=
q
->
prio
;
dst
->
packets
=
q
->
packetsin
;
dst
->
bytesin
=
q
->
bytesin
;
append_opt:
RTA_APPEND
(
skb
,
sizeof
(
opt
),
&
opt
);
}
RTA_PUT
(
skb
,
TCA_GRED_PARMS
,
sizeof
(
struct
tc_gred_qopt
)
*
MAX_DPs
,
opt
);
rta
->
rta_len
=
skb
->
tail
-
b
;
RTA_NEST_END
(
skb
,
parms
);
kfree
(
opt
);
return
skb
->
len
;
return
RTA_NEST_END
(
skb
,
opts
);
rtattr_failure:
if
(
opt
)
kfree
(
opt
);
DPRINTK
(
"gred_dump: FAILURE!!!!
\n
"
);
/* also free the opt struct here */
skb_trim
(
skb
,
b
-
skb
->
data
);
return
-
1
;
return
RTA_NEST_CANCEL
(
skb
,
opts
);
}
static
void
gred_destroy
(
struct
Qdisc
*
sch
)
...
...
@@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch)
struct
gred_sched
*
table
=
qdisc_priv
(
sch
);
int
i
;
for
(
i
=
0
;
i
<
table
->
DPs
;
i
++
)
{
for
(
i
=
0
;
i
<
table
->
DPs
;
i
++
)
{
if
(
table
->
tab
[
i
])
kfree
(
table
->
tab
[
i
]);
gred_destroy_vq
(
table
->
tab
[
i
]);
}
}
static
struct
Qdisc_ops
gred_qdisc_ops
=
{
.
next
=
NULL
,
.
cl_ops
=
NULL
,
.
id
=
"gred"
,
.
priv_size
=
sizeof
(
struct
gred_sched
),
.
enqueue
=
gred_enqueue
,
...
...
@@ -621,10 +599,13 @@ static int __init gred_module_init(void)
{
return
register_qdisc
(
&
gred_qdisc_ops
);
}
/* Module unload hook: unregister the GRED qdisc operations. */
static void __exit gred_module_exit(void)
{
	struct Qdisc_ops *ops = &gred_qdisc_ops;

	unregister_qdisc(ops);
}
module_init
(
gred_module_init
)
module_exit
(
gred_module_exit
)
MODULE_LICENSE
(
"GPL"
);
net/sched/sch_netem.c
View file @
8e33ba49
...
...
@@ -25,6 +25,8 @@
#include <net/pkt_sched.h>
#define VERSION "1.1"
/* Network Emulation Queuing algorithm.
====================================
...
...
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
||
q
->
counter
<
q
->
gap
/* inside last reordering gap */
||
q
->
reorder
<
get_crandom
(
&
q
->
reorder_cor
))
{
psched_time_t
now
;
psched_tdiff_t
delay
;
delay
=
tabledist
(
q
->
latency
,
q
->
jitter
,
&
q
->
delay_cor
,
q
->
delay_dist
);
PSCHED_GET_TIME
(
now
);
PSCHED_TADD2
(
now
,
tabledist
(
q
->
latency
,
q
->
jitter
,
&
q
->
delay_cor
,
q
->
delay_dist
),
cb
->
time_to_send
);
PSCHED_TADD2
(
now
,
delay
,
cb
->
time_to_send
);
++
q
->
counter
;
ret
=
q
->
qdisc
->
enqueue
(
skb
,
q
->
qdisc
);
}
else
{
...
...
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
const
struct
netem_skb_cb
*
cb
=
(
const
struct
netem_skb_cb
*
)
skb
->
cb
;
psched_time_t
now
;
long
delay
;
/* if more time remaining? */
PSCHED_GET_TIME
(
now
);
delay
=
PSCHED_US2JIFFIE
(
PSCHED_TDIFF
(
cb
->
time_to_send
,
now
));
pr_debug
(
"netem_run: skb=%p delay=%ld
\n
"
,
skb
,
delay
);
if
(
delay
<=
0
)
{
if
(
PSCHED_TLESS
(
cb
->
time_to_send
,
now
))
{
pr_debug
(
"netem_dequeue: return skb=%p
\n
"
,
skb
);
sch
->
q
.
qlen
--
;
sch
->
flags
&=
~
TCQ_F_THROTTLED
;
return
skb
;
}
else
{
psched_tdiff_t
delay
=
PSCHED_TDIFF
(
cb
->
time_to_send
,
now
);
if
(
q
->
qdisc
->
ops
->
requeue
(
skb
,
q
->
qdisc
)
!=
NET_XMIT_SUCCESS
)
{
sch
->
qstats
.
drops
++
;
/* After this qlen is confused */
printk
(
KERN_ERR
"netem: queue discpline %s could not requeue
\n
"
,
q
->
qdisc
->
ops
->
id
);
sch
->
q
.
qlen
--
;
}
mod_timer
(
&
q
->
timer
,
jiffies
+
delay
);
mod_timer
(
&
q
->
timer
,
jiffies
+
PSCHED_US2JIFFIE
(
delay
)
);
sch
->
flags
|=
TCQ_F_THROTTLED
;
if
(
q
->
qdisc
->
ops
->
requeue
(
skb
,
q
->
qdisc
)
!=
0
)
sch
->
qstats
.
drops
++
;
}
}
return
NULL
;
...
...
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch)
del_timer_sync
(
&
q
->
timer
);
}
/* Pass size change message down to embedded FIFO */
static
int
set_fifo_limit
(
struct
Qdisc
*
q
,
int
limit
)
{
struct
rtattr
*
rta
;
int
ret
=
-
ENOMEM
;
/* Hack to avoid sending change message to non-FIFO */
if
(
strncmp
(
q
->
ops
->
id
+
1
,
"fifo"
,
4
)
!=
0
)
return
0
;
rta
=
kmalloc
(
RTA_LENGTH
(
sizeof
(
struct
tc_fifo_qopt
)),
GFP_KERNEL
);
if
(
rta
)
{
rta
->
rta_type
=
RTM_NEWQDISC
;
...
...
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
return
0
;
}
/*
* Special case version of FIFO queue for use by netem.
* It queues in order based on timestamps in skb's
*/
/* Private data of the time-ordered FIFO used inside netem. */
struct fifo_sched_data {
	u32	limit;	/* queue length bound, compared against skb_queue_len() */
};
/*
 * Enqueue @nskb so that the queue stays sorted by the time_to_send
 * stamp stored in each packet's control block.
 *
 * The queue is walked from the tail towards the head and the new packet
 * is linked in behind the first packet whose send time is not later
 * than its own. Packets with an equal send time therefore keep their
 * arrival order. The previous strictly-less comparison placed the
 * newcomer in front of equal-time packets and reordered them.
 *
 * Returns NET_XMIT_SUCCESS when queued; when the queue already holds
 * q->limit packets the result of qdisc_drop() is returned instead.
 */
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	const struct netem_skb_cb *ncb
		= (const struct netem_skb_cb *)nskb->cb;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb
				= (const struct netem_skb_cb *)skb->cb;

			/* Stop at the last packet due no later than nskb. */
			if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
				break;
		}

		/*
		 * NOTE(review): if the walk finishes without a break, skb
		 * presumably refers to the list head, so nskb becomes the
		 * first packet - confirm against skb_queue_reverse_walk().
		 */
		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += nskb->len;
		sch->bstats.bytes += nskb->len;
		sch->bstats.packets++;

		return NET_XMIT_SUCCESS;
	}

	return qdisc_drop(nskb, sch);
}
/*
 * Initialise (or change) the queue limit of the time-ordered FIFO.
 *
 * With no options the limit becomes the device's tx_queue_len, but at
 * least 1. Otherwise the limit is taken from the supplied
 * struct tc_fifo_qopt.
 *
 * Returns 0 on success, -EINVAL if the option payload is smaller than
 * struct tc_fifo_qopt.
 */
static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt *ctl;

	if (!opt) {
		q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
		return 0;
	}

	ctl = RTA_DATA(opt);
	if (RTA_PAYLOAD(opt) < sizeof(*ctl))
		return -EINVAL;

	q->limit = ctl->limit;
	return 0;
}
static
int
tfifo_dump
(
struct
Qdisc
*
sch
,
struct
sk_buff
*
skb
)
{
struct
fifo_sched_data
*
q
=
qdisc_priv
(
sch
);
struct
tc_fifo_qopt
opt
=
{
.
limit
=
q
->
limit
};
RTA_PUT
(
skb
,
TCA_OPTIONS
,
sizeof
(
opt
),
&
opt
);
return
skb
->
len
;
rtattr_failure:
return
-
1
;
}
/*
 * Operations of the time-ordered FIFO. Only enqueue, init/change and
 * dump are tfifo specific; the remaining hooks are the generic qdisc
 * queue helpers.
 */
static struct Qdisc_ops tfifo_qdisc_ops = {
	.id		= "tfifo",
	.priv_size	= sizeof(struct fifo_sched_data),

	/* configuration; change reuses the init handler */
	.init		= tfifo_init,
	.change		= tfifo_init,
	.reset		= qdisc_reset_queue,
	.dump		= tfifo_dump,

	/* data path */
	.enqueue	= tfifo_enqueue,
	.dequeue	= qdisc_dequeue_head,
	.requeue	= qdisc_requeue,
	.drop		= qdisc_queue_drop,
};
static
int
netem_init
(
struct
Qdisc
*
sch
,
struct
rtattr
*
opt
)
{
struct
netem_sched_data
*
q
=
qdisc_priv
(
sch
);
...
...
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
q
->
timer
.
function
=
netem_watchdog
;
q
->
timer
.
data
=
(
unsigned
long
)
sch
;
q
->
qdisc
=
qdisc_create_dflt
(
sch
->
dev
,
&
p
fifo_qdisc_ops
);
q
->
qdisc
=
qdisc_create_dflt
(
sch
->
dev
,
&
t
fifo_qdisc_ops
);
if
(
!
q
->
qdisc
)
{
pr_debug
(
"netem: qdisc create failed
\n
"
);
return
-
ENOMEM
;
...
...
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
/*
 * Module load hook: log the netem version, then register the netem
 * qdisc. Returns whatever register_qdisc() returns.
 */
static int __init netem_module_init(void)
{
	int err;

	pr_info("netem: version " VERSION "\n");

	err = register_qdisc(&netem_qdisc_ops);
	return err;
}
static
void
__exit
netem_module_exit
(
void
)
...
...
net/sched/sch_red.c
View file @
8e33ba49
...
...
@@ -9,76 +9,23 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* J Hadi Salim
<hadi@nortel.com>
980914: computation fixes
* J Hadi Salim 980914: computation fixes
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
* J Hadi Salim
<hadi@nortelnetworks.com> 980816: ECN support
* J Hadi Salim
980816: ECN support
*/
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/
dsfiel
d.h>
#include <net/
re
d.h>
/* Random Early Detection (RED) algorithm.
=======================================
Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
This file codes a "divisionless" version of RED algorithm
as written down in Fig.17 of the paper.
Short description.
------------------
When a new packet arrives we calculate the average queue length:
avg = (1-W)*avg + W*current_queue_len,
W is the filter time constant (chosen as 2^(-Wlog)), it controls
the inertia of the algorithm. To allow larger bursts, W should be
decreased.
if (avg > th_max) -> packet marked (dropped).
if (avg < th_min) -> packet passes.
if (th_min < avg < th_max) we calculate probability:
Pb = max_P * (avg - th_min)/(th_max-th_min)
and mark (drop) packet with this probability.
Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
max_P should be small (not 1), usually 0.01..0.02 is good value.
max_P is chosen so that max_P/(th_max-th_min)
is a negative power of two; this lets the arithmetic use
only shifts.
Parameters, settable by user:
/* Parameters, settable by user:
-----------------------------
limit - bytes (must be > qth_max + burst)
...
...
@@ -89,243 +36,93 @@ Short description.
arbitrarily high (well, less than ram size)
Really, this limit will never be reached
if RED works correctly.
qth_min - bytes (should be < qth_max/2)
qth_max - bytes (should be at least 2*qth_min and less than limit)
Wlog - bits (<32) log(1/W).
Plog - bits (<32)
Plog is related to max_P by formula:
max_P = (qth_max-qth_min)/2^Plog;
F.e. if qth_max=128K and qth_min=32K, then Plog=22
corresponds to max_P=0.02
Scell_log
Stab
Lookup table for log((1-W)^(t/t_ave)).
NOTES:
Upper bound on W.
-----------------
If you want to allow bursts of L packets of size S,
you should choose W:
L + 1 - th_min/S < (1-(1-W)^L)/W
th_min/S = 32 th_min/S = 4
log(W) L
-1 33
-2 35
-3 39
-4 46
-5 57
-6 75
-7 101
-8 135
-9 190
etc.
*/
struct
red_sched_data
{
/* Parameters */
u32
limit
;
/* HARD maximal queue length */
u32
qth_min
;
/* Min average length threshold: A scaled */
u32
qth_max
;
/* Max average length threshold: A scaled */
u32
Rmask
;
u32
Scell_max
;
unsigned
char
flags
;
char
Wlog
;
/* log(W) */
char
Plog
;
/* random number bits */
char
Scell_log
;
u8
Stab
[
256
];
/* Variables */
unsigned
long
qave
;
/* Average queue length: A scaled */
int
qcount
;
/* Packets since last random number generation */
u32
qR
;
/* Cached random number */
psched_time_t
qidlestart
;
/* Start of idle period */
struct
tc_red_xstats
st
;
struct
red_parms
parms
;
struct
red_stats
stats
;
};
static
in
t
red_ecn_mark
(
struct
sk_buff
*
skb
)
static
in
line
int
red_use_ecn
(
struct
red_sched_data
*
q
)
{
if
(
skb
->
nh
.
raw
+
20
>
skb
->
tail
)
return
0
;
return
q
->
flags
&
TC_RED_ECN
;
}
switch
(
skb
->
protocol
)
{
case
__constant_htons
(
ETH_P_IP
):
if
(
INET_ECN_is_not_ect
(
skb
->
nh
.
iph
->
tos
))
return
0
;
IP_ECN_set_ce
(
skb
->
nh
.
iph
);
return
1
;
case
__constant_htons
(
ETH_P_IPV6
):
if
(
INET_ECN_is_not_ect
(
ipv6_get_dsfield
(
skb
->
nh
.
ipv6h
)))
return
0
;
IP6_ECN_set_ce
(
skb
->
nh
.
ipv6h
);
return
1
;
default:
return
0
;
}
/*
 * Tell whether the TC_RED_HARDDROP bit is set in q->flags.
 *
 * The return value is the masked flag itself (non-zero when the bit is
 * set, zero otherwise); it is not normalised to 0/1.
 */
static inline int red_use_harddrop(struct red_sched_data *q)
{
	const int harddrop = q->flags & TC_RED_HARDDROP;

	return harddrop;
}
static
int
red_enqueue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
static
int
red_enqueue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
{
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
psched_time_t
now
;
q
->
parms
.
qavg
=
red_calc_qavg
(
&
q
->
parms
,
sch
->
qstats
.
backlog
)
;
if
(
!
PSCHED_IS_PASTPERFECT
(
q
->
qidlestart
))
{
long
us_idle
;
int
shift
;
if
(
red_is_idling
(
&
q
->
parms
))
red_end_of_idle_period
(
&
q
->
parms
);
PSCHED_GET_TIME
(
now
);
us_idle
=
PSCHED_TDIFF_SAFE
(
now
,
q
->
qidlestart
,
q
->
Scell_max
);
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
)
;
switch
(
red_action
(
&
q
->
parms
,
q
->
parms
.
qavg
))
{
case
RED_DONT_MARK
:
break
;
/*
The problem: ideally, recalculation of the average queue length should
be done at constant clock intervals. This is too expensive, so
the calculation is driven by outgoing packets.
When the queue is idle we have to model this clock by hand.
SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
dummy packets as a burst after idle time, i.e.
q->qave *= (1-W)^m
This is an apparently overcomplicated solution (f.e. we have to precompute
a table to make this calculation in reasonable time)
I believe that a simpler model may be used here,
but it is field for experiments.
*/
shift
=
q
->
Stab
[
us_idle
>>
q
->
Scell_log
];
if
(
shift
)
{
q
->
qave
>>=
shift
;
}
else
{
/* Approximate initial part of exponent
with linear function:
(1-W)^m ~= 1-mW + ...
This seems to be the best solution to the
problem of too coarse exponent tabulation.
*/
us_idle
=
(
q
->
qave
*
us_idle
)
>>
q
->
Scell_log
;
if
(
us_idle
<
q
->
qave
/
2
)
q
->
qave
-=
us_idle
;
else
q
->
qave
>>=
1
;
}
}
else
{
q
->
qave
+=
sch
->
qstats
.
backlog
-
(
q
->
qave
>>
q
->
Wlog
);
/* NOTE:
q->qave is a fixed point number with the point at Wlog.
The formula above is equivalent to the floating point
version:
qave = qave*(1-W) + sch->qstats.backlog*W;
--ANK (980924)
*/
}
if
(
q
->
qave
<
q
->
qth_min
)
{
q
->
qcount
=
-
1
;
enqueue:
if
(
sch
->
qstats
.
backlog
+
skb
->
len
<=
q
->
limit
)
{
__skb_queue_tail
(
&
sch
->
q
,
skb
);
sch
->
qstats
.
backlog
+=
skb
->
len
;
sch
->
bstats
.
bytes
+=
skb
->
len
;
sch
->
bstats
.
packets
++
;
return
NET_XMIT_SUCCESS
;
}
else
{
q
->
st
.
pdrop
++
;
}
kfree_skb
(
skb
);
sch
->
qstats
.
drops
++
;
return
NET_XMIT_DROP
;
}
if
(
q
->
qave
>=
q
->
qth_max
)
{
q
->
qcount
=
-
1
;
case
RED_PROB_MARK
:
sch
->
qstats
.
overlimits
++
;
mark:
if
(
!
(
q
->
flags
&
TC_RED_ECN
)
||
!
red_ecn_mark
(
skb
))
{
q
->
st
.
early
++
;
goto
drop
;
}
q
->
st
.
marked
++
;
goto
enqueue
;
if
(
!
red_use_ecn
(
q
)
||
!
INET_ECN_set_ce
(
skb
))
{
q
->
stats
.
prob_drop
++
;
goto
congestion_drop
;
}
if
(
++
q
->
qcount
)
{
/* The formula used below causes questions.
q
->
stats
.
prob_mark
++
;
break
;
OK. qR is a random number in the interval 0..Rmask,
i.e. 0..(2^Plog). If we used floating point
arithmetic, it would be: (2^Plog)*rnd_num,
where rnd_num is less than 1.
case
RED_HARD_MARK
:
sch
->
qstats
.
overlimits
++
;
if
(
red_use_harddrop
(
q
)
||
!
red_use_ecn
(
q
)
||
!
INET_ECN_set_ce
(
skb
))
{
q
->
stats
.
forced_drop
++
;
goto
congestion_drop
;
}
Taking into account that qave has its fixed
point at Wlog, and Plog is related to max_P by
max_P = (qth_max-qth_min)/2^Plog, the two lines
below have the following floating point equivalent:
q
->
stats
.
forced_mark
++
;
break
;
}
max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
if
(
sch
->
qstats
.
backlog
+
skb
->
len
<=
q
->
limit
)
return
qdisc_enqueue_tail
(
skb
,
sch
);
Any questions? --ANK (980924)
*/
if
(((
q
->
qave
-
q
->
qth_min
)
>>
q
->
Wlog
)
*
q
->
qcount
<
q
->
qR
)
goto
enqueue
;
q
->
qcount
=
0
;
q
->
qR
=
net_random
()
&
q
->
Rmask
;
sch
->
qstats
.
overlimits
++
;
goto
mark
;
}
q
->
qR
=
net_random
()
&
q
->
Rmask
;
goto
enqueue
;
q
->
stats
.
pdrop
++
;
return
qdisc_drop
(
skb
,
sch
);
drop:
kfree_skb
(
skb
);
sch
->
qstats
.
drops
++
;
congestion_drop:
qdisc_drop
(
skb
,
sch
);
return
NET_XMIT_CN
;
}
static
int
red_requeue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
static
int
red_requeue
(
struct
sk_buff
*
skb
,
struct
Qdisc
*
sch
)
{
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
if
(
red_is_idling
(
&
q
->
parms
))
red_end_of_idle_period
(
&
q
->
parms
);
__skb_queue_head
(
&
sch
->
q
,
skb
);
sch
->
qstats
.
backlog
+=
skb
->
len
;
sch
->
qstats
.
requeues
++
;
return
0
;
return
qdisc_requeue
(
skb
,
sch
);
}
static
struct
sk_buff
*
red_dequeue
(
struct
Qdisc
*
sch
)
static
struct
sk_buff
*
red_dequeue
(
struct
Qdisc
*
sch
)
{
struct
sk_buff
*
skb
;
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
skb
=
__skb_dequeue
(
&
sch
->
q
);
if
(
skb
)
{
sch
->
qstats
.
backlog
-=
skb
->
len
;
skb
=
qdisc_dequeue_head
(
sch
);
if
(
skb
==
NULL
&&
!
red_is_idling
(
&
q
->
parms
))
red_start_of_idle_period
(
&
q
->
parms
);
return
skb
;
}
PSCHED_GET_TIME
(
q
->
qidlestart
);
return
NULL
;
}
static
unsigned
int
red_drop
(
struct
Qdisc
*
sch
)
...
...
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
struct
sk_buff
*
skb
;
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
skb
=
__skb_dequeue_tail
(
&
sch
->
q
);
skb
=
qdisc_dequeue_tail
(
sch
);
if
(
skb
)
{
unsigned
int
len
=
skb
->
len
;
sch
->
qstats
.
backlog
-=
len
;
sch
->
qstats
.
drops
++
;
q
->
st
.
other
++
;
kfree_skb
(
skb
);
q
->
stats
.
other
++
;
qdisc_drop
(
skb
,
sch
);
return
len
;
}
PSCHED_GET_TIME
(
q
->
qidlestart
);
if
(
!
red_is_idling
(
&
q
->
parms
))
red_start_of_idle_period
(
&
q
->
parms
);
return
0
;
}
...
...
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
{
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
__skb_queue_purge
(
&
sch
->
q
);
sch
->
qstats
.
backlog
=
0
;
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
q
->
qave
=
0
;
q
->
qcount
=
-
1
;
qdisc_reset_queue
(
sch
);
red_restart
(
&
q
->
parms
);
}
static
int
red_change
(
struct
Qdisc
*
sch
,
struct
rtattr
*
opt
)
{
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
struct
rtattr
*
tb
[
TCA_RED_
STAB
];
struct
rtattr
*
tb
[
TCA_RED_
MAX
];
struct
tc_red_qopt
*
ctl
;
if
(
opt
==
NULL
||
rtattr_parse_nested
(
tb
,
TCA_RED_STAB
,
opt
)
||
tb
[
TCA_RED_PARMS
-
1
]
==
0
||
tb
[
TCA_RED_STAB
-
1
]
==
0
||
if
(
opt
==
NULL
||
rtattr_parse_nested
(
tb
,
TCA_RED_MAX
,
opt
))
return
-
EINVAL
;
if
(
tb
[
TCA_RED_PARMS
-
1
]
==
NULL
||
RTA_PAYLOAD
(
tb
[
TCA_RED_PARMS
-
1
])
<
sizeof
(
*
ctl
)
||
RTA_PAYLOAD
(
tb
[
TCA_RED_STAB
-
1
])
<
256
)
tb
[
TCA_RED_STAB
-
1
]
==
NULL
||
RTA_PAYLOAD
(
tb
[
TCA_RED_STAB
-
1
])
<
RED_STAB_SIZE
)
return
-
EINVAL
;
ctl
=
RTA_DATA
(
tb
[
TCA_RED_PARMS
-
1
]);
sch_tree_lock
(
sch
);
q
->
flags
=
ctl
->
flags
;
q
->
Wlog
=
ctl
->
Wlog
;
q
->
Plog
=
ctl
->
Plog
;
q
->
Rmask
=
ctl
->
Plog
<
32
?
((
1
<<
ctl
->
Plog
)
-
1
)
:
~
0UL
;
q
->
Scell_log
=
ctl
->
Scell_log
;
q
->
Scell_max
=
(
255
<<
q
->
Scell_log
);
q
->
qth_min
=
ctl
->
qth_min
<<
ctl
->
Wlog
;
q
->
qth_max
=
ctl
->
qth_max
<<
ctl
->
Wlog
;
q
->
limit
=
ctl
->
limit
;
memcpy
(
q
->
Stab
,
RTA_DATA
(
tb
[
TCA_RED_STAB
-
1
]),
256
);
q
->
qcount
=
-
1
;
red_set_parms
(
&
q
->
parms
,
ctl
->
qth_min
,
ctl
->
qth_max
,
ctl
->
Wlog
,
ctl
->
Plog
,
ctl
->
Scell_log
,
RTA_DATA
(
tb
[
TCA_RED_STAB
-
1
]));
if
(
skb_queue_empty
(
&
sch
->
q
))
PSCHED_SET_PASTPERFECT
(
q
->
qidlestart
);
red_end_of_idle_period
(
&
q
->
parms
);
sch_tree_unlock
(
sch
);
return
0
;
}
...
...
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
static
int
red_dump
(
struct
Qdisc
*
sch
,
struct
sk_buff
*
skb
)
{
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
unsigned
char
*
b
=
skb
->
tail
;
struct
rtattr
*
rta
;
struct
tc_red_qopt
opt
;
rta
=
(
struct
rtattr
*
)
b
;
RTA_PUT
(
skb
,
TCA_OPTIONS
,
0
,
NULL
);
opt
.
limit
=
q
->
limit
;
opt
.
qth_min
=
q
->
qth_min
>>
q
->
Wlog
;
opt
.
qth_max
=
q
->
qth_max
>>
q
->
Wlog
;
opt
.
Wlog
=
q
->
Wlog
;
opt
.
Plog
=
q
->
Plog
;
opt
.
Scell_log
=
q
->
Scell_log
;
opt
.
flags
=
q
->
flags
;
struct
rtattr
*
opts
=
NULL
;
struct
tc_red_qopt
opt
=
{
.
limit
=
q
->
limit
,
.
flags
=
q
->
flags
,
.
qth_min
=
q
->
parms
.
qth_min
>>
q
->
parms
.
Wlog
,
.
qth_max
=
q
->
parms
.
qth_max
>>
q
->
parms
.
Wlog
,
.
Wlog
=
q
->
parms
.
Wlog
,
.
Plog
=
q
->
parms
.
Plog
,
.
Scell_log
=
q
->
parms
.
Scell_log
,
};
opts
=
RTA_NEST
(
skb
,
TCA_OPTIONS
);
RTA_PUT
(
skb
,
TCA_RED_PARMS
,
sizeof
(
opt
),
&
opt
);
rta
->
rta_len
=
skb
->
tail
-
b
;
return
skb
->
len
;
return
RTA_NEST_END
(
skb
,
opts
);
rtattr_failure:
skb_trim
(
skb
,
b
-
skb
->
data
);
return
-
1
;
return
RTA_NEST_CANCEL
(
skb
,
opts
);
}
static
int
red_dump_stats
(
struct
Qdisc
*
sch
,
struct
gnet_dump
*
d
)
{
struct
red_sched_data
*
q
=
qdisc_priv
(
sch
);
return
gnet_stats_copy_app
(
d
,
&
q
->
st
,
sizeof
(
q
->
st
));
struct
tc_red_xstats
st
=
{
.
early
=
q
->
stats
.
prob_drop
+
q
->
stats
.
forced_drop
,
.
pdrop
=
q
->
stats
.
pdrop
,
.
other
=
q
->
stats
.
other
,
.
marked
=
q
->
stats
.
prob_mark
+
q
->
stats
.
forced_mark
,
};
return
gnet_stats_copy_app
(
d
,
&
st
,
sizeof
(
st
));
}
static
struct
Qdisc_ops
red_qdisc_ops
=
{
.
next
=
NULL
,
.
cl_ops
=
NULL
,
.
id
=
"red"
,
.
priv_size
=
sizeof
(
struct
red_sched_data
),
.
enqueue
=
red_enqueue
,
...
...
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
{
return
register_qdisc
(
&
red_qdisc_ops
);
}
/*
 * Module exit hook for RED: hands red_qdisc_ops to unregister_qdisc().
 * Nothing else is torn down here.
 */
static void __exit red_module_exit(void)
{
	unregister_qdisc(&red_qdisc_ops);
}
module_init
(
red_module_init
)
module_exit
(
red_module_exit
)
MODULE_LICENSE
(
"GPL"
);
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment