Commit abb85ec3 authored by David S. Miller

Integrate NAPI work done by Jamal Hadi Salim, Robert Olsson,
and Alexey Kuznetsov.  This changeset adds
the framework and implementation, but drivers need to be
ported to NAPI in order to take advantage of the new
facilities.  NAPI is fully backwards compatible; current
drivers will continue to work as they always have.
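
For illustration only (not part of this changeset), an unported driver's
receive path keeps working unchanged: it still hands packets to the stack
with netif_rx(), which now queues them on the per-cpu backlog and schedules
the per-cpu backlog pseudo-device for polling instead of raising the
softirq directly.  A minimal sketch, where olddrv_next_rx_skb() is a
hypothetical placeholder for the hardware-specific receive helper:

static void olddrv_rx_interrupt(struct net_device *dev)
{
	struct sk_buff *skb;

	/* olddrv_next_rx_skb() is a placeholder for however the
	 * hardware hands over a received frame.
	 */
	while ((skb = olddrv_next_rx_skb(dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, dev);
		netif_rx(skb);		/* legacy entry point, unchanged */
	}
}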

NAPI is a way of dealing with high packet load.  It allows
the driver to disable RX interrupts on the card and enter a
polling mode.  Another way to describe NAPI is as implicit
interrupt mitigation.  Once the device enters this polling
mode, it drops back to interrupt-based processing when the
receive packet queue has been drained.
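
As a rough sketch of that model (again, not part of this changeset; all
mydrv_* names are hypothetical placeholders, while netif_rx_schedule(),
netif_receive_skb() and netif_rx_complete() are the primitives added by
this patch), a ported driver masks its RX interrupt and lets its ->poll()
method drain the ring under a quota:

static void mydrv_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	struct net_device *dev = dev_id;

	if (mydrv_rx_pending(dev)) {		/* placeholder hw check */
		/* Mask further RX interrupts and put the device on
		 * the per-cpu poll list; the softirq will call
		 * dev->poll() for us.
		 */
		mydrv_disable_rx_irq(dev);
		netif_rx_schedule(dev);
	}
}

static int mydrv_poll(struct net_device *dev, int *budget)
{
	int work_done = 0;
	int quota = min(*budget, dev->quota);

	while (work_done < quota) {
		struct sk_buff *skb = mydrv_next_rx_skb(dev);	/* placeholder */

		if (skb == NULL)
			break;
		skb->protocol = eth_type_trans(skb, dev);
		netif_receive_skb(skb);	/* direct delivery, no netif_rx() */
		work_done++;
	}

	dev->quota -= work_done;
	*budget -= work_done;

	if (mydrv_rx_ring_empty(dev)) {		/* placeholder hw check */
		/* Ring drained: leave polling mode and unmask the
		 * RX interrupt again.
		 */
		netif_rx_complete(dev);
		mydrv_enable_rx_irq(dev);
		return 0;
	}
	return 1;	/* quota exhausted, stay on the poll list */
}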

A full porting and description document can be found in
Documentation/networking/NAPI_HOWTO.txt, which also references
Usenix papers and other NAPI resources available on the web.

NAPI has been found not only to increase packet processing
rates, but also to give greater fairness to other interfaces
in the system that are not experiencing high packet load.
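
A driver is hooked into the new framework through the dev->poll and
dev->weight fields added below; a minimal, hypothetical probe-time sketch
(mydrv_probe and mydrv_poll are placeholders, not part of this changeset):

static int __init mydrv_probe(struct net_device *dev)
{
	/* hardware setup omitted (placeholder) */

	dev->poll   = mydrv_poll;
	dev->weight = 64;	/* share of the softirq budget this device
				 * may consume per poll round; mirrors the
				 * per-device weight used by net_rx_action()
				 * in the diff below */
	return 0;
}
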
parent c3a12cc5
......@@ -206,7 +206,8 @@ enum netdev_state_t
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_SCHED,
__LINK_STATE_NOCARRIER
__LINK_STATE_NOCARRIER,
__LINK_STATE_RX_SCHED
};
......@@ -330,6 +331,10 @@ struct net_device
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
struct list_head poll_list; /* Link to poll list */
int quota;
int weight;
struct Qdisc *qdisc;
struct Qdisc *qdisc_sleeping;
struct Qdisc *qdisc_list;
......@@ -373,6 +378,7 @@ struct net_device
int (*stop)(struct net_device *dev);
int (*hard_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
int (*poll) (struct net_device *dev, int *quota);
int (*hard_header) (struct sk_buff *skb,
struct net_device *dev,
unsigned short type,
......@@ -492,8 +498,11 @@ struct softnet_data
int cng_level;
int avg_blog;
struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct net_device *output_queue;
struct sk_buff *completion_queue;
struct net_device backlog_dev; /* Sorry. 8) */
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
......@@ -547,6 +556,7 @@ static inline int netif_running(struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
*/
......@@ -578,6 +588,8 @@ static inline void dev_kfree_skb_any(struct sk_buff *skb)
extern void net_call_rx_atomic(void (*fn)(void));
#define HAVE_NETIF_RX 1
extern int netif_rx(struct sk_buff *skb);
#define HAVE_NETIF_RECEIVE_SKB 1
extern int netif_receive_skb(struct sk_buff *skb);
extern int dev_ioctl(unsigned int cmd, void *);
extern int dev_change_flags(struct net_device *, unsigned);
extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
......@@ -695,6 +707,78 @@ enum {
#define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA)
/* Schedule rx intr now? */

static inline int netif_rx_schedule_prep(struct net_device *dev)
{
	return netif_running(dev) &&
		!test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
}

/* Add interface to tail of rx poll list. This assumes that _prep has
 * already been called and returned 1.
 */

static inline void __netif_rx_schedule(struct net_device *dev)
{
	unsigned long flags;
	int cpu = smp_processor_id();

	local_irq_save(flags);
	dev_hold(dev);
	list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list);
	if (dev->quota < 0)
		dev->quota += dev->weight;
	else
		dev->quota = dev->weight;
	__cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
	local_irq_restore(flags);
}

/* Try to reschedule poll. Called by irq handler. */

static inline void netif_rx_schedule(struct net_device *dev)
{
	if (netif_rx_schedule_prep(dev))
		__netif_rx_schedule(dev);
}

/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete().
 * Do not inline this?
 */

static inline int netif_rx_reschedule(struct net_device *dev, int undo)
{
	if (netif_rx_schedule_prep(dev)) {
		unsigned long flags;
		int cpu = smp_processor_id();

		dev->quota += undo;

		local_irq_save(flags);
		list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list);
		__cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
		local_irq_restore(flags);
		return 1;
	}
	return 0;
}

/* Remove interface from poll list: it must be in the poll list
 * on current cpu. This primitive is called by dev->poll(), when
 * it completes the work. The device cannot be out of poll list at this
 * moment, it is BUG().
 */
static inline void netif_rx_complete(struct net_device *dev)
{
	unsigned long flags;

	local_irq_save(flags);
	if (!test_bit(__LINK_STATE_RX_SCHED, &dev->state)) BUG();
	list_del(&dev->poll_list);
	clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
	local_irq_restore(flags);
}
/* These functions live elsewhere (drivers/net/net_init.c, but related) */
extern void ether_setup(struct net_device *dev);
......@@ -719,6 +803,7 @@ extern void dev_mcast_init(void);
extern int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev));
extern void netdev_unregister_fc(int bit);
extern int netdev_max_backlog;
extern int weight_p;
extern unsigned long netdev_fc_xoff;
extern atomic_t netdev_dropping;
extern int netdev_set_master(struct net_device *dev, struct net_device *master);
......
......@@ -202,7 +202,8 @@ enum
NET_CORE_NO_CONG_THRESH=13,
NET_CORE_NO_CONG=14,
NET_CORE_LO_CONG=15,
NET_CORE_MOD_CONG=16
NET_CORE_MOD_CONG=16,
NET_CORE_DEV_WEIGHT=17
};
/* /proc/sys/net/ethernet */
......
......@@ -798,6 +798,19 @@ int dev_close(struct net_device *dev)
clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch poll list,
* it can be even on different cpu. So just clear netif_running(),
* and wait when poll really will happen. Actually, the best place
* for this is inside dev->stop() after device stopped its irq
* engine, but this requires more changes in devices. */
smp_mb__after_clear_bit(); /* Commit netif_running(). */
while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
/* No hurry. */
current->state = TASK_INTERRUPTIBLE;
schedule_timeout(1);
}
/*
* Call the device specific close. This cannot fail.
* Only if device is UP
......@@ -1072,6 +1085,7 @@ int dev_queue_xmit(struct sk_buff *skb)
=======================================================================*/
int netdev_max_backlog = 300;
int weight_p = 64; /* old backlog weight */
/* These numbers are selected based on intuition and some
* experimentation; if you have a more scientific way of doing this
* please go ahead and fix things.
......@@ -1237,13 +1251,11 @@ int netif_rx(struct sk_buff *skb)
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue,skb);
/* Runs from irqs or BH's, no need to wake BH */
cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_restore(flags);
#ifndef OFFLINE_SAMPLE
get_sample_stats(this_cpu);
#endif
return softnet_data[this_cpu].cng_level;
return queue->cng_level;
}
if (queue->throttle) {
......@@ -1253,6 +1265,8 @@ int netif_rx(struct sk_buff *skb)
netdev_wakeup();
#endif
}
netif_rx_schedule(&queue->backlog_dev);
goto enqueue;
}
......@@ -1308,19 +1322,12 @@ static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int
return ret;
}
/* Reparent skb to master device. This function is called
* only from net_rx_action under BR_NETPROTO_LOCK. It is misuse
* of BR_NETPROTO_LOCK, but it is OK for now.
*/
static __inline__ void skb_bond(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
if (dev->master) {
dev_hold(dev->master);
if (dev->master)
skb->dev = dev->master;
dev_put(dev);
}
}
static void net_tx_action(struct softirq_action *h)
......@@ -1416,121 +1423,138 @@ static inline void handle_diverter(struct sk_buff *skb)
}
#endif /* CONFIG_NET_DIVERT */
static void net_rx_action(struct softirq_action *h)
int netif_receive_skb(struct sk_buff *skb)
{
int this_cpu = smp_processor_id();
struct softnet_data *queue = &softnet_data[this_cpu];
unsigned long start_time = jiffies;
int bugdet = netdev_max_backlog;
br_read_lock(BR_NETPROTO_LOCK);
for (;;) {
struct sk_buff *skb;
struct net_device *rx_dev;
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
local_irq_enable();
struct packet_type *ptype, *pt_prev;
int ret = NET_RX_DROP;
unsigned short type = skb->protocol;
if (skb == NULL)
break;
if (skb->stamp.tv_sec == 0)
do_gettimeofday(&skb->stamp);
skb_bond(skb);
skb_bond(skb);
rx_dev = skb->dev;
netdev_rx_stat[smp_processor_id()].total++;
#ifdef CONFIG_NET_FASTROUTE
if (skb->pkt_type == PACKET_FASTROUTE) {
netdev_rx_stat[this_cpu].fastroute_deferred_out++;
dev_queue_xmit(skb);
dev_put(rx_dev);
continue;
}
if (skb->pkt_type == PACKET_FASTROUTE) {
netdev_rx_stat[smp_processor_id()].fastroute_deferred_out++;
return dev_queue_xmit(skb);
}
#endif
skb->h.raw = skb->nh.raw = skb->data;
{
struct packet_type *ptype, *pt_prev;
unsigned short type = skb->protocol;
pt_prev = NULL;
for (ptype = ptype_all; ptype; ptype = ptype->next) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev) {
if (!pt_prev->data) {
deliver_to_old_ones(pt_prev, skb, 0);
} else {
atomic_inc(&skb->users);
pt_prev->func(skb,
skb->dev,
pt_prev);
}
}
pt_prev = ptype;
skb->h.raw = skb->nh.raw = skb->data;
pt_prev = NULL;
for (ptype = ptype_all; ptype; ptype = ptype->next) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev) {
if (!pt_prev->data) {
ret = deliver_to_old_ones(pt_prev, skb, 0);
} else {
atomic_inc(&skb->users);
ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_DIVERT
if (skb->dev->divert && skb->dev->divert->divert)
handle_diverter(skb);
if (skb->dev->divert && skb->dev->divert->divert)
ret = handle_diverter(skb);
#endif /* CONFIG_NET_DIVERT */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
if (skb->dev->br_port != NULL &&
br_handle_frame_hook != NULL) {
handle_bridge(skb, pt_prev);
dev_put(rx_dev);
continue;
}
if (skb->dev->br_port != NULL &&
br_handle_frame_hook != NULL) {
return handle_bridge(skb, pt_prev);
}
#endif
for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev) {
if (!pt_prev->data)
deliver_to_old_ones(pt_prev, skb, 0);
else {
atomic_inc(&skb->users);
pt_prev->func(skb,
skb->dev,
pt_prev);
}
}
pt_prev = ptype;
for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev) {
if (!pt_prev->data) {
ret = deliver_to_old_ones(pt_prev, skb, 0);
} else {
atomic_inc(&skb->users);
ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
pt_prev = ptype;
}
}
if (pt_prev) {
if (!pt_prev->data)
deliver_to_old_ones(pt_prev, skb, 1);
else
pt_prev->func(skb, skb->dev, pt_prev);
} else
kfree_skb(skb);
if (pt_prev) {
if (!pt_prev->data) {
ret = deliver_to_old_ones(pt_prev, skb, 1);
} else {
ret = pt_prev->func(skb, skb->dev, pt_prev);
}
} else {
kfree_skb(skb);
/* Jamal, now you will not be able to escape explaining
* to me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
dev_put(rx_dev);
return ret;
}
if (bugdet-- < 0 || jiffies - start_time > 1)
goto softnet_break;
static int process_backlog(struct net_device *backlog_dev, int *budget)
{
int work = 0;
int quota = min(backlog_dev->quota, *budget);
int this_cpu = smp_processor_id();
struct softnet_data *queue = &softnet_data[this_cpu];
unsigned long start_time = jiffies;
for (;;) {
struct sk_buff *skb;
struct net_device *dev;
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
if (skb == NULL)
goto job_done;
local_irq_enable();
dev = skb->dev;
netif_receive_skb(skb);
dev_put(dev);
work++;
if (work >= quota || jiffies - start_time > 1)
break;
#ifdef CONFIG_NET_HW_FLOWCONTROL
if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
if (atomic_dec_and_test(&netdev_dropping)) {
queue->throttle = 0;
netdev_wakeup();
goto softnet_break;
if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
if (atomic_dec_and_test(&netdev_dropping)) {
queue->throttle = 0;
netdev_wakeup();
break;
}
}
}
#endif
}
br_read_unlock(BR_NETPROTO_LOCK);
local_irq_disable();
backlog_dev->quota -= work;
*budget -= work;
return -1;
job_done:
backlog_dev->quota -= work;
*budget -= work;
list_del(&backlog_dev->poll_list);
clear_bit(__LINK_STATE_RX_SCHED, &backlog_dev->state);
if (queue->throttle) {
queue->throttle = 0;
#ifdef CONFIG_NET_HW_FLOWCONTROL
......@@ -1539,21 +1563,53 @@ static void net_rx_action(struct softirq_action *h)
#endif
}
local_irq_enable();
return 0;
}
NET_PROFILE_LEAVE(softnet_process);
return;
static void net_rx_action(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct softnet_data *queue = &softnet_data[this_cpu];
unsigned long start_time = jiffies;
int budget = netdev_max_backlog;
softnet_break:
br_read_lock(BR_NETPROTO_LOCK);
local_irq_disable();
while (!list_empty(&queue->poll_list)) {
struct net_device *dev;
if (budget <= 0 || jiffies - start_time > 1)
goto softnet_break;
local_irq_enable();
dev = list_entry(queue->poll_list.next, struct net_device, poll_list);
if (dev->quota <= 0 || dev->poll(dev, &budget)) {
local_irq_disable();
list_del(&dev->poll_list);
list_add_tail(&dev->poll_list, &queue->poll_list);
if (dev->quota < 0)
dev->quota += dev->weight;
else
dev->quota = dev->weight;
} else {
dev_put(dev);
local_irq_disable();
}
}
local_irq_enable();
br_read_unlock(BR_NETPROTO_LOCK);
return;
local_irq_disable();
softnet_break:
netdev_rx_stat[this_cpu].time_squeeze++;
/* This already runs in BH context, no need to wake up BH's */
cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_enable();
__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
NET_PROFILE_LEAVE(softnet_process);
return;
local_irq_enable();
br_read_unlock(BR_NETPROTO_LOCK);
}
static gifconf_func_t * gifconf_list [NPROTO];
......@@ -2626,6 +2682,7 @@ int __init net_dev_init(void)
if (!dev_boot_phase)
return 0;
#ifdef CONFIG_NET_DIVERT
dv_init();
#endif /* CONFIG_NET_DIVERT */
......@@ -2643,8 +2700,13 @@ int __init net_dev_init(void)
queue->cng_level = 0;
queue->avg_blog = 10; /* arbitrary non-zero */
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
queue->backlog_dev.weight = weight_p;
queue->backlog_dev.poll = process_backlog;
atomic_set(&queue->backlog_dev.refcnt, 1);
}
#ifdef CONFIG_NET_PROFILE
net_profile_init();
NET_PROFILE_REGISTER(dev_queue_xmit);
......@@ -2744,7 +2806,6 @@ int __init net_dev_init(void)
#ifdef CONFIG_NET_SCHED
pktsched_init();
#endif
/*
* Initialise network devices
*/
......
......@@ -12,6 +12,7 @@
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
extern int weight_p;
extern int no_cong_thresh;
extern int no_cong;
extern int lo_cong;
......@@ -47,6 +48,9 @@ ctl_table core_table[] = {
{NET_CORE_RMEM_DEFAULT, "rmem_default",
&sysctl_rmem_default, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_CORE_DEV_WEIGHT, "dev_weight",
&weight_p, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
&netdev_max_backlog, sizeof(int), 0644, NULL,
&proc_dointvec},
......
......@@ -490,6 +490,7 @@ EXPORT_SYMBOL(__kfree_skb);
EXPORT_SYMBOL(skb_clone);
EXPORT_SYMBOL(skb_copy);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_get);
......