Commit abb85ec3 authored by David S. Miller

Integrate NAPI work done by Jamal Hadi Salim, Robert Olsson,
and Alexey Kuznetsov.  This changeset adds
the framework and implementation, but drivers need to be
ported to NAPI in order to take advantage of the new
facilities.  NAPI is fully backwards compatible; current
drivers will continue to work as they always have.
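
For illustration only (not part of this changeset), an unported driver's
receive path keeps working unchanged: it still hands packets to the stack
with netif_rx(), which now queues them on the per-cpu backlog and schedules
the per-cpu backlog pseudo-device for polling instead of raising the
softirq directly.  A minimal sketch, where olddrv_next_rx_skb() is a
hypothetical placeholder for the hardware-specific receive helper:

static void olddrv_rx_interrupt(struct net_device *dev)
{
	struct sk_buff *skb;

	/* olddrv_next_rx_skb() is a placeholder for however the
	 * hardware hands over a received frame.
	 */
	while ((skb = olddrv_next_rx_skb(dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, dev);
		netif_rx(skb);		/* legacy entry point, unchanged */
	}
}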

NAPI is a way of dealing with high packet load.  It allows
the driver to disable RX interrupts on the card and enter a
polling mode.  Another way to describe NAPI is as implicit
interrupt mitigation.  Once the device enters this polling
mode, it drops back to interrupt-based processing when the
receive packet queue has been drained.
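
As a rough sketch of that model (again, not part of this changeset; all
mydrv_* names are hypothetical placeholders, while netif_rx_schedule(),
netif_receive_skb() and netif_rx_complete() are the primitives added by
this patch), a ported driver masks its RX interrupt and lets its ->poll()
method drain the ring under a quota:

static void mydrv_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	struct net_device *dev = dev_id;

	if (mydrv_rx_pending(dev)) {		/* placeholder hw check */
		/* Mask further RX interrupts and put the device on
		 * the per-cpu poll list; the softirq will call
		 * dev->poll() for us.
		 */
		mydrv_disable_rx_irq(dev);
		netif_rx_schedule(dev);
	}
}

static int mydrv_poll(struct net_device *dev, int *budget)
{
	int work_done = 0;
	int quota = min(*budget, dev->quota);

	while (work_done < quota) {
		struct sk_buff *skb = mydrv_next_rx_skb(dev);	/* placeholder */

		if (skb == NULL)
			break;
		skb->protocol = eth_type_trans(skb, dev);
		netif_receive_skb(skb);	/* direct delivery, no netif_rx() */
		work_done++;
	}

	dev->quota -= work_done;
	*budget -= work_done;

	if (mydrv_rx_ring_empty(dev)) {		/* placeholder hw check */
		/* Ring drained: leave polling mode and unmask the
		 * RX interrupt again.
		 */
		netif_rx_complete(dev);
		mydrv_enable_rx_irq(dev);
		return 0;
	}
	return 1;	/* quota exhausted, stay on the poll list */
}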

A full porting and description document can be found in
Documentation/networking/NAPI_HOWTO.txt, which also references
Usenix papers and other NAPI resources available on the web.

NAPI has been found not only to increase packet processing
rates, but also to give greater fairness to other interfaces
in the system that are not experiencing high packet load.
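
A driver is hooked into the new framework through the dev->poll and
dev->weight fields added below; a minimal, hypothetical probe-time sketch
(mydrv_probe and mydrv_poll are placeholders, not part of this changeset):

static int __init mydrv_probe(struct net_device *dev)
{
	/* hardware setup omitted (placeholder) */

	dev->poll   = mydrv_poll;
	dev->weight = 64;	/* share of the softirq budget this device
				 * may consume per poll round; mirrors the
				 * per-device weight used by net_rx_action()
				 * in the diff below */
	return 0;
}
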
parent c3a12cc5
......@@ -206,7 +206,8 @@ enum netdev_state_t
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_SCHED,
__LINK_STATE_NOCARRIER
__LINK_STATE_NOCARRIER,
__LINK_STATE_RX_SCHED
};
......@@ -330,6 +331,10 @@ struct net_device
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
struct list_head poll_list; /* Link to poll list */
int quota;
int weight;
struct Qdisc *qdisc;
struct Qdisc *qdisc_sleeping;
struct Qdisc *qdisc_list;
......@@ -373,6 +378,7 @@ struct net_device
int (*stop)(struct net_device *dev);
int (*hard_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
int (*poll) (struct net_device *dev, int *quota);
int (*hard_header) (struct sk_buff *skb,
struct net_device *dev,
unsigned short type,
......@@ -492,8 +498,11 @@ struct softnet_data
int cng_level;
int avg_blog;
struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct net_device *output_queue;
struct sk_buff *completion_queue;
struct net_device backlog_dev; /* Sorry. 8) */
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
......@@ -547,6 +556,7 @@ static inline int netif_running(struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
*/
......@@ -578,6 +588,8 @@ static inline void dev_kfree_skb_any(struct sk_buff *skb)
extern void net_call_rx_atomic(void (*fn)(void));
#define HAVE_NETIF_RX 1
extern int netif_rx(struct sk_buff *skb);
#define HAVE_NETIF_RECEIVE_SKB 1
extern int netif_receive_skb(struct sk_buff *skb);
extern int dev_ioctl(unsigned int cmd, void *);
extern int dev_change_flags(struct net_device *, unsigned);
extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
......@@ -695,6 +707,78 @@ enum {
#define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA)
/* Schedule rx intr now? */

static inline int netif_rx_schedule_prep(struct net_device *dev)
{
	return netif_running(dev) &&
		!test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
}

/* Add interface to tail of rx poll list. This assumes that _prep has
 * already been called and returned 1.
 */

static inline void __netif_rx_schedule(struct net_device *dev)
{
	unsigned long flags;
	int cpu = smp_processor_id();

	local_irq_save(flags);
	dev_hold(dev);
	list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list);
	if (dev->quota < 0)
		dev->quota += dev->weight;
	else
		dev->quota = dev->weight;
	__cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
	local_irq_restore(flags);
}

/* Try to reschedule poll. Called by irq handler. */

static inline void netif_rx_schedule(struct net_device *dev)
{
	if (netif_rx_schedule_prep(dev))
		__netif_rx_schedule(dev);
}

/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete().
 * Do not inline this?
 */

static inline int netif_rx_reschedule(struct net_device *dev, int undo)
{
	if (netif_rx_schedule_prep(dev)) {
		unsigned long flags;
		int cpu = smp_processor_id();

		dev->quota += undo;

		local_irq_save(flags);
		list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list);
		__cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
		local_irq_restore(flags);
		return 1;
	}
	return 0;
}

/* Remove interface from poll list: it must be in the poll list
 * on current cpu. This primitive is called by dev->poll(), when
 * it completes the work. The device cannot be out of poll list at this
 * moment, it is BUG().
 */
static inline void netif_rx_complete(struct net_device *dev)
{
	unsigned long flags;

	local_irq_save(flags);
	if (!test_bit(__LINK_STATE_RX_SCHED, &dev->state)) BUG();
	list_del(&dev->poll_list);
	clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
	local_irq_restore(flags);
}
/* These functions live elsewhere (drivers/net/net_init.c, but related) */
extern void ether_setup(struct net_device *dev);
......@@ -719,6 +803,7 @@ extern void dev_mcast_init(void);
extern int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev));
extern void netdev_unregister_fc(int bit);
extern int netdev_max_backlog;
extern int weight_p;
extern unsigned long netdev_fc_xoff;
extern atomic_t netdev_dropping;
extern int netdev_set_master(struct net_device *dev, struct net_device *master);
......
......@@ -202,7 +202,8 @@ enum
NET_CORE_NO_CONG_THRESH=13,
NET_CORE_NO_CONG=14,
NET_CORE_LO_CONG=15,
NET_CORE_MOD_CONG=16
NET_CORE_MOD_CONG=16,
NET_CORE_DEV_WEIGHT=17
};
/* /proc/sys/net/ethernet */
......
......@@ -798,6 +798,19 @@ int dev_close(struct net_device *dev)
clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch poll list,
* it can be even on different cpu. So just clear netif_running(),
* and wait when poll really will happen. Actually, the best place
* for this is inside dev->stop() after device stopped its irq
* engine, but this requires more changes in devices. */
smp_mb__after_clear_bit(); /* Commit netif_running(). */
while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
/* No hurry. */
current->state = TASK_INTERRUPTIBLE;
schedule_timeout(1);
}
/*
* Call the device specific close. This cannot fail.
* Only if device is UP
......@@ -1072,6 +1085,7 @@ int dev_queue_xmit(struct sk_buff *skb)
=======================================================================*/
int netdev_max_backlog = 300;
int weight_p = 64; /* old backlog weight */
/* These numbers are selected based on intuition and some
* experimentation; if you have a more scientific way of doing this
* please go ahead and fix things.
......@@ -1237,13 +1251,11 @@ int netif_rx(struct sk_buff *skb)
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue,skb);
/* Runs from irqs or BH's, no need to wake BH */
cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_restore(flags);
#ifndef OFFLINE_SAMPLE
get_sample_stats(this_cpu);
#endif
return softnet_data[this_cpu].cng_level;
return queue->cng_level;
}
if (queue->throttle) {
......@@ -1253,6 +1265,8 @@ int netif_rx(struct sk_buff *skb)
netdev_wakeup();
#endif
}
netif_rx_schedule(&queue->backlog_dev);
goto enqueue;
}
......@@ -1308,19 +1322,12 @@ static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int
return ret;
}
/* Reparent skb to master device. This function is called
* only from net_rx_action under BR_NETPROTO_LOCK. It is misuse
* of BR_NETPROTO_LOCK, but it is OK for now.
*/
static __inline__ void skb_bond(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
if (dev->master) {
dev_hold(dev->master);
if (dev->master)
skb->dev = dev->master;
dev_put(dev);
}
}
static void net_tx_action(struct softirq_action *h)
......@@ -1416,121 +1423,138 @@ static inline void handle_diverter(struct sk_buff *skb)
}
#endif /* CONFIG_NET_DIVERT */
static void net_rx_action(struct softirq_action *h)
int netif_receive_skb(struct sk_buff *skb)
{
int this_cpu = smp_processor_id();
struct softnet_data *queue = &softnet_data[this_cpu];
unsigned long start_time = jiffies;
int bugdet = netdev_max_backlog;
br_read_lock(BR_NETPROTO_LOCK);
for (;;) {
struct sk_buff *skb;
struct net_device *rx_dev;
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
local_irq_enable();
struct packet_type *ptype, *pt_prev;
int ret = NET_RX_DROP;
unsigned short type = skb->protocol;
if (skb == NULL)
break;
if (skb->stamp.tv_sec == 0)
do_gettimeofday(&skb->stamp);
skb_bond(skb);
skb_bond(skb);
rx_dev = skb->dev;
netdev_rx_stat[smp_processor_id()].total++;
#ifdef CONFIG_NET_FASTROUTE
if (skb->pkt_type == PACKET_FASTROUTE) {
netdev_rx_stat[this_cpu].fastroute_deferred_out++;
dev_queue_xmit(skb);
dev_put(rx_dev);
continue;
}
if (skb->pkt_type == PACKET_FASTROUTE) {
netdev_rx_stat[smp_processor_id()].fastroute_deferred_out++;
return dev_queue_xmit(skb);
}
#endif
skb->h.raw = skb->nh.raw = skb->data;
{
struct packet_type *ptype, *pt_prev;
unsigned short type = skb->protocol;
pt_prev = NULL;
for (ptype = ptype_all; ptype; ptype = ptype->next) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev) {
if (!pt_prev->data) {
deliver_to_old_ones(pt_prev, skb, 0);
} else {
atomic_inc(&skb->users);
pt_prev->func(skb,
skb->dev,
pt_prev);
}
}
pt_prev = ptype;
skb->h.raw = skb->nh.raw = skb->data;
pt_prev = NULL;
for (ptype = ptype_all; ptype; ptype = ptype->next) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev) {
if (!pt_prev->data) {
ret = deliver_to_old_ones(pt_prev, skb, 0);
} else {
atomic_inc(&skb->users);
ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_DIVERT
if (skb->dev->divert && skb->dev->divert->divert)
handle_diverter(skb);
if (skb->dev->divert && skb->dev->divert->divert)
ret = handle_diverter(skb);
#endif /* CONFIG_NET_DIVERT */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
if (skb->dev->br_port != NULL &&
br_handle_frame_hook != NULL) {
handle_bridge(skb, pt_prev);
dev_put(rx_dev);
continue;
}
if (skb->dev->br_port != NULL &&
br_handle_frame_hook != NULL) {
return handle_bridge(skb, pt_prev);
}
#endif
for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev) {
if (!pt_prev->data)
deliver_to_old_ones(pt_prev, skb, 0);
else {
atomic_inc(&skb->users);
pt_prev->func(skb,
skb->dev,
pt_prev);
}
}
pt_prev = ptype;
for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev) {
if (!pt_prev->data) {
ret = deliver_to_old_ones(pt_prev, skb, 0);
} else {
atomic_inc(&skb->users);
ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
pt_prev = ptype;
}
}
if (pt_prev) {
if (!pt_prev->data)
deliver_to_old_ones(pt_prev, skb, 1);
else
pt_prev->func(skb, skb->dev, pt_prev);
} else
kfree_skb(skb);
if (pt_prev) {
if (!pt_prev->data) {
ret = deliver_to_old_ones(pt_prev, skb, 1);
} else {
ret = pt_prev->func(skb, skb->dev, pt_prev);
}
} else {
kfree_skb(skb);
/* Jamal, now you will not be able to escape explaining
* to me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
dev_put(rx_dev);
return ret;
}
if (bugdet-- < 0 || jiffies - start_time > 1)
goto softnet_break;
static int process_backlog(struct net_device *backlog_dev, int *budget)
{
int work = 0;
int quota = min(backlog_dev->quota, *budget);
int this_cpu = smp_processor_id();
struct softnet_data *queue = &softnet_data[this_cpu];
unsigned long start_time = jiffies;
for (;;) {
struct sk_buff *skb;
struct net_device *dev;
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
if (skb == NULL)
goto job_done;
local_irq_enable();
dev = skb->dev;
netif_receive_skb(skb);
dev_put(dev);
work++;
if (work >= quota || jiffies - start_time > 1)
break;
#ifdef CONFIG_NET_HW_FLOWCONTROL
if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
if (atomic_dec_and_test(&netdev_dropping)) {
queue->throttle = 0;
netdev_wakeup();
goto softnet_break;
if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
if (atomic_dec_and_test(&netdev_dropping)) {
queue->throttle = 0;
netdev_wakeup();
break;
}
}
}
#endif
}
br_read_unlock(BR_NETPROTO_LOCK);
local_irq_disable();
backlog_dev->quota -= work;
*budget -= work;
return -1;
job_done:
backlog_dev->quota -= work;
*budget -= work;
list_del(&backlog_dev->poll_list);
clear_bit(__LINK_STATE_RX_SCHED, &backlog_dev->state);
if (queue->throttle) {
queue->throttle = 0;
#ifdef CONFIG_NET_HW_FLOWCONTROL
......@@ -1539,21 +1563,53 @@ static void net_rx_action(struct softirq_action *h)
#endif
}
local_irq_enable();
return 0;
}
NET_PROFILE_LEAVE(softnet_process);
return;
static void net_rx_action(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct softnet_data *queue = &softnet_data[this_cpu];
unsigned long start_time = jiffies;
int budget = netdev_max_backlog;
softnet_break:
br_read_lock(BR_NETPROTO_LOCK);
local_irq_disable();
while (!list_empty(&queue->poll_list)) {
struct net_device *dev;
if (budget <= 0 || jiffies - start_time > 1)
goto softnet_break;
local_irq_enable();
dev = list_entry(queue->poll_list.next, struct net_device, poll_list);
if (dev->quota <= 0 || dev->poll(dev, &budget)) {
local_irq_disable();
list_del(&dev->poll_list);
list_add_tail(&dev->poll_list, &queue->poll_list);
if (dev->quota < 0)
dev->quota += dev->weight;
else
dev->quota = dev->weight;
} else {
dev_put(dev);
local_irq_disable();
}
}
local_irq_enable();
br_read_unlock(BR_NETPROTO_LOCK);
return;
local_irq_disable();
softnet_break:
netdev_rx_stat[this_cpu].time_squeeze++;
/* This already runs in BH context, no need to wake up BH's */
cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_enable();
__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
NET_PROFILE_LEAVE(softnet_process);
return;
local_irq_enable();
br_read_unlock(BR_NETPROTO_LOCK);
}
static gifconf_func_t * gifconf_list [NPROTO];
......@@ -2626,6 +2682,7 @@ int __init net_dev_init(void)
if (!dev_boot_phase)
return 0;
#ifdef CONFIG_NET_DIVERT
dv_init();
#endif /* CONFIG_NET_DIVERT */
......@@ -2643,8 +2700,13 @@ int __init net_dev_init(void)
queue->cng_level = 0;
queue->avg_blog = 10; /* arbitrary non-zero */
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
queue->backlog_dev.weight = weight_p;
queue->backlog_dev.poll = process_backlog;
atomic_set(&queue->backlog_dev.refcnt, 1);
}
#ifdef CONFIG_NET_PROFILE
net_profile_init();
NET_PROFILE_REGISTER(dev_queue_xmit);
......@@ -2744,7 +2806,6 @@ int __init net_dev_init(void)
#ifdef CONFIG_NET_SCHED
pktsched_init();
#endif
/*
* Initialise network devices
*/
......
......@@ -12,6 +12,7 @@
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
extern int weight_p;
extern int no_cong_thresh;
extern int no_cong;
extern int lo_cong;
......@@ -47,6 +48,9 @@ ctl_table core_table[] = {
{NET_CORE_RMEM_DEFAULT, "rmem_default",
&sysctl_rmem_default, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_CORE_DEV_WEIGHT, "dev_weight",
&weight_p, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
&netdev_max_backlog, sizeof(int), 0644, NULL,
&proc_dointvec},
......
......@@ -490,6 +490,7 @@ EXPORT_SYMBOL(__kfree_skb);
EXPORT_SYMBOL(skb_clone);
EXPORT_SYMBOL(skb_copy);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_get);
......