Commit 29b4433d, authored by Eric Dumazet, committed by David S. Miller

net: percpu net_device refcount

We tried very hard to remove all possible dev_hold()/dev_put() pairs in
the network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet)

We can switch to a percpu refcount implementation, now that the dynamic
per_cpu infrastructure is mature. On a 64-cpu machine, this consumes 256
bytes per device.

On x86, dev_hold(dev) code :

before
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f0b9f472
...@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt) ...@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
nesibdev = nesvnic->nesibdev; nesibdev = nesvnic->nesibdev;
nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
atomic_read(&nesvnic->netdev->refcnt)); netdev_refcnt_read(nesvnic->netdev));
if (nesqp->active_conn) { if (nesqp->active_conn) {
...@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ...@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
atomic_inc(&cm_accepts); atomic_inc(&cm_accepts);
nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
atomic_read(&nesvnic->netdev->refcnt)); netdev_refcnt_read(nesvnic->netdev));
/* allocate the ietf frame and space for private data */ /* allocate the ietf frame and space for private data */
nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
......
...@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, ...@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
atomic_read(&nesvnic->netdev->refcnt)); netdev_refcnt_read(nesvnic->netdev));
err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
nesadapter->max_pd, &pd_num, &nesadapter->next_pd); nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
...@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, ...@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
/* update the QP table */ /* update the QP table */
nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
atomic_read(&nesvnic->netdev->refcnt)); netdev_refcnt_read(nesvnic->netdev));
return &nesqp->ibqp; return &nesqp->ibqp;
} }
......
...@@ -1026,7 +1026,7 @@ struct net_device { ...@@ -1026,7 +1026,7 @@ struct net_device {
struct timer_list watchdog_timer; struct timer_list watchdog_timer;
/* Number of references to this device */ /* Number of references to this device */
atomic_t refcnt ____cacheline_aligned_in_smp; int __percpu *pcpu_refcnt;
/* delayed register/unregister */ /* delayed register/unregister */
struct list_head todo_list; struct list_head todo_list;
...@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev) ...@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev)
unregister_netdevice_queue(dev, NULL); unregister_netdevice_queue(dev, NULL);
} }
extern int netdev_refcnt_read(const struct net_device *dev);
extern void free_netdev(struct net_device *dev); extern void free_netdev(struct net_device *dev);
extern void synchronize_net(void); extern void synchronize_net(void);
extern int register_netdevice_notifier(struct notifier_block *nb); extern int register_netdevice_notifier(struct notifier_block *nb);
...@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void); ...@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void);
*/ */
static inline void dev_put(struct net_device *dev) static inline void dev_put(struct net_device *dev)
{ {
atomic_dec(&dev->refcnt); irqsafe_cpu_dec(*dev->pcpu_refcnt);
} }
/** /**
...@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev) ...@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev)
*/ */
static inline void dev_hold(struct net_device *dev) static inline void dev_hold(struct net_device *dev)
{ {
atomic_inc(&dev->refcnt); irqsafe_cpu_inc(*dev->pcpu_refcnt);
} }
/* Carrier loss detection, dial on demand. The functions netif_carrier_on /* Carrier loss detection, dial on demand. The functions netif_carrier_on
......
...@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) ...@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
*/ */
dev->reg_state = NETREG_DUMMY; dev->reg_state = NETREG_DUMMY;
/* initialize the ref count */
atomic_set(&dev->refcnt, 1);
/* NAPI wants this */ /* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->napi_list);
...@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) ...@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state); set_bit(__LINK_STATE_START, &dev->state);
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
*/
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(init_dummy_netdev); EXPORT_SYMBOL_GPL(init_dummy_netdev);
...@@ -5243,6 +5245,16 @@ int register_netdev(struct net_device *dev) ...@@ -5243,6 +5245,16 @@ int register_netdev(struct net_device *dev)
} }
EXPORT_SYMBOL(register_netdev); EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);
/* /*
* netdev_wait_allrefs - wait until all references are gone. * netdev_wait_allrefs - wait until all references are gone.
* *
...@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); ...@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev);
static void netdev_wait_allrefs(struct net_device *dev) static void netdev_wait_allrefs(struct net_device *dev)
{ {
unsigned long rebroadcast_time, warning_time; unsigned long rebroadcast_time, warning_time;
int refcnt;
linkwatch_forget_dev(dev); linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies; rebroadcast_time = warning_time = jiffies;
while (atomic_read(&dev->refcnt) != 0) { refcnt = netdev_refcnt_read(dev);
while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock(); rtnl_lock();
...@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) ...@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
msleep(250); msleep(250);
refcnt = netdev_refcnt_read(dev);
if (time_after(jiffies, warning_time + 10 * HZ)) { if (time_after(jiffies, warning_time + 10 * HZ)) {
printk(KERN_EMERG "unregister_netdevice: " printk(KERN_EMERG "unregister_netdevice: "
"waiting for %s to become free. Usage " "waiting for %s to become free. Usage "
"count = %d\n", "count = %d\n",
dev->name, atomic_read(&dev->refcnt)); dev->name, refcnt);
warning_time = jiffies; warning_time = jiffies;
} }
} }
...@@ -5350,7 +5367,7 @@ void netdev_run_todo(void) ...@@ -5350,7 +5367,7 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev); netdev_wait_allrefs(dev);
/* paranoia */ /* paranoia */
BUG_ON(atomic_read(&dev->refcnt)); BUG_ON(netdev_refcnt_read(dev));
WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(rcu_dereference_raw(dev->ip_ptr));
WARN_ON(dev->ip6_ptr); WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr); WARN_ON(dev->dn_ptr);
...@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, ...@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN); dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p; dev->padded = (char *)dev - (char *)p;
if (dev_addr_init(dev)) dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_tx; goto free_tx;
if (dev_addr_init(dev))
goto free_pcpu;
dev_mc_init(dev); dev_mc_init(dev);
dev_uc_init(dev); dev_uc_init(dev);
...@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, ...@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
free_tx: free_tx:
kfree(tx); kfree(tx);
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_p: free_p:
kfree(p); kfree(p);
return NULL; return NULL;
...@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) ...@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p); netif_napi_del(p);
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
/* Compatibility with error handling in drivers */ /* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) { if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded); kfree((char *)dev - dev->padded);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment