Commit 455302d1 authored by Ilya Maximets's avatar Ilya Maximets Committed by Daniel Borkmann

xdp: fix hang while unregistering device bound to xdp socket

A device bound to an XDP socket will not reach a zero refcount until the
userspace application closes the socket. This leads to a hang inside
'netdev_wait_allrefs()' if device unregistering is requested:

  # ip link del p1
  < hang on recvmsg on netlink socket >

  # ps -x | grep ip
  5126  pts/0    D+   0:00 ip link del p1

  # journalctl -b

  Jun 05 07:19:16 kernel:
  unregister_netdevice: waiting for p1 to become free. Usage count = 1

  Jun 05 07:19:27 kernel:
  unregister_netdevice: waiting for p1 to become free. Usage count = 1
  ...

Fix that by implementing NETDEV_UNREGISTER event notification handler
to properly clean up all the resources and unref device.

This should also allow socket killing via ss(8) utility.

Fixes: 965a9909 ("xsk: add support for bind for Rx")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parent 162c820e
...@@ -61,6 +61,11 @@ struct xdp_sock { ...@@ -61,6 +61,11 @@ struct xdp_sock {
struct xsk_queue *tx ____cacheline_aligned_in_smp; struct xsk_queue *tx ____cacheline_aligned_in_smp;
struct list_head list; struct list_head list;
bool zc; bool zc;
enum {
XSK_READY = 0,
XSK_BOUND,
XSK_UNBOUND,
} state;
/* Protects multiple processes in the control path */ /* Protects multiple processes in the control path */
struct mutex mutex; struct mutex mutex;
/* Mutual exclusion of NAPI TX thread and sendmsg error paths /* Mutual exclusion of NAPI TX thread and sendmsg error paths
......
...@@ -140,11 +140,13 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, ...@@ -140,11 +140,13 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
return err; return err;
} }
static void xdp_umem_clear_dev(struct xdp_umem *umem) void xdp_umem_clear_dev(struct xdp_umem *umem)
{ {
struct netdev_bpf bpf; struct netdev_bpf bpf;
int err; int err;
ASSERT_RTNL();
if (!umem->dev) if (!umem->dev)
return; return;
...@@ -153,17 +155,13 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) ...@@ -153,17 +155,13 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
bpf.xsk.umem = NULL; bpf.xsk.umem = NULL;
bpf.xsk.queue_id = umem->queue_id; bpf.xsk.queue_id = umem->queue_id;
rtnl_lock();
err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
rtnl_unlock();
if (err) if (err)
WARN(1, "failed to disable umem!\n"); WARN(1, "failed to disable umem!\n");
} }
rtnl_lock();
xdp_clear_umem_at_qid(umem->dev, umem->queue_id); xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
rtnl_unlock();
dev_put(umem->dev); dev_put(umem->dev);
umem->dev = NULL; umem->dev = NULL;
...@@ -195,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem) ...@@ -195,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
static void xdp_umem_release(struct xdp_umem *umem) static void xdp_umem_release(struct xdp_umem *umem)
{ {
rtnl_lock();
xdp_umem_clear_dev(umem); xdp_umem_clear_dev(umem);
rtnl_unlock();
ida_simple_remove(&umem_ida, umem->id); ida_simple_remove(&umem_ida, umem->id);
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u16 queue_id, u16 flags); u16 queue_id, u16 flags);
void xdp_umem_clear_dev(struct xdp_umem *umem);
bool xdp_umem_validate_queues(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem);
void xdp_get_umem(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem);
void xdp_put_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem);
......
...@@ -335,6 +335,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, ...@@ -335,6 +335,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
return 0; return 0;
} }
static void xsk_unbind_dev(struct xdp_sock *xs)
{
struct net_device *dev = xs->dev;
if (!dev || xs->state != XSK_BOUND)
return;
xs->state = XSK_UNBOUND;
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
xs->dev = NULL;
synchronize_net();
dev_put(dev);
}
static int xsk_release(struct socket *sock) static int xsk_release(struct socket *sock)
{ {
struct sock *sk = sock->sk; struct sock *sk = sock->sk;
...@@ -354,15 +370,7 @@ static int xsk_release(struct socket *sock) ...@@ -354,15 +370,7 @@ static int xsk_release(struct socket *sock)
sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, sk->sk_prot, -1);
local_bh_enable(); local_bh_enable();
if (xs->dev) { xsk_unbind_dev(xs);
struct net_device *dev = xs->dev;
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
xs->dev = NULL;
synchronize_net();
dev_put(dev);
}
xskq_destroy(xs->rx); xskq_destroy(xs->rx);
xskq_destroy(xs->tx); xskq_destroy(xs->tx);
...@@ -412,7 +420,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) ...@@ -412,7 +420,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return -EINVAL; return -EINVAL;
mutex_lock(&xs->mutex); mutex_lock(&xs->mutex);
if (xs->dev) { if (xs->state != XSK_READY) {
err = -EBUSY; err = -EBUSY;
goto out_release; goto out_release;
} }
...@@ -492,6 +500,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) ...@@ -492,6 +500,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
out_unlock: out_unlock:
if (err) if (err)
dev_put(dev); dev_put(dev);
else
xs->state = XSK_BOUND;
out_release: out_release:
mutex_unlock(&xs->mutex); mutex_unlock(&xs->mutex);
return err; return err;
...@@ -520,6 +530,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, ...@@ -520,6 +530,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT; return -EFAULT;
mutex_lock(&xs->mutex); mutex_lock(&xs->mutex);
if (xs->state != XSK_READY) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
err = xsk_init_queue(entries, q, false); err = xsk_init_queue(entries, q, false);
mutex_unlock(&xs->mutex); mutex_unlock(&xs->mutex);
...@@ -534,7 +548,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, ...@@ -534,7 +548,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT; return -EFAULT;
mutex_lock(&xs->mutex); mutex_lock(&xs->mutex);
if (xs->umem) { if (xs->state != XSK_READY || xs->umem) {
mutex_unlock(&xs->mutex); mutex_unlock(&xs->mutex);
return -EBUSY; return -EBUSY;
} }
...@@ -561,6 +575,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, ...@@ -561,6 +575,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT; return -EFAULT;
mutex_lock(&xs->mutex); mutex_lock(&xs->mutex);
if (xs->state != XSK_READY) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
if (!xs->umem) { if (!xs->umem) {
mutex_unlock(&xs->mutex); mutex_unlock(&xs->mutex);
return -EINVAL; return -EINVAL;
...@@ -662,6 +680,9 @@ static int xsk_mmap(struct file *file, struct socket *sock, ...@@ -662,6 +680,9 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn; unsigned long pfn;
struct page *qpg; struct page *qpg;
if (xs->state != XSK_READY)
return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) { if (offset == XDP_PGOFF_RX_RING) {
q = READ_ONCE(xs->rx); q = READ_ONCE(xs->rx);
} else if (offset == XDP_PGOFF_TX_RING) { } else if (offset == XDP_PGOFF_TX_RING) {
...@@ -693,6 +714,38 @@ static int xsk_mmap(struct file *file, struct socket *sock, ...@@ -693,6 +714,38 @@ static int xsk_mmap(struct file *file, struct socket *sock,
size, vma->vm_page_prot); size, vma->vm_page_prot);
} }
static int xsk_notifier(struct notifier_block *this,
unsigned long msg, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
struct sock *sk;
switch (msg) {
case NETDEV_UNREGISTER:
mutex_lock(&net->xdp.lock);
sk_for_each(sk, &net->xdp.list) {
struct xdp_sock *xs = xdp_sk(sk);
mutex_lock(&xs->mutex);
if (xs->dev == dev) {
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
xsk_unbind_dev(xs);
/* Clear device references in umem. */
xdp_umem_clear_dev(xs->umem);
}
mutex_unlock(&xs->mutex);
}
mutex_unlock(&net->xdp.lock);
break;
}
return NOTIFY_DONE;
}
static struct proto xsk_proto = { static struct proto xsk_proto = {
.name = "XDP", .name = "XDP",
.owner = THIS_MODULE, .owner = THIS_MODULE,
...@@ -764,6 +817,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, ...@@ -764,6 +817,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
sock_set_flag(sk, SOCK_RCU_FREE); sock_set_flag(sk, SOCK_RCU_FREE);
xs = xdp_sk(sk); xs = xdp_sk(sk);
xs->state = XSK_READY;
mutex_init(&xs->mutex); mutex_init(&xs->mutex);
spin_lock_init(&xs->tx_completion_lock); spin_lock_init(&xs->tx_completion_lock);
...@@ -784,6 +838,10 @@ static const struct net_proto_family xsk_family_ops = { ...@@ -784,6 +838,10 @@ static const struct net_proto_family xsk_family_ops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
}; };
static struct notifier_block xsk_netdev_notifier = {
.notifier_call = xsk_notifier,
};
static int __net_init xsk_net_init(struct net *net) static int __net_init xsk_net_init(struct net *net)
{ {
mutex_init(&net->xdp.lock); mutex_init(&net->xdp.lock);
...@@ -816,8 +874,15 @@ static int __init xsk_init(void) ...@@ -816,8 +874,15 @@ static int __init xsk_init(void)
err = register_pernet_subsys(&xsk_net_ops); err = register_pernet_subsys(&xsk_net_ops);
if (err) if (err)
goto out_sk; goto out_sk;
err = register_netdevice_notifier(&xsk_netdev_notifier);
if (err)
goto out_pernet;
return 0; return 0;
out_pernet:
unregister_pernet_subsys(&xsk_net_ops);
out_sk: out_sk:
sock_unregister(PF_XDP); sock_unregister(PF_XDP);
out_proto: out_proto:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment