Commit 593f191a authored by Daniel Borkmann's avatar Daniel Borkmann

Merge branch 'bpf-af-xdp-barrier-fixes'

Björn Töpel says:

====================
This is a four patch series of various barrier, {READ, WRITE}_ONCE
cleanups in the AF_XDP socket code. More details can be found in the
corresponding commit message. Previous revisions: v1 [4] and v2 [5].

For an AF_XDP socket, most control plane operations are done under the
control mutex (struct xdp_sock, mutex), but there are some places
where members of the struct is read outside the control mutex. The
dev, queue_id members are set in bind() and cleared at cleanup. The
umem, fq, cq, tx, rx, and state member are all assigned in various
places, e.g. bind() and setsockopt(). When the members are assigned,
they are protected by the control mutex, but since they are read
outside the mutex, a WRITE_ONCE is required to avoid store-tearing on
the read-side.

Prior the state variable was introduced by Ilya, the dev member was
used to determine whether the socket was bound or not. However, when
dev was read, proper SMP barriers and READ_ONCE were missing. In order
to address the missing barriers and READ_ONCE, we start using the
state variable as a point of synchronization. The state member
read/write is paired with proper SMP barriers, and from this follows
that the members described above does not need READ_ONCE statements if
used in conjunction with state check.

To summarize: The members struct xdp_sock members dev, queue_id, umem,
fq, cq, tx, rx, and state were read lock-less, with incorrect barriers
and missing {READ, WRITE}_ONCE. After this series umem, fq, cq, tx,
rx, and state are read lock-less. When these members are updated,
WRITE_ONCE is used. When read, READ_ONCE are only used when read
outside the control mutex (e.g. mmap) or, not synchronized with the
state member (XSK_BOUND plus smp_rmb())

[1] https://lore.kernel.org/bpf/beef16bb-a09b-40f1-7dd0-c323b4b89b17@iogearbox.net/
[2] https://lwn.net/Articles/793253/
[3] https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE
[4] https://lore.kernel.org/bpf/20190822091306.20581-1-bjorn.topel@gmail.com/
[5] https://lore.kernel.org/bpf/20190826061053.15996-1-bjorn.topel@gmail.com/

v2->v3:
  Minor restructure of commits.
  Improve cover and commit messages. (Daniel)
v1->v2:
  Removed redundant dev check. (Jonathan)
====================
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parents 310f4204 25dc18ff
...@@ -186,10 +186,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) ...@@ -186,10 +186,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
return err; return err;
} }
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
/* Matches smp_wmb() in bind(). */
smp_rmb();
return true;
}
return false;
}
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{ {
u32 len; u32 len;
if (!xsk_is_bound(xs))
return -EINVAL;
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL; return -EINVAL;
...@@ -387,7 +400,7 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) ...@@ -387,7 +400,7 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
struct sock *sk = sock->sk; struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk); struct xdp_sock *xs = xdp_sk(sk);
if (unlikely(!xs->dev)) if (unlikely(!xsk_is_bound(xs)))
return -ENXIO; return -ENXIO;
if (unlikely(!(xs->dev->flags & IFF_UP))) if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN; return -ENETDOWN;
...@@ -403,10 +416,15 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, ...@@ -403,10 +416,15 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait) struct poll_table_struct *wait)
{ {
unsigned int mask = datagram_poll(file, sock, wait); unsigned int mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sock->sk);
struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev;
struct net_device *dev = xs->dev; struct xdp_umem *umem;
struct xdp_umem *umem = xs->umem;
if (unlikely(!xsk_is_bound(xs)))
return mask;
dev = xs->dev;
umem = xs->umem;
if (umem->need_wakeup) if (umem->need_wakeup)
dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
...@@ -434,7 +452,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, ...@@ -434,7 +452,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
/* Make sure queue is ready before it can be seen by others */ /* Make sure queue is ready before it can be seen by others */
smp_wmb(); smp_wmb();
*queue = q; WRITE_ONCE(*queue, q);
return 0; return 0;
} }
...@@ -442,10 +460,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs) ...@@ -442,10 +460,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
{ {
struct net_device *dev = xs->dev; struct net_device *dev = xs->dev;
if (!dev || xs->state != XSK_BOUND) if (xs->state != XSK_BOUND)
return; return;
WRITE_ONCE(xs->state, XSK_UNBOUND);
xs->state = XSK_UNBOUND;
/* Wait for driver to stop using the xdp socket. */ /* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs); xdp_del_sk_umem(xs->umem, xs);
...@@ -520,7 +537,9 @@ static int xsk_release(struct socket *sock) ...@@ -520,7 +537,9 @@ static int xsk_release(struct socket *sock)
local_bh_enable(); local_bh_enable();
xsk_delete_from_maps(xs); xsk_delete_from_maps(xs);
mutex_lock(&xs->mutex);
xsk_unbind_dev(xs); xsk_unbind_dev(xs);
mutex_unlock(&xs->mutex);
xskq_destroy(xs->rx); xskq_destroy(xs->rx);
xskq_destroy(xs->tx); xskq_destroy(xs->tx);
...@@ -632,19 +651,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) ...@@ -632,19 +651,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
} }
umem_xs = xdp_sk(sock->sk); umem_xs = xdp_sk(sock->sk);
if (!umem_xs->umem) { if (!xsk_is_bound(umem_xs)) {
/* No umem to inherit. */
err = -EBADF; err = -EBADF;
sockfd_put(sock); sockfd_put(sock);
goto out_unlock; goto out_unlock;
} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { }
if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
err = -EINVAL; err = -EINVAL;
sockfd_put(sock); sockfd_put(sock);
goto out_unlock; goto out_unlock;
} }
xdp_get_umem(umem_xs->umem); xdp_get_umem(umem_xs->umem);
xs->umem = umem_xs->umem; WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock); sockfd_put(sock);
} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
err = -EINVAL; err = -EINVAL;
...@@ -671,10 +690,15 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) ...@@ -671,10 +690,15 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xdp_add_sk_umem(xs->umem, xs); xdp_add_sk_umem(xs->umem, xs);
out_unlock: out_unlock:
if (err) if (err) {
dev_put(dev); dev_put(dev);
else } else {
xs->state = XSK_BOUND; /* Matches smp_rmb() in bind() for shared umem
* sockets, and xsk_is_bound().
*/
smp_wmb();
WRITE_ONCE(xs->state, XSK_BOUND);
}
out_release: out_release:
mutex_unlock(&xs->mutex); mutex_unlock(&xs->mutex);
rtnl_unlock(); rtnl_unlock();
...@@ -751,7 +775,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, ...@@ -751,7 +775,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
/* Make sure umem is ready before it can be seen by others */ /* Make sure umem is ready before it can be seen by others */
smp_wmb(); smp_wmb();
xs->umem = umem; WRITE_ONCE(xs->umem, umem);
mutex_unlock(&xs->mutex); mutex_unlock(&xs->mutex);
return 0; return 0;
} }
...@@ -927,7 +951,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, ...@@ -927,7 +951,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn; unsigned long pfn;
struct page *qpg; struct page *qpg;
if (xs->state != XSK_READY) if (READ_ONCE(xs->state) != XSK_READY)
return -EBUSY; return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) { if (offset == XDP_PGOFF_RX_RING) {
......
...@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, ...@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
msg->xdiag_ino = sk_ino; msg->xdiag_ino = sk_ino;
sock_diag_save_cookie(sk, msg->xdiag_cookie); sock_diag_save_cookie(sk, msg->xdiag_cookie);
mutex_lock(&xs->mutex);
if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
goto out_nlmsg_trim; goto out_nlmsg_trim;
...@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, ...@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
goto out_nlmsg_trim; goto out_nlmsg_trim;
mutex_unlock(&xs->mutex);
nlmsg_end(nlskb, nlh); nlmsg_end(nlskb, nlh);
return 0; return 0;
out_nlmsg_trim: out_nlmsg_trim:
mutex_unlock(&xs->mutex);
nlmsg_cancel(nlskb, nlh); nlmsg_cancel(nlskb, nlh);
return -EMSGSIZE; return -EMSGSIZE;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment