Commit 9647c57b authored by Magnus Karlsson, committed by Daniel Borkmann

xsk: i40e: ice: ixgbe: mlx5: Test for dma_need_sync earlier for better performance

Test for dma_need_sync earlier to improve performance.
xsk_buff_dma_sync_for_cpu() takes an xdp_buff as its parameter and digs
the xsk_buff_pool reference out of it. Perf shows that this dereference
causes a lot of cache misses. But as the buffer pool is now passed down
to the driver at zero-copy initialization time, we might as well use
that pointer directly instead of going via the xsk_buff, and we can do
so already in xsk_buff_dma_sync_for_cpu() instead of in
xp_dma_sync_for_cpu(). This gets rid of those cache misses.

Throughput increases by 3% for the xdpsock l2fwd sample application
on my machine.
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Link: https://lore.kernel.org/bpf/1598603189-32145-11-git-send-email-magnus.karlsson@intel.com
parent 8ef4e27e
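
For illustration, a minimal standalone sketch of the pattern this commit introduces follows; the struct layouts and the printf stand-in for the real DMA sync are simplified assumptions, not the kernel definitions. The point it shows is the one made above: the driver caches the xsk_buff_pool pointer per ring, so the dma_need_sync test can short-circuit on a single flag load without first chasing the xdp_buff to its pool, exactly as in the diff below.

/* Minimal sketch, NOT the kernel headers: stand-in types for
 * xsk_buff_pool and xdp_buff, and a printf in place of the real
 * DMA sync work done in the slow path.
 */
#include <stdbool.h>
#include <stdio.h>

struct xsk_buff_pool {
	bool dma_need_sync;	/* set when the pool was DMA-mapped non-coherently */
};

struct xdp_buff {
	void *data;
	void *data_end;
};

static void xp_dma_sync_for_cpu_slow(struct xdp_buff *xdp)
{
	/* placeholder for the actual CPU-side DMA sync of the frame */
	printf("syncing frame at %p for CPU\n", xdp->data);
}

/* New-style helper: the pool comes straight from the driver, so the
 * common "no sync needed" case is decided on a single flag load. */
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp,
					     struct xsk_buff_pool *pool)
{
	if (!pool->dma_need_sync)
		return;

	xp_dma_sync_for_cpu_slow(xdp);
}

int main(void)
{
	struct xsk_buff_pool pool = { .dma_need_sync = false };
	char frame[64] = { 0 };
	struct xdp_buff buf = { .data = frame, .data_end = frame + sizeof(frame) };

	/* Rx fast path as a driver would issue it, pool pointer cached per ring. */
	xsk_buff_dma_sync_for_cpu(&buf, &pool);
	return 0;
}

With the old signature the helper had to reach the pool through the xdp_buff before it could even check dma_need_sync; passing the pool in from the driver puts that check ahead of the pointer chase, which is what removes the cache misses reported by perf.
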
@@ -314,7 +314,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 		bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
 		(*bi)->data_end = (*bi)->data + size;
-		xsk_buff_dma_sync_for_cpu(*bi);
+		xsk_buff_dma_sync_for_cpu(*bi, rx_ring->xsk_pool);
 		xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
 		if (xdp_res) {
@@ -595,7 +595,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 		rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
 		rx_buf->xdp->data_end = rx_buf->xdp->data + size;
-		xsk_buff_dma_sync_for_cpu(rx_buf->xdp);
+		xsk_buff_dma_sync_for_cpu(rx_buf->xdp, rx_ring->xsk_pool);
 		xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
 		if (xdp_res) {
@@ -287,7 +287,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 		}
 		bi->xdp->data_end = bi->xdp->data + size;
-		xsk_buff_dma_sync_for_cpu(bi->xdp);
+		xsk_buff_dma_sync_for_cpu(bi->xdp, rx_ring->xsk_pool);
 		xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
 		if (xdp_res) {
@@ -48,7 +48,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	xdp->data_end = xdp->data + cqe_bcnt32;
 	xdp_set_data_meta_invalid(xdp);
-	xsk_buff_dma_sync_for_cpu(xdp);
+	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	prefetch(xdp->data);
 	rcu_read_lock();
@@ -99,7 +99,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	xdp->data_end = xdp->data + cqe_bcnt;
 	xdp_set_data_meta_invalid(xdp);
-	xsk_buff_dma_sync_for_cpu(xdp);
+	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	prefetch(xdp->data);
 	if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
@@ -99,10 +99,13 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 	return xp_raw_get_data(pool, addr);
 }
 
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
 {
 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 
+	if (!pool->dma_need_sync)
+		return;
+
 	xp_dma_sync_for_cpu(xskb);
 }
@@ -222,7 +225,7 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 	return NULL;
 }
 
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
 {
 }
@@ -114,9 +114,6 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
 void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
 
 static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
 {
-	if (!xskb->pool->dma_need_sync)
-		return;
-
 	xp_dma_sync_for_cpu_slow(xskb);
 }