Commit aa1d3faf authored by Alexander Lobakin, committed by Daniel Borkmann

ice: Robustify cleaning/completing XDP Tx buffers

When queueing frames from a Page Pool for redirecting to a device backed
by the ice driver, `perf top` shows heavy load on page_alloc() and
page_frag_free(), even though on a properly working system it must be
fully or at least almost zero-alloc. The problem is in fact a bit deeper
and arises from how ice cleans up completed Tx buffers.

The story so far: when cleaning/freeing the resources related to
a particular completed Tx frame (skbs, DMA mappings etc.), ice uses some
heuristics only without setting any type explicitly (except for dummy
Flow Director packets, which are marked via ice_tx_buf::tx_flags).
This kinda works, but only up to some point. For example, currently ice
assumes that each frame coming to __ice_xmit_xdp_ring() is backed by
either a plain order-0 page or a plain page frag, while it may also be
backed by Page Pool or any other possible memory model introduced in
the future. This means any &xdp_frame must be freed properly via the
xdp_return_frame() family with no assumptions.

In order to do that, the whole heuristics must be replaced with setting
the Tx buffer/frame type explicitly, just how it's always been done via
an enum. Let us reuse 16 bits from ::tx_flags -- one extra bitwise AND
instruction won't hurt much -- especially given that sometimes there was
a check for %ICE_TX_FLAGS_DUMMY_PKT, which is now turned from a flag into
an enum member. The rest of the changes are straightforward and most of
them are just a conversion to rely now on the type set in &ice_tx_buf
rather than on some secondary properties.
For now, no functional changes are intended; the change only prepares the
ground for starting to free XDP frames properly in the next step. And it
must be done atomically/synchronously to not break stuff.
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20230210170618.1973430-5-alexandr.lobakin@intel.com
parent 923096b5
...@@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, ...@@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY | td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
ICE_TX_DESC_CMD_RE; ICE_TX_DESC_CMD_RE;
tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT; tx_buf->type = ICE_TX_BUF_DUMMY;
tx_buf->raw_buf = raw_packet; tx_buf->raw_buf = raw_packet;
tx_desc->cmd_type_offset_bsz = tx_desc->cmd_type_offset_bsz =
...@@ -112,28 +112,26 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, ...@@ -112,28 +112,26 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
static void static void
ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
{ {
if (tx_buf->skb) { if (dma_unmap_len(tx_buf, len))
if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) {
devm_kfree(ring->dev, tx_buf->raw_buf);
} else if (ice_ring_is_xdp(ring)) {
page_frag_free(tx_buf->raw_buf);
} else {
dev_kfree_skb_any(tx_buf->skb);
}
if (dma_unmap_len(tx_buf, len))
dma_unmap_single(ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
} else if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(ring->dev, dma_unmap_page(ring->dev,
dma_unmap_addr(tx_buf, dma), dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len), dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE); DMA_TO_DEVICE);
switch (tx_buf->type) {
case ICE_TX_BUF_DUMMY:
devm_kfree(ring->dev, tx_buf->raw_buf);
break;
case ICE_TX_BUF_SKB:
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
page_frag_free(tx_buf->raw_buf);
break;
} }
tx_buf->next_to_watch = NULL; tx_buf->next_to_watch = NULL;
tx_buf->skb = NULL; tx_buf->type = ICE_TX_BUF_EMPTY;
dma_unmap_len_set(tx_buf, len, 0); dma_unmap_len_set(tx_buf, len, 0);
/* tx_buf must be completely set up in the transmit path */ /* tx_buf must be completely set up in the transmit path */
} }
...@@ -266,7 +264,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget) ...@@ -266,7 +264,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
DMA_TO_DEVICE); DMA_TO_DEVICE);
/* clear tx_buf data */ /* clear tx_buf data */
tx_buf->skb = NULL; tx_buf->type = ICE_TX_BUF_EMPTY;
dma_unmap_len_set(tx_buf, len, 0); dma_unmap_len_set(tx_buf, len, 0);
/* unmap remaining buffers */ /* unmap remaining buffers */
...@@ -1709,6 +1707,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first, ...@@ -1709,6 +1707,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
DMA_TO_DEVICE); DMA_TO_DEVICE);
tx_buf = &tx_ring->tx_buf[i]; tx_buf = &tx_ring->tx_buf[i];
tx_buf->type = ICE_TX_BUF_FRAG;
} }
/* record SW timestamp if HW timestamp is not available */ /* record SW timestamp if HW timestamp is not available */
...@@ -2352,6 +2351,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring) ...@@ -2352,6 +2351,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
/* record the location of the first descriptor for this packet */ /* record the location of the first descriptor for this packet */
first = &tx_ring->tx_buf[tx_ring->next_to_use]; first = &tx_ring->tx_buf[tx_ring->next_to_use];
first->skb = skb; first->skb = skb;
first->type = ICE_TX_BUF_SKB;
first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
first->gso_segs = 1; first->gso_segs = 1;
first->tx_flags = 0; first->tx_flags = 0;
...@@ -2524,11 +2524,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring) ...@@ -2524,11 +2524,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
dma_unmap_addr(tx_buf, dma), dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len), dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE); DMA_TO_DEVICE);
if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) if (tx_buf->type == ICE_TX_BUF_DUMMY)
devm_kfree(tx_ring->dev, tx_buf->raw_buf); devm_kfree(tx_ring->dev, tx_buf->raw_buf);
/* clear next_to_watch to prevent false hangs */ /* clear next_to_watch to prevent false hangs */
tx_buf->raw_buf = NULL; tx_buf->type = ICE_TX_BUF_EMPTY;
tx_buf->tx_flags = 0; tx_buf->tx_flags = 0;
tx_buf->next_to_watch = NULL; tx_buf->next_to_watch = NULL;
dma_unmap_len_set(tx_buf, len, 0); dma_unmap_len_set(tx_buf, len, 0);
......
...@@ -121,10 +121,7 @@ static inline int ice_skb_pad(void) ...@@ -121,10 +121,7 @@ static inline int ice_skb_pad(void)
#define ICE_TX_FLAGS_TSO BIT(0) #define ICE_TX_FLAGS_TSO BIT(0)
#define ICE_TX_FLAGS_HW_VLAN BIT(1) #define ICE_TX_FLAGS_HW_VLAN BIT(1)
#define ICE_TX_FLAGS_SW_VLAN BIT(2) #define ICE_TX_FLAGS_SW_VLAN BIT(2)
/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be /* Free, was ICE_TX_FLAGS_DUMMY_PKT */
* freed instead of returned like skb packets.
*/
#define ICE_TX_FLAGS_DUMMY_PKT BIT(3)
#define ICE_TX_FLAGS_TSYN BIT(4) #define ICE_TX_FLAGS_TSYN BIT(4)
#define ICE_TX_FLAGS_IPV4 BIT(5) #define ICE_TX_FLAGS_IPV4 BIT(5)
#define ICE_TX_FLAGS_IPV6 BIT(6) #define ICE_TX_FLAGS_IPV6 BIT(6)
...@@ -149,22 +146,41 @@ static inline int ice_skb_pad(void) ...@@ -149,22 +146,41 @@ static inline int ice_skb_pad(void)
#define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS) #define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS)
/**
* enum ice_tx_buf_type - type of &ice_tx_buf to act on Tx completion
* @ICE_TX_BUF_EMPTY: unused OR XSk frame, no action required
* @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
* @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
* @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
* @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
* @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
*/
enum ice_tx_buf_type {
ICE_TX_BUF_EMPTY = 0U,
ICE_TX_BUF_DUMMY,
ICE_TX_BUF_FRAG,
ICE_TX_BUF_SKB,
ICE_TX_BUF_XDP_TX,
ICE_TX_BUF_XSK_TX,
};
struct ice_tx_buf { struct ice_tx_buf {
union { union {
struct ice_tx_desc *next_to_watch; struct ice_tx_desc *next_to_watch;
u32 rs_idx; u32 rs_idx;
}; };
union { union {
struct sk_buff *skb; void *raw_buf; /* used for XDP_TX and FDir rules */
void *raw_buf; /* used for XDP */ struct sk_buff *skb; /* used for .ndo_start_xmit() */
struct xdp_buff *xdp; /* used for XDP_TX ZC */ struct xdp_buff *xdp; /* used for XDP_TX ZC */
}; };
unsigned int bytecount; unsigned int bytecount;
union { union {
unsigned int gso_segs; unsigned int gso_segs;
unsigned int nr_frags; /* used for mbuf XDP */ unsigned int nr_frags; /* used for mbuf XDP */
}; };
u32 tx_flags; u32 type:16; /* &ice_tx_buf_type */
u32 tx_flags:16;
DEFINE_DMA_UNMAP_LEN(len); DEFINE_DMA_UNMAP_LEN(len);
DEFINE_DMA_UNMAP_ADDR(dma); DEFINE_DMA_UNMAP_ADDR(dma);
}; };
......
...@@ -231,8 +231,14 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) ...@@ -231,8 +231,14 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0); dma_unmap_len_set(tx_buf, len, 0);
page_frag_free(tx_buf->raw_buf);
tx_buf->raw_buf = NULL; switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
page_frag_free(tx_buf->raw_buf);
break;
}
tx_buf->type = ICE_TX_BUF_EMPTY;
} }
/** /**
...@@ -266,6 +272,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ...@@ -266,6 +272,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
while (ready_frames) { while (ready_frames) {
struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
struct ice_tx_buf *head = tx_buf;
/* bytecount holds size of head + frags */ /* bytecount holds size of head + frags */
total_bytes += tx_buf->bytecount; total_bytes += tx_buf->bytecount;
...@@ -275,7 +282,6 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ...@@ -275,7 +282,6 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
ready_frames -= frags + 1; ready_frames -= frags + 1;
xdp_tx++; xdp_tx++;
ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
ntc++; ntc++;
if (ntc == cnt) if (ntc == cnt)
ntc = 0; ntc = 0;
...@@ -288,6 +294,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ...@@ -288,6 +294,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
if (ntc == cnt) if (ntc == cnt)
ntc = 0; ntc = 0;
} }
ice_clean_xdp_tx_buf(xdp_ring, head);
} }
tx_desc->cmd_type_offset_bsz = 0; tx_desc->cmd_type_offset_bsz = 0;
...@@ -349,6 +357,7 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) ...@@ -349,6 +357,7 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->buf_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0); tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0);
tx_buf->type = ICE_TX_BUF_XDP_TX;
tx_buf->raw_buf = data; tx_buf->raw_buf = data;
ntu++; ntu++;
......
...@@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) ...@@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
for (i = 0; i < xsk_frames; i++) { for (i = 0; i < xsk_frames; i++) {
tx_buf = &xdp_ring->tx_buf[ntc]; tx_buf = &xdp_ring->tx_buf[ntc];
if (tx_buf->xdp) { if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
tx_buf->type = ICE_TX_BUF_EMPTY;
xsk_buff_free(tx_buf->xdp); xsk_buff_free(tx_buf->xdp);
xdp_ring->xdp_tx_active--; xdp_ring->xdp_tx_active--;
} else { } else {
...@@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, ...@@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,
tx_buf = &xdp_ring->tx_buf[ntu]; tx_buf = &xdp_ring->tx_buf[ntu];
tx_buf->xdp = xdp; tx_buf->xdp = xdp;
tx_buf->type = ICE_TX_BUF_XSK_TX;
tx_desc = ICE_TX_DESC(xdp_ring, ntu); tx_desc = ICE_TX_DESC(xdp_ring, ntu);
tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->buf_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
...@@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring) ...@@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring)
while (ntc != ntu) { while (ntc != ntu) {
struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
if (tx_buf->xdp) if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
tx_buf->type = ICE_TX_BUF_EMPTY;
xsk_buff_free(tx_buf->xdp); xsk_buff_free(tx_buf->xdp);
else } else {
xsk_frames++; xsk_frames++;
}
tx_buf->raw_buf = NULL;
ntc++; ntc++;
if (ntc >= xdp_ring->count) if (ntc >= xdp_ring->count)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment