Commit b39212d5 authored by Jakub Kicinski

Merge branch '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue

Tony Nguyen says:

====================
i40e: support XDP multi-buffer

Tirthendu Sarkar says:

This patchset adds multi-buffer support for XDP. The Tx side already
supports multi-buffer, so this patchset focuses on the Rx side. The
last patch contains the actual multi-buffer changes, while the
preceding ones are preparatory.

On receiving the first buffer of a packet, an xdp_buff is built and
subsequent buffers are added to it as frags. While 'next_to_clean'
keeps pointing to the packet's first descriptor, the newly introduced
'next_to_process' tracks every descriptor of the packet as it is
processed.
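
The frag-append step itself lives in i40e_txrx.c (whose diff is
collapsed further down this page); a minimal sketch of appending a
non-EOP buffer, assuming the generic xdp_buff frags helpers, with the
function name illustrative rather than verbatim driver code:

static int i40e_add_rx_frag_sketch(struct xdp_buff *xdp,
				   struct i40e_rx_buffer *rx_buffer,
				   u32 size)
{
	/* frag bookkeeping lives in the skb_shared_info placed in the
	 * xdp_buff's tailroom
	 */
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);

	if (!xdp_buff_has_frags(xdp)) {
		sinfo->nr_frags = 0;
		sinfo->xdp_frags_size = 0;
		xdp_buff_set_frags_flag(xdp);
	} else if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
		return -ENOMEM;	/* packet spans too many buffers */
	}

	__skb_fill_page_desc_noacct(sinfo, sinfo->nr_frags++,
				    rx_buffer->page,
				    rx_buffer->page_offset, size);
	sinfo->xdp_frags_size += size;
	return 0;
}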

On receiving the EOP buffer, the XDP program is called and the
appropriate action is taken: building an skb for XDP_PASS, reusing the
page for XDP_DROP, and adjusting page offsets for XDP_{REDIRECT,TX}.
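
For the XDP_PASS case, building the skb from a multi-frag xdp_buff can
be pictured roughly as follows; this is a sketch built on the generic
frags helpers, with the truesize computation simplified compared to
the real driver code:

static struct sk_buff *i40e_build_skb_sketch(struct xdp_buff *xdp)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	u8 nr_frags = xdp_buff_has_frags(xdp) ? sinfo->nr_frags : 0;
	struct sk_buff *skb;

	/* headroom + linear data of the first buffer become the skb head */
	skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	__skb_put(skb, xdp->data_end - xdp->data);

	/* hand the accumulated frag pages over to the skb */
	if (nr_frags)
		xdp_update_skb_shared_info(skb, nr_frags,
					   sinfo->xdp_frags_size,
					   nr_frags * xdp->frame_sz,
					   xdp_buff_is_frag_pfmemalloc(xdp));
	return skb;
}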

The patchset also streamlines the page-offset adjustments for buffer
reuse, making it easier to post-process the rx_buffers after the XDP
program has run.
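
The helper behind this is i40e_rx_buffer_flip() (see the patch list
below); its body is sketched here from the patch descriptions, so
treat it as illustrative:

static void i40e_rx_buffer_flip(struct i40e_rx_buffer *rx_buffer,
				unsigned int truesize)
{
#if (PAGE_SIZE < 8192)
	/* half-page buffers: flip between the two halves of the page */
	rx_buffer->page_offset ^= truesize;
#else
	/* larger pages: move past the region this packet used */
	rx_buffer->page_offset += truesize;
#endif
}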

With this patchset there is no measurable performance degradation for
XDP_PASS, and some improvement (~1% for XDP_TX, ~5% for XDP_DROP) when
measured with the xdp_rxq_info program from samples/bpf/ on 64B
packets.
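
For reproduction, something along the lines of
'./xdp_rxq_info --dev <iface> --action XDP_DROP' (flags as in
samples/bpf/xdp_rxq_info_user.c; the interface name is a placeholder)
exercises the paths measured above.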

v1: https://lore.kernel.org/netdev/20230306210822.3381942-1-anthony.l.nguyen@intel.com/

* '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  i40e: add support for XDP multi-buffer Rx
  i40e: add xdp_buff to i40e_ring struct
  i40e: introduce next_to_process to i40e_ring
  i40e: use frame_sz instead of recalculating truesize for building skb
  i40e: Change size to truesize when using i40e_rx_buffer_flip()
  i40e: add pre-xdp page_count in rx_buffer
  i40e: change Rx buffer size for legacy-rx to support XDP multi-buffer
  i40e: consolidate maximum frame size calculation for vsi
====================

Link: https://lore.kernel.org/r/20230309212819.1198218-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c66b2111 e213ced1
drivers/net/ethernet/intel/i40e/i40e_ethtool.c

@@ -5402,6 +5402,13 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags)
 		return -EOPNOTSUPP;
 	}
 
+	if ((changed_flags & I40E_FLAG_LEGACY_RX) &&
+	    I40E_2K_TOO_SMALL_WITH_PADDING) {
+		dev_warn(&pf->pdev->dev,
+			 "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n");
+		return -EOPNOTSUPP;
+	}
+
 	if ((changed_flags & new_flags &
 	     I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED) &&
 	    (new_flags & I40E_FLAG_MFP_ENABLED))
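
For context, I40E_2K_TOO_SMALL_WITH_PADDING is a compile-time
predicate from i40e_txrx.h; it has roughly this shape (quoted from
memory, so verify against the tree):

#define I40E_2K_TOO_SMALL_WITH_PADDING \
((NET_SKB_PAD + I40E_RXBUFFER_1536) > SKB_WITH_OVERHEAD(I40E_RXBUFFER_2048))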
drivers/net/ethernet/intel/i40e/i40e_main.c

@@ -2896,15 +2896,35 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf)
 }
 
 /**
- * i40e_max_xdp_frame_size - returns the maximum allowed frame size for XDP
+ * i40e_calculate_vsi_rx_buf_len - Calculates buffer length
+ *
+ * @vsi: VSI to calculate rx_buf_len from
+ */
+static u16 i40e_calculate_vsi_rx_buf_len(struct i40e_vsi *vsi)
+{
+	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
+		return SKB_WITH_OVERHEAD(I40E_RXBUFFER_2048);
+
+	return PAGE_SIZE < 8192 ? I40E_RXBUFFER_3072 : I40E_RXBUFFER_2048;
+}
+
+/**
+ * i40e_max_vsi_frame_size - returns the maximum allowed frame size for VSI
  * @vsi: the vsi
+ * @xdp_prog: XDP program
 **/
-static int i40e_max_xdp_frame_size(struct i40e_vsi *vsi)
+static int i40e_max_vsi_frame_size(struct i40e_vsi *vsi,
+				   struct bpf_prog *xdp_prog)
 {
-	if (PAGE_SIZE >= 8192 || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
-		return I40E_RXBUFFER_2048;
+	u16 rx_buf_len = i40e_calculate_vsi_rx_buf_len(vsi);
+	u16 chain_len;
+
+	if (xdp_prog && !xdp_prog->aux->xdp_has_frags)
+		chain_len = 1;
 	else
-		return I40E_RXBUFFER_3072;
+		chain_len = I40E_MAX_CHAINED_RX_BUFFERS;
+
+	return min_t(u16, rx_buf_len * chain_len, I40E_MAX_RXBUFFER);
 }
 
 /**
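
To make the new helper concrete: on a 4K-page system with legacy-rx
off and either no XDP program or a frags-aware one, rx_buf_len is
I40E_RXBUFFER_3072 and chain_len is I40E_MAX_CHAINED_RX_BUFFERS (5, if
memory serves), so the result is min(3072 * 5, I40E_MAX_RXBUFFER) =
9728 bytes. A frags-unaware XDP program is instead limited to a single
3072-byte buffer.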
@@ -2919,12 +2939,13 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu)
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_pf *pf = vsi->back;
+	int frame_size;
 
-	if (i40e_enabled_xdp_vsi(vsi)) {
-		int frame_size = new_mtu + I40E_PACKET_HDR_PAD;
-
-		if (frame_size > i40e_max_xdp_frame_size(vsi))
-			return -EINVAL;
+	frame_size = i40e_max_vsi_frame_size(vsi, vsi->xdp_prog);
+	if (new_mtu > frame_size - I40E_PACKET_HDR_PAD) {
+		netdev_err(netdev, "Error changing mtu to %d, Max is %d\n",
+			   new_mtu, frame_size - I40E_PACKET_HDR_PAD);
+		return -EINVAL;
 	}
 
 	netdev_dbg(netdev, "changing MTU from %d to %d\n",
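
I40E_PACKET_HDR_PAD, the MTU-to-frame-size headroom used above, is
defined by the driver roughly as follows (quoted from memory; note the
room for two VLAN tags):

#define I40E_PACKET_HDR_PAD	(ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2))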
@@ -3595,6 +3616,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
 		}
 	}
 
+	xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq);
+
 	rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
 				    BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT));
@@ -3640,10 +3663,16 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
 	}
 
 	/* configure Rx buffer alignment */
-	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
+	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX)) {
+		if (I40E_2K_TOO_SMALL_WITH_PADDING) {
+			dev_info(&vsi->back->pdev->dev,
+				 "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n");
+			return -EOPNOTSUPP;
+		}
 		clear_ring_build_skb_enabled(ring);
-	else
+	} else {
 		set_ring_build_skb_enabled(ring);
+	}
 
 	ring->rx_offset = i40e_rx_offset(ring);
@@ -3693,24 +3722,6 @@ static int i40e_vsi_configure_tx(struct i40e_vsi *vsi)
 	return err;
 }
 
-/**
- * i40e_calculate_vsi_rx_buf_len - Calculates buffer length
- *
- * @vsi: VSI to calculate rx_buf_len from
- */
-static u16 i40e_calculate_vsi_rx_buf_len(struct i40e_vsi *vsi)
-{
-	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
-		return I40E_RXBUFFER_2048;
-
-#if (PAGE_SIZE < 8192)
-	if (!I40E_2K_TOO_SMALL_WITH_PADDING && vsi->netdev->mtu <= ETH_DATA_LEN)
-		return I40E_RXBUFFER_1536 - NET_IP_ALIGN;
-#endif
-
-	return PAGE_SIZE < 8192 ? I40E_RXBUFFER_3072 : I40E_RXBUFFER_2048;
-}
-
 /**
  * i40e_vsi_configure_rx - Configure the VSI for Rx
  * @vsi: the VSI being configured
@@ -3722,13 +3733,15 @@ static int i40e_vsi_configure_rx(struct i40e_vsi *vsi)
 	int err = 0;
 	u16 i;
 
-	vsi->max_frame = I40E_MAX_RXBUFFER;
+	vsi->max_frame = i40e_max_vsi_frame_size(vsi, vsi->xdp_prog);
 	vsi->rx_buf_len = i40e_calculate_vsi_rx_buf_len(vsi);
 
 #if (PAGE_SIZE < 8192)
 	if (vsi->netdev && !I40E_2K_TOO_SMALL_WITH_PADDING &&
-	    vsi->netdev->mtu <= ETH_DATA_LEN)
-		vsi->max_frame = I40E_RXBUFFER_1536 - NET_IP_ALIGN;
+	    vsi->netdev->mtu <= ETH_DATA_LEN) {
+		vsi->rx_buf_len = I40E_RXBUFFER_1536 - NET_IP_ALIGN;
+		vsi->max_frame = vsi->rx_buf_len;
+	}
 #endif
 
 	/* set up individual rings */
@@ -13316,15 +13329,15 @@ static netdev_features_t i40e_features_check(struct sk_buff *skb,
 static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog,
 			  struct netlink_ext_ack *extack)
 {
-	int frame_size = vsi->netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+	int frame_size = i40e_max_vsi_frame_size(vsi, prog);
 	struct i40e_pf *pf = vsi->back;
 	struct bpf_prog *old_prog;
 	bool need_reset;
 	int i;
 
 	/* Don't allow frames that span over multiple buffers */
-	if (frame_size > i40e_calculate_vsi_rx_buf_len(vsi)) {
-		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
+	if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) {
+		NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags");
 		return -EINVAL;
 	}
@@ -13810,7 +13823,8 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 
 		netdev->xdp_features = NETDEV_XDP_ACT_BASIC |
 				       NETDEV_XDP_ACT_REDIRECT |
-				       NETDEV_XDP_ACT_XSK_ZEROCOPY;
+				       NETDEV_XDP_ACT_XSK_ZEROCOPY |
+				       NETDEV_XDP_ACT_RX_SG;
 	} else {
 		/* Relate the VSI_VMDQ name to the VSI_MAIN name. Note that we
 		 * are still limited by IFNAMSIZ, but we're adding 'v%d\0' to
drivers/net/ethernet/intel/i40e/i40e_trace.h

@@ -162,45 +162,45 @@ DECLARE_EVENT_CLASS(
 	TP_PROTO(struct i40e_ring *ring,
 		 union i40e_16byte_rx_desc *desc,
-		 struct sk_buff *skb),
+		 struct xdp_buff *xdp),
 
-	TP_ARGS(ring, desc, skb),
+	TP_ARGS(ring, desc, xdp),
 
 	TP_STRUCT__entry(
 		__field(void*, ring)
 		__field(void*, desc)
-		__field(void*, skb)
+		__field(void*, xdp)
 		__string(devname, ring->netdev->name)
 	),
 
 	TP_fast_assign(
 		__entry->ring = ring;
 		__entry->desc = desc;
-		__entry->skb = skb;
+		__entry->xdp = xdp;
 		__assign_str(devname, ring->netdev->name);
 	),
 
 	TP_printk(
-		"netdev: %s ring: %p desc: %p skb %p",
+		"netdev: %s ring: %p desc: %p xdp %p",
 		__get_str(devname), __entry->ring,
-		__entry->desc, __entry->skb)
+		__entry->desc, __entry->xdp)
 );
 
 DEFINE_EVENT(
 	i40e_rx_template, i40e_clean_rx_irq,
 	TP_PROTO(struct i40e_ring *ring,
 		 union i40e_16byte_rx_desc *desc,
-		 struct sk_buff *skb),
-	TP_ARGS(ring, desc, skb));
+		 struct xdp_buff *xdp),
+	TP_ARGS(ring, desc, xdp));
 
 DEFINE_EVENT(
 	i40e_rx_template, i40e_clean_rx_irq_rx,
 	TP_PROTO(struct i40e_ring *ring,
 		 union i40e_16byte_rx_desc *desc,
-		 struct sk_buff *skb),
-	TP_ARGS(ring, desc, skb));
+		 struct xdp_buff *xdp),
+	TP_ARGS(ring, desc, xdp));
 
 DECLARE_EVENT_CLASS(
 	i40e_xmit_template,
[The largest diff of the series, drivers/net/ethernet/intel/i40e/i40e_txrx.c (where the multi-buffer Rx logic lands), is collapsed in this view.]
drivers/net/ethernet/intel/i40e/i40e_txrx.h

@@ -277,6 +277,7 @@ struct i40e_rx_buffer {
 	struct page *page;
 	__u32 page_offset;
 	__u16 pagecnt_bias;
+	__u32 page_count;
 };
 
 struct i40e_queue_stats {
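
Per the 'pre-xdp page_count' patch in the series, this field caches
the page refcount taken before the XDP program runs, so the reuse
logic can account for references the program released; the capture
point is essentially (a sketch, not verbatim):

	rx_buffer->page_count = page_count(rx_buffer->page);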
@@ -336,6 +337,17 @@ struct i40e_ring {
 	u8 dcb_tc;			/* Traffic class of ring */
 	u8 __iomem *tail;
 
+	/* Storing xdp_buff on ring helps in saving the state of partially built
+	 * packet when i40e_clean_rx_ring_irq() must return before it sees EOP
+	 * and to resume packet building for this ring in the next call to
+	 * i40e_clean_rx_ring_irq().
+	 */
+	struct xdp_buff xdp;
+
+	/* Next descriptor to be processed; next_to_clean is updated only on
+	 * processing EOP descriptor
+	 */
+	u16 next_to_process;
 	/* high bit set means dynamic, use accessor routines to read/write.
 	 * hardware only supports 2us resolution for the ITR registers.
 	 * these values always store the USER setting, and must be converted
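
How the two indices cooperate can be sketched as follows (helper shape
assumed from the descriptions above, not quoted verbatim):

static bool i40e_is_non_eop_sketch(struct i40e_ring *rx_ring,
				   union i40e_rx_desc *rx_desc)
{
	/* next_to_process advances for every buffer of the packet... */
	u32 ntp = rx_ring->next_to_process + 1;

	rx_ring->next_to_process = (ntp < rx_ring->count) ? ntp : 0;

	/* ...while next_to_clean is moved past all of the packet's
	 * buffers by the caller only after the EOP buffer is handled
	 */
	return !i40e_test_staterr(rx_desc,
				  BIT(I40E_RX_DESC_STATUS_EOF_SHIFT));
}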
@@ -380,14 +392,6 @@ struct i40e_ring {
 	struct rcu_head rcu;		/* to avoid race on free */
 	u16 next_to_alloc;
-	struct sk_buff *skb;		/* When i40e_clean_rx_ring_irq() must
-					 * return before it sees the EOP for
-					 * the current packet, we save that skb
-					 * here and resume receiving this
-					 * packet the next time
-					 * i40e_clean_rx_ring_irq() is called
-					 * for this ring.
-					 */
 	struct i40e_channel *ch;
 	u16 rx_offset;