Commit 3d03641c authored by Eric Dumazet, committed by David S. Miller

net/mlx4_en: Avoid a cache line miss in TX completion for single frag skb's

Add map0_dma/map0_byte_count into mlx4_en_tx_info to avoid a cache
line miss in TX completion for frames having one dma element.  (We avoid
reading back the tx descriptor.)

Note this could be extended to 2/3 dma elements later, as we have free
room in mlx4_en_tx_info.

Also, mlx4_en_free_tx_desc() no longer accesses skb_shinfo(). We use a
new nr_maps field in mlx4_en_tx_info to avoid 2 or 3 cache misses.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent dc9b06d1
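
To make the mechanism concrete before the diff: a minimal sketch in kernel-style C of what the completion path gains. The struct and function names below are illustrative stand-ins, not driver code; the real fields live in mlx4_en_tx_info as shown in the last hunk.

    #include <linux/dma-mapping.h>
    #include <linux/skbuff.h>

    /* Per-packet bookkeeping; the completion path reads this cache
     * line anyway (it needs skb and nr_txbb regardless). */
    struct tx_info_sketch {
    	struct sk_buff *skb;	/* needed at completion regardless */
    	dma_addr_t map0_dma;	/* cached address of dma element 0 */
    	u32 map0_byte_count;	/* cached length of dma element 0  */
    	u8 nr_maps;		/* total dma elements; replaces the
    				 * skb_shinfo(skb)->nr_frags lookup */
    };

    /* Completing a single-frag skb (nr_maps == 1): everything needed
     * sits in tx_info, so the TX descriptor -- a different cache line,
     * last written when the packet was posted -- is never read back. */
    static void tx_complete_sketch(struct device *dev,
    			       struct tx_info_sketch *ti)
    {
    	dma_unmap_single(dev, ti->map0_dma, ti->map0_byte_count,
    			 DMA_TO_DEVICE);
    	/* only for nr_maps > 1 is the descriptor ring re-read */
    }
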
@@ -259,38 +259,40 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 				struct mlx4_en_tx_ring *ring,
 				int index, u8 owner, u64 timestamp)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
 	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
 	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
-	struct sk_buff *skb = tx_info->skb;
-	struct skb_frag_struct *frag;
 	void *end = ring->buf + ring->buf_size;
-	int frags = skb_shinfo(skb)->nr_frags;
+	struct sk_buff *skb = tx_info->skb;
+	int nr_maps = tx_info->nr_maps;
 	int i;
-	struct skb_shared_hwtstamps hwts;

-	if (timestamp) {
-		mlx4_en_fill_hwtstamps(mdev, &hwts, timestamp);
+	if (unlikely(timestamp)) {
+		struct skb_shared_hwtstamps hwts;
+
+		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
 		skb_tstamp_tx(skb, &hwts);
 	}

 	/* Optimize the common case when there are no wraparounds */
 	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
 		if (!tx_info->inl) {
-			if (tx_info->linear) {
+			if (tx_info->linear)
 				dma_unmap_single(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data->addr),
-					be32_to_cpu(data->byte_count),
-					PCI_DMA_TODEVICE);
-				++data;
-			}
-
-			for (i = 0; i < frags; i++) {
-				frag = &skb_shinfo(skb)->frags[i];
+						 tx_info->map0_dma,
+						 tx_info->map0_byte_count,
+						 PCI_DMA_TODEVICE);
+			else
+				dma_unmap_page(priv->ddev,
+					       tx_info->map0_dma,
+					       tx_info->map0_byte_count,
+					       PCI_DMA_TODEVICE);
+			for (i = 1; i < nr_maps; i++) {
+				data++;
 				dma_unmap_page(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data[i].addr),
-					skb_frag_size(frag), PCI_DMA_TODEVICE);
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
 			}
 		}
 	} else {
@@ -299,23 +301,25 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 			data = ring->buf + ((void *)data - end);
 		}

-		if (tx_info->linear) {
+		if (tx_info->linear)
 			dma_unmap_single(priv->ddev,
-				(dma_addr_t) be64_to_cpu(data->addr),
-				be32_to_cpu(data->byte_count),
-				PCI_DMA_TODEVICE);
-			++data;
-		}
-
-		for (i = 0; i < frags; i++) {
+					 tx_info->map0_dma,
+					 tx_info->map0_byte_count,
+					 PCI_DMA_TODEVICE);
+		else
+			dma_unmap_page(priv->ddev,
+				       tx_info->map0_dma,
+				       tx_info->map0_byte_count,
+				       PCI_DMA_TODEVICE);
+		for (i = 1; i < nr_maps; i++) {
+			data++;
 			/* Check for wraparound before unmapping */
 			if ((void *) data >= end)
 				data = ring->buf;
-			frag = &skb_shinfo(skb)->frags[i];
 			dma_unmap_page(priv->ddev,
-				(dma_addr_t) be64_to_cpu(data->addr),
-				skb_frag_size(frag), PCI_DMA_TODEVICE);
-			++data;
+				(dma_addr_t)be64_to_cpu(data->addr),
+				be32_to_cpu(data->byte_count),
+				PCI_DMA_TODEVICE);
 		}
 	}
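
Note that in both branches above, a single dma element (nr_maps == 1, the common case for small or linear-only skbs) means the new for (i = 1; i < nr_maps; i++) loop never executes, so data->addr and data->byte_count are never dereferenced: the descriptor ring line stays untouched, which is exactly the cache line miss the changelog refers to.
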
@@ -751,19 +755,22 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
 			   !is_inline(ring->inline_thold, skb, NULL)) ? 1 : 0;

-	data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1;
+	tx_info->nr_maps = skb_shinfo(skb)->nr_frags + tx_info->linear;
+	data += tx_info->nr_maps - 1;

 	if (is_inline(ring->inline_thold, skb, &fragptr)) {
 		tx_info->inl = 1;
 	} else {
+		dma_addr_t dma = 0;
+		u32 byte_count = 0;
+
 		/* Map fragments if any */
 		for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
 			const struct skb_frag_struct *frag;
-			dma_addr_t dma;

 			frag = &skb_shinfo(skb)->frags[i];
+			byte_count = skb_frag_size(frag);
 			dma = skb_frag_dma_map(ddev, frag,
-					       0, skb_frag_size(frag),
+					       0, byte_count,
 					       DMA_TO_DEVICE);
 			if (dma_mapping_error(ddev, dma))
 				goto tx_drop_unmap;
@@ -771,14 +778,13 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 			data->addr = cpu_to_be64(dma);
 			data->lkey = ring->mr_key;
 			wmb();
-			data->byte_count = cpu_to_be32(skb_frag_size(frag));
+			data->byte_count = cpu_to_be32(byte_count);
 			--data;
 		}

 		/* Map linear part if needed */
 		if (tx_info->linear) {
-			u32 byte_count = skb_headlen(skb) - lso_header_size;
-			dma_addr_t dma;
+			byte_count = skb_headlen(skb) - lso_header_size;

 			dma = dma_map_single(ddev, skb->data +
 					     lso_header_size, byte_count,
@@ -792,6 +798,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 			data->byte_count = cpu_to_be32(byte_count);
 		}
 		tx_info->inl = 0;
+		/* tx completion can avoid cache line miss for common cases */
+		tx_info->map0_dma = dma;
+		tx_info->map0_byte_count = byte_count;
 	}
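
The hoisting of dma and byte_count out of the loops is what makes the final stores correct: the fragment loop runs from the last frag down (filling data segments backwards with --data), and the linear part, when present, is mapped after it into the first segment. So whichever mapping lands in data segment 0 is also the last value left in dma/byte_count, and tx_info->map0_dma / map0_byte_count always describe dma element 0 in every non-inline case.
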
@@ -216,12 +216,15 @@ enum cq_type {

 struct mlx4_en_tx_info {
 	struct sk_buff *skb;
+	dma_addr_t	map0_dma;
+	u32		map0_byte_count;
 	u32 nr_txbb;
 	u32 nr_bytes;
 	u8 linear;
 	u8 data_offset;
 	u8 inl;
 	u8 ts_requested;
+	u8 nr_maps;
 } ____cacheline_aligned_in_smp;
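
A back-of-the-envelope size check shows where the "free room" for 2/3 more dma elements comes from (assuming 64-bit pointers and dma_addr_t and 64-byte cache lines, as on typical x86_64 configs; these figures are not stated in the commit):

    skb (8) + map0_dma (8) + map0_byte_count (4) + nr_txbb (4)
    + nr_bytes (4) + linear/data_offset/inl/ts_requested/nr_maps (5 x 1)
    = 33 bytes

With ____cacheline_aligned_in_smp padding each entry to 64 bytes, roughly 31 bytes remain unused per entry.
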