Commit e61caf04 authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'page_pool-allow-caching-from-safely-localized-napi'

Jakub Kicinski says:

====================
page_pool: allow caching from safely localized NAPI

I went back to the explicit "are we in NAPI method", mostly
because I don't like having both around :( (even tho I maintain
that in_softirq() && !in_hardirq() is as safe, as softirqs do
not nest).

Still returning the skbs to a CPU, tho, not to the NAPI instance.
I reckon we could create a small refcounted struct per NAPI instance
which would allow sockets and other users so hold a persisent
and safe reference. But that's a bigger change, and I get 90+%
recycling thru the cache with just these patches (for RR and
streaming tests with 100% CPU use it's almost 100%).

Some numbers for streaming test with 100% CPU use (from previous version,
but really they perform the same):

		HW-GRO				page=page
		before		after		before		after
recycle:
cached:			0	138669686		0	150197505
cache_full:		0	   223391		0	    74582
ring:		138551933         9997191	149299454		0
ring_full: 		0             488	     3154	   127590
released_refcnt:	0		0		0		0

alloc:
fast:		136491361	148615710	146969587	150322859
slow:		     1772	     1799	      144	      105
slow_high_order:	0		0		0		0
empty:		     1772	     1799	      144	      105
refill:		  2165245	   156302	  2332880	     2128
waive:			0		0		0		0

v1: https://lore.kernel.org/all/20230411201800.596103-1-kuba@kernel.org/
rfcv2: https://lore.kernel.org/all/20230405232100.103392-1-kuba@kernel.org/
====================

Link: https://lore.kernel.org/r/20230413042605.895677-1-kuba@kernel.orgSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents c11d2e71 294e39e0
......@@ -165,6 +165,7 @@ Registration
pp_params.pool_size = DESC_NUM;
pp_params.nid = NUMA_NO_NODE;
pp_params.dev = priv->dev;
pp_params.napi = napi; /* only if locking is tied to NAPI */
pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
page_pool = page_pool_create(&pp_params);
......
......@@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
pp.pool_size = bp->rx_ring_size;
pp.nid = dev_to_node(&bp->pdev->dev);
pp.napi = &rxr->bnapi->napi;
pp.dev = &bp->pdev->dev;
pp.dma_dir = DMA_BIDIRECTIONAL;
......
......@@ -360,8 +360,11 @@ struct napi_struct {
unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
/* CPU actively polling if netpoll is configured */
int poll_owner;
#endif
/* CPU on which NAPI has been scheduled for processing */
int list_owner;
struct net_device *dev;
struct gro_list gro_hash[GRO_HASH_BUCKETS];
struct sk_buff *skb;
......
......@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}
static inline void
napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
{
struct page *page = skb_frag_page(frag);
#ifdef CONFIG_PAGE_POOL
if (recycle && page_pool_return_skb_page(page, napi_safe))
return;
#endif
put_page(page);
}
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
......@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
*/
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
struct page *page = skb_frag_page(frag);
#ifdef CONFIG_PAGE_POOL
if (recycle && page_pool_return_skb_page(page))
return;
#endif
put_page(page);
napi_frag_unref(frag, recycle, false);
}
/**
......
......@@ -77,6 +77,7 @@ struct page_pool_params {
unsigned int pool_size;
int nid; /* Numa node id to allocate from pages from */
struct device *dev; /* device, for DMA pre-mapping purposes */
struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
enum dma_data_direction dma_dir; /* DMA mapping direction */
unsigned int max_len; /* max DMA sync memory size */
unsigned int offset; /* DMA addr offset */
......@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
return pool->p.dma_dir;
}
bool page_pool_return_skb_page(struct page *page);
bool page_pool_return_skb_page(struct page *page, bool napi_safe);
struct page_pool *page_pool_create(const struct page_pool_params *params);
......
......@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
......@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
WRITE_ONCE(n->list_owner, -1);
val = READ_ONCE(n->state);
do {
......@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
......
......@@ -19,6 +19,7 @@
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <trace/events/page_pool.h>
......@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
EXPORT_SYMBOL(page_pool_update_nid);
bool page_pool_return_skb_page(struct page *page)
bool page_pool_return_skb_page(struct page *page, bool napi_safe)
{
struct napi_struct *napi;
struct page_pool *pp;
bool allow_direct;
page = compound_head(page);
......@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
pp = page->pp;
/* Allow direct recycle if we have reasons to believe that we are
* in the same context as the consumer would run, so there's
* no possible race.
*/
napi = pp->p.napi;
allow_direct = napi_safe && napi &&
READ_ONCE(napi->list_owner) == smp_processor_id();
/* Driver set this to memory recycling info. Reset it on recycle.
* This will *not* work for NIC using a split-page memory model.
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
page_pool_put_full_page(pp, page, false);
page_pool_put_full_page(pp, page, allow_direct);
return true;
}
......
......@@ -839,11 +839,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
return page_pool_return_skb_page(virt_to_page(data));
return page_pool_return_skb_page(virt_to_page(data), napi_safe);
}
static void skb_kfree_head(void *head, unsigned int end_offset)
......@@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
kfree(head);
}
static void skb_free_head(struct sk_buff *skb)
static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
if (skb_pp_recycle(skb, head))
if (skb_pp_recycle(skb, head, napi_safe))
return;
skb_free_frag(head);
} else {
......@@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb)
}
}
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
bool napi_safe)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
......@@ -888,13 +889,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
}
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
free_head:
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
skb_free_head(skb);
skb_free_head(skb, napi_safe);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
......@@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
bool napi_safe)
{
skb_release_head_state(skb);
if (likely(skb->head))
skb_release_data(skb, reason);
skb_release_data(skb, reason, napi_safe);
}
/**
......@@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
void __kfree_skb(struct sk_buff *skb)
{
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
......@@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
skb_release_all(skb, reason);
skb_release_all(skb, reason, false);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
......@@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
skb_release_data(skb, SKB_CONSUMED);
skb_release_data(skb, SKB_CONSUMED, false);
kfree_skbmem(skb);
}
......@@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
void __kfree_skb_defer(struct sk_buff *skb)
{
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
napi_skb_cache_put(skb);
}
......@@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
skb_release_all(skb, SKB_CONSUMED);
skb_release_all(skb, SKB_CONSUMED, !!budget);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
......@@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
skb_release_all(dst, SKB_CONSUMED);
skb_release_all(dst, SKB_CONSUMED, false);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
......@@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
skb_release_data(skb, SKB_CONSUMED);
skb_release_data(skb, SKB_CONSUMED, false);
} else {
skb_free_head(skb);
skb_free_head(skb, false);
}
off = (data + nhead) - skb->head;
......@@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
skb_release_data(skb, SKB_CONSUMED);
skb_release_data(skb, SKB_CONSUMED, false);
} else {
/* we can reuse existing recount- all we did was
* relocate values
*/
skb_free_head(skb);
skb_free_head(skb, false);
}
skb->head = data;
......@@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
skb_release_data(skb, SKB_CONSUMED);
skb_release_data(skb, SKB_CONSUMED, false);
skb->head = data;
skb->head_frag = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment