Commit eb05529a authored by Jakub Kicinski

Merge branch 'page_pool-allow-direct-bulk-recycling'

Alexander Lobakin says:

====================
page_pool: allow direct bulk recycling

Previously, there was no reliable way to check whether it's safe to use
the direct PP cache: drivers were passing @allow_direct to the PP
recycling functions and that was it. Bulk recycling is used by
xdp_return_frame_bulk() on .ndo_xdp_xmit() frame completion, where
the page origin is unknown, so direct recycling was never attempted
there.
Now that there are at least two ways of checking whether direct
recycling is allowed -- pool->p.napi (Jakub) and pool->cpuid (Lorenzo) --
they can be used for bulk recycling as well. Just move that logic from
the skb core to the PP core and call it before __page_pool_put_page()
every time @allow_direct is false.
Under high .ndo_xdp_xmit() traffic load, the win is 2-3% in PPS,
assuming the sending driver uses xdp_return_frame_bulk() on Tx
completion.
====================

Link: https://lore.kernel.org/r/20240329165507.3240110-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 8db2509f 39806b96
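
For context on the claim above: the gain assumes the sending driver returns its
.ndo_xdp_xmit() frames through xdp_return_frame_bulk() on Tx completion, so the
bulk path (which now calls page_pool_napi_local() itself) can recycle straight
into the direct cache. Below is a minimal sketch of such a Tx-cleanup path; the
driver ring/buffer structures (mydrv_tx_buf, mydrv_clean_xdp_tx) are hypothetical,
while xdp_frame_bulk_init(), xdp_return_frame_bulk() and xdp_flush_frame_bulk()
are the existing bulk-return helpers from <net/xdp.h>. This is illustrative only
and not part of this commit.

/* Illustrative sketch, not part of this commit: a hypothetical driver's
 * Tx-completion path for .ndo_xdp_xmit() frames, batching frame returns
 * so that page_pool bulk recycling (and, after this series, the
 * direct-cache fast path when page_pool_napi_local() allows it) is used.
 */
#include <net/xdp.h>

struct mydrv_tx_buf {			/* hypothetical per-descriptor state */
	struct xdp_frame *xdpf;
};

static void mydrv_clean_xdp_tx(struct mydrv_tx_buf *bufs, u32 ntc, u32 done,
			       u32 ring_size)
{
	struct xdp_frame_bulk bq;
	u32 i;

	xdp_frame_bulk_init(&bq);
	rcu_read_lock();	/* bulk returns look up the mem allocator under RCU */

	for (i = 0; i < done; i++) {
		struct mydrv_tx_buf *buf = &bufs[(ntc + i) % ring_size];

		/* Queue the frame for bulk return to its page_pool; the
		 * helper flushes on its own when the batch fills up.
		 */
		xdp_return_frame_bulk(buf->xdpf, &bq);
		buf->xdpf = NULL;
	}

	xdp_flush_frame_bulk(&bq);	/* return whatever is still batched */
	rcu_read_unlock();
}

A completion handler shaped like this gets direct recycling for free whenever it
runs in the page_pool's own NAPI/softirq context; otherwise the pages fall back
to the ptr_ring as before.
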
@@ -3510,25 +3510,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 			 unsigned int headroom);
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog);
-bool napi_pp_put_page(struct page *page, bool napi_safe);
+bool napi_pp_put_page(struct page *page);
 
 static inline void
-skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
+skb_page_unref(const struct sk_buff *skb, struct page *page)
 {
 #ifdef CONFIG_PAGE_POOL
-	if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
+	if (skb->pp_recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
 }
 
 static inline void
-napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+napi_frag_unref(skb_frag_t *frag, bool recycle)
 {
 	struct page *page = skb_frag_page(frag);
 
 #ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_put_page(page, napi_safe))
+	if (recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
@@ -3544,7 +3544,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	napi_frag_unref(frag, recycle, false);
+	napi_frag_unref(frag, recycle);
 }
 
 /**
...
@@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);
 
-		if (allow_direct && in_softirq() &&
-		    page_pool_recycle_in_cache(page, pool))
+		if (allow_direct && page_pool_recycle_in_cache(page, pool))
 			return NULL;
 
 		/* Page found as candidate for recycling */
@@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	return NULL;
 }
 
+static bool page_pool_napi_local(const struct page_pool *pool)
+{
+	const struct napi_struct *napi;
+	u32 cpuid;
+
+	if (unlikely(!in_softirq()))
+		return false;
+
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 * __page_pool_put_page() makes sure we're not in hardirq context
+	 * and interrupts are enabled prior to accessing the cache.
+	 */
+	cpuid = smp_processor_id();
+	if (READ_ONCE(pool->cpuid) == cpuid)
+		return true;
+
+	napi = READ_ONCE(pool->p.napi);
+
+	return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 				unsigned int dma_sync_size, bool allow_direct)
 {
+	if (!allow_direct)
+		allow_direct = page_pool_napi_local(pool);
+
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
 		/* Cache full, fallback to free pages */
@@ -747,8 +772,11 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count)
 {
 	int i, bulk_len = 0;
+	bool allow_direct;
 	bool in_softirq;
 
+	allow_direct = page_pool_napi_local(pool);
+
 	for (i = 0; i < count; i++) {
 		struct page *page = virt_to_head_page(data[i]);
 
@@ -756,13 +784,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 		if (!page_pool_is_last_ref(page))
 			continue;
 
-		page = __page_pool_put_page(pool, page, -1, false);
+		page = __page_pool_put_page(pool, page, -1, allow_direct);
 		/* Approved for bulk recycling in ptr_ring cache */
 		if (page)
 			data[bulk_len++] = page;
 	}
 
-	if (unlikely(!bulk_len))
+	if (!bulk_len)
 		return;
 
 	/* Bulk producer into ptr_ring page_pool cache */
@@ -969,7 +997,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 
 static void page_pool_disable_direct_recycling(struct page_pool *pool)
 {
 	/* Disable direct recycling based on pool->cpuid.
-	 * Paired with READ_ONCE() in napi_pp_put_page().
+	 * Paired with READ_ONCE() in page_pool_napi_local().
 	 */
 	WRITE_ONCE(pool->cpuid, -1);
...
@@ -1004,11 +1004,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 EXPORT_SYMBOL(skb_cow_data_for_xdp);
 
 #if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+bool napi_pp_put_page(struct page *page)
 {
-	bool allow_direct = false;
-	struct page_pool *pp;
-
 	page = compound_head(page);
 
 	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
@@ -1021,39 +1018,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
 	if (unlikely(!is_pp_page(page)))
 		return false;
 
-	pp = page->pp;
-
-	/* Allow direct recycle if we have reasons to believe that we are
-	 * in the same context as the consumer would run, so there's
-	 * no possible race.
-	 * __page_pool_put_page() makes sure we're not in hardirq context
-	 * and interrupts are enabled prior to accessing the cache.
-	 */
-	if (napi_safe || in_softirq()) {
-		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
-		unsigned int cpuid = smp_processor_id();
-
-		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
-		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
-	}
-
-	/* Driver set this to memory recycling info. Reset it on recycle.
-	 * This will *not* work for NIC using a split-page memory model.
-	 * The page will be returned to the pool here regardless of the
-	 * 'flipped' fragment being in use or not.
-	 */
-	page_pool_put_full_page(pp, page, allow_direct);
+	page_pool_put_full_page(page->pp, page, false);
 
 	return true;
 }
 EXPORT_SYMBOL(napi_pp_put_page);
 #endif
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return napi_pp_put_page(virt_to_page(data), napi_safe);
+	return napi_pp_put_page(virt_to_page(data));
 }
 
 /**
@@ -1095,12 +1071,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
 		kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head, napi_safe))
+		if (skb_pp_recycle(skb, head))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -1108,8 +1084,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
-			     bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -1126,13 +1101,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb, napi_safe);
+	skb_free_head(skb);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -1193,12 +1168,11 @@ void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
-			    bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason, napi_safe);
+		skb_release_data(skb, reason);
 }
 
 /**
@@ -1212,7 +1186,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -1269,7 +1243,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason, false);
+	skb_release_all(skb, reason);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1443,7 +1417,7 @@ EXPORT_SYMBOL(consume_skb);
 
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 	kfree_skbmem(skb);
 }
@@ -1470,7 +1444,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 
 void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, reason, true);
+	skb_release_all(skb, reason);
 	napi_skb_cache_put(skb);
 }
@@ -1508,7 +1482,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED, !!budget);
+	skb_release_all(skb, SKB_CONSUMED);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1639,7 +1613,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED, false);
+	skb_release_all(dst, SKB_CONSUMED);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -2271,9 +2245,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 	off = (data + nhead) - skb->head;
@@ -6574,12 +6548,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
 		/* we can reuse existing recount- all we did was
 		 * relocate values
 		 */
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 	skb->head = data;
@@ -6714,7 +6688,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 
 	skb->head = data;
 	skb->head_frag = 0;
...
@@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg), false);
+			skb_page_unref(skb, sg_page(sg));
 }
 
 #ifdef CONFIG_INET_ESPINTCP
...
@@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg), false);
+			skb_page_unref(skb, sg_page(sg));
 }
 
 #ifdef CONFIG_INET6_ESPINTCP
...