Commit 0d707a2f authored by Linus Torvalds

Merge branch 'for-4.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

Pull percpu fixes from Tejun Heo:
 "Late percpu pull request for v4.16-rc6.

   - percpu allocator pool replenishing no longer triggers OOM or
     warning messages.

     Also, the alloc interface now understands __GFP_NORETRY and
     __GFP_NOWARN. This is to allow avoiding OOMs from userland
     triggered actions like bpf map creation.

     Also added cond_resched() in alloc loop.

   - percpu allocation can now be interrupted by kill signals to avoid
     deadlocking the OOM killer.

   - Added Dennis Zhou as a co-maintainer.

      He has rewritten the area map allocator, understands most of the
      code base and has been responsive to all bug reports"

* 'for-4.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu:
  percpu_ref: Update doc to dissuade users from depending on internal RCU grace periods
  mm: Allow to kill tasks doing pcpu_alloc() and waiting for pcpu_balance_workfn()
  percpu: include linux/sched.h for cond_resched()
  percpu: add a schedule point in pcpu_balance_workfn()
  percpu: allow select gfp to be passed to underlying allocators
  percpu: add __GFP_NORETRY semantics to the percpu balancing path
  percpu: match chunk allocator declarations with definitions
  percpu: add Dennis Zhou as a percpu co-maintainer
parents efac2483 b3a5d111
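
To illustrate the new allocator semantics described in the pull message above, here is a minimal caller-side sketch (the struct and function names are hypothetical; only alloc_percpu_gfp(), free_percpu() and the gfp flags are existing kernel API). A user-triggerable path, such as BPF map creation, can now pass __GFP_NORETRY and __GFP_NOWARN so that allocation failure is reported to the caller instead of invoking the OOM killer or logging a warning:

#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct foo_stats {
        u64 hits;
        u64 misses;
};

/* Fail fast under memory pressure instead of OOM-killing or warning. */
static struct foo_stats __percpu *foo_alloc_stats(void)
{
        return alloc_percpu_gfp(struct foo_stats,
                                GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
}

static void foo_free_stats(struct foo_stats __percpu *stats)
{
        free_percpu(stats);
}

Because pcpu_alloc() whitelists these flags, they are also forwarded to the chunk and page allocations performed on behalf of such a request, as the diff below shows.
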
@@ -10844,6 +10844,7 @@ F: drivers/platform/x86/peaq-wmi.c
 PER-CPU MEMORY ALLOCATOR
 M: Tejun Heo <tj@kernel.org>
 M: Christoph Lameter <cl@linux.com>
+M: Dennis Zhou <dennisszhou@gmail.com>
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git
 S: Maintained
 F: include/linux/percpu*.h

@@ -30,10 +30,14 @@
  * calls io_destroy() or the process exits.
  *
  * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
- * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
- * the kioctx from the proccess's list of kioctxs - after that, there can't be
- * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
- * the initial ref with percpu_ref_put().
+ * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref.
+ * After that, there can't be any new users of the kioctx (from lookup_ioctx())
+ * and it's then safe to drop the initial ref with percpu_ref_put().
+ *
+ * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
+ * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't
+ * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
+ * with RCU protection, it must be done explicitly.
  *
  * Code that does a two stage shutdown like this often needs some kind of
  * explicit synchronization to ensure the initial refcount can only be dropped
@@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref);
  * Must be used to drop the initial ref on a percpu refcount; must be called
  * precisely once before shutdown.
  *
- * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
- * percpu counters and dropping the initial ref.
+ * Switches @ref into atomic mode before gathering up the percpu counters
+ * and dropping the initial ref.
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 static inline void percpu_ref_kill(struct percpu_ref *ref)
 {

@@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
  * process of switching to atomic mode by percpu_ref_switch_to_atomic().
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                  percpu_ref_func_t *confirm_kill)

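The comment updates above state that percpu_ref implies no RCU grace period between kill and release. A minimal sketch of the pattern they recommend, with hypothetical names (struct foo, foo_release) that are not part of this patch: the release callback defers the actual free through an explicit call_rcu() so that RCU-protected lookups stay safe.

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        struct percpu_ref ref;
        struct rcu_head rcu;
        /* ... object payload ... */
};

static void foo_free_rcu(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct foo, rcu));
}

/* percpu_ref release callback: no RCU grace period has passed on our behalf */
static void foo_release(struct percpu_ref *ref)
{
        struct foo *foo = container_of(ref, struct foo, ref);

        /* synchronize with RCU-protected lookups explicitly */
        call_rcu(&foo->rcu, foo_free_rcu);
}
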
@@ -34,7 +34,7 @@
 #include <linux/log2.h>

 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-                               int page_start, int page_end)
+                               int page_start, int page_end, gfp_t gfp)
 {
         return 0;
 }
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
         /* nada */
 }

-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
         const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
         struct pcpu_chunk *chunk;
         struct page *pages;
         int i;

-        chunk = pcpu_alloc_chunk();
+        chunk = pcpu_alloc_chunk(gfp);
         if (!chunk)
                 return NULL;

-        pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+        pages = alloc_pages(gfp, order_base_2(nr_pages));
         if (!pages) {
                 pcpu_free_chunk(chunk);
                 return NULL;

@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
         lockdep_assert_held(&pcpu_alloc_mutex);

         if (!pages)
-                pages = pcpu_mem_zalloc(pages_size);
+                pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
         return pages;
 }
@@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
  * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
  * @page_start: page index of the first page to be allocated
  * @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
  *
  * Allocate pages [@page_start,@page_end) into @pages for all units.
  * The allocation is for @chunk. Percpu core doesn't care about the
  * content of @pages and will pass it verbatim to pcpu_map_pages().
  */
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
-                            struct page **pages, int page_start, int page_end)
+                            struct page **pages, int page_start, int page_end,
+                            gfp_t gfp)
 {
-        const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
         unsigned int cpu, tcpu;
         int i;

+        gfp |= __GFP_HIGHMEM;
+
         for_each_possible_cpu(cpu) {
                 for (i = page_start; i < page_end; i++) {
                         struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
@@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
  * @chunk: chunk of interest
  * @page_start: the start page
  * @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.
@@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
  * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-                               int page_start, int page_end)
+                               int page_start, int page_end, gfp_t gfp)
 {
         struct page **pages;
@@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
         if (!pages)
                 return -ENOMEM;

-        if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+        if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
                 return -ENOMEM;

         if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
@@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
         pcpu_free_pages(chunk, pages, page_start, page_end);
 }

-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
         struct pcpu_chunk *chunk;
         struct vm_struct **vms;

-        chunk = pcpu_alloc_chunk();
+        chunk = pcpu_alloc_chunk(gfp);
         if (!chunk)
                 return NULL;

@@ -80,6 +80,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/kmemleak.h>
+#include <linux/sched.h>

 #include <asm/cacheflush.h>
 #include <asm/sections.h>
@@ -447,26 +448,25 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
 /**
  * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
+ * @gfp: allocation flags
  *
  * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
  *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
 {
         if (WARN_ON_ONCE(!slab_is_available()))
                 return NULL;

         if (size <= PAGE_SIZE)
-                return kzalloc(size, GFP_KERNEL);
+                return kzalloc(size, gfp);
         else
-                return vzalloc(size);
+                return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
 }

 /**
@@ -1154,12 +1154,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
         return chunk;
 }

-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 {
         struct pcpu_chunk *chunk;
         int region_bits;

-        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
         if (!chunk)
                 return NULL;
@@ -1168,17 +1168,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
         region_bits = pcpu_chunk_map_bits(chunk);

         chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
-                                           sizeof(chunk->alloc_map[0]));
+                                           sizeof(chunk->alloc_map[0]), gfp);
         if (!chunk->alloc_map)
                 goto alloc_map_fail;

         chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
-                                           sizeof(chunk->bound_map[0]));
+                                           sizeof(chunk->bound_map[0]), gfp);
         if (!chunk->bound_map)
                 goto bound_map_fail;

         chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
-                                           sizeof(chunk->md_blocks[0]));
+                                           sizeof(chunk->md_blocks[0]), gfp);
         if (!chunk->md_blocks)
                 goto md_blocks_fail;
@@ -1277,9 +1277,11 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  * pcpu_addr_to_page - translate address to physical address
  * pcpu_verify_alloc_info - check alloc_info is acceptable during init
  */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+                               int page_start, int page_end, gfp_t gfp);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+                                  int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1339,6 +1341,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                  gfp_t gfp)
 {
+        /* whitelisted flags that can be passed to the backing allocators */
+        gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
         bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
         bool do_warn = !(gfp & __GFP_NOWARN);
         static int warn_limit = 10;
@@ -1369,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                 return NULL;
         }

-        if (!is_atomic)
-                mutex_lock(&pcpu_alloc_mutex);
+        if (!is_atomic) {
+                /*
+                 * pcpu_balance_workfn() allocates memory under this mutex,
+                 * and it may wait for memory reclaim. Allow current task
+                 * to become OOM victim, in case of memory pressure.
+                 */
+                if (gfp & __GFP_NOFAIL)
+                        mutex_lock(&pcpu_alloc_mutex);
+                else if (mutex_lock_killable(&pcpu_alloc_mutex))
+                        return NULL;
+        }

         spin_lock_irqsave(&pcpu_lock, flags);
@@ -1421,7 +1434,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
         }

         if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-                chunk = pcpu_create_chunk();
+                chunk = pcpu_create_chunk(pcpu_gfp);
                 if (!chunk) {
                         err = "failed to allocate new chunk";
                         goto fail;
@@ -1450,7 +1463,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                    page_start, page_end) {
                 WARN_ON(chunk->immutable);

-                ret = pcpu_populate_chunk(chunk, rs, re);
+                ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                 spin_lock_irqsave(&pcpu_lock, flags);
                 if (ret) {
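
As a caller-side illustration of the locking change above (hypothetical function, not from the patch): a request whose gfp mask does not contain all of GFP_KERNEL, for example GFP_NOWAIT, is treated as atomic, never takes pcpu_alloc_mutex, and is served from pages pcpu_balance_workfn() keeps populated. The killable-lock path therefore only affects sleepable GFP_KERNEL callers, which can now be aborted by a fatal signal instead of blocking behind reclaim.

#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* Illustrative only: a percpu allocation from a context that cannot sleep. */
static u64 __percpu *counter_alloc_nowait(void)
{
        /*
         * (GFP_NOWAIT & GFP_KERNEL) != GFP_KERNEL, so pcpu_alloc() treats
         * this as atomic and skips the (now killable) mutex entirely.
         */
        return alloc_percpu_gfp(u64, GFP_NOWAIT);
}
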
@@ -1561,10 +1574,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
- * Reclaim all fully free chunks except for the first one.
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
  */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
+        /* gfp flags passed to underlying allocators */
+        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
         LIST_HEAD(to_free);
         struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
@@ -1600,6 +1620,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
                         spin_unlock_irq(&pcpu_lock);
                 }
                 pcpu_destroy_chunk(chunk);
+                cond_resched();
         }

         /*
@@ -1645,7 +1666,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
                                            chunk->nr_pages) {
                         int nr = min(re - rs, nr_to_pop);

-                        ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                         if (!ret) {
                                 nr_to_pop -= nr;
                                 spin_lock_irq(&pcpu_lock);
@@ -1662,7 +1683,7 @@ static void pcpu_balance_workfn(struct work_struct *work)

         if (nr_to_pop) {
                 /* ran out of chunks to populate, create a new one and retry */
-                chunk = pcpu_create_chunk();
+                chunk = pcpu_create_chunk(gfp);
                 if (chunk) {
                         spin_lock_irq(&pcpu_lock);
                         pcpu_chunk_relocate(chunk, -1);