Commit a7cbfd05 authored by Linus Torvalds

Merge branch 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

Pull percpu updates from Tejun Heo:
 "A lot of changes for percpu this time around. percpu inherited the
  same area allocator from the original pre-virtual-address-mapped
  implementation. This was from the time when percpu allocator wasn't
  used all that much and the implementation was focused on simplicity,
  with the unfortunate computational complexity of O(number of areas
  allocated from the chunk) per alloc / free.

  With the increase in percpu usage, we're hitting cases where the lack
  of scalability is hurting. The most prominent one right now is bpf
  percpu map creation / destruction which may allocate and free a lot of
  entries consecutively and it's likely that the problem will become
  more prominent in the future.

  To address the issue, Dennis replaced the area allocator with hinted
  bitmap allocator which is more consistent. While the new allocator
  does perform a bit worse in some cases, it outperforms the old
  allocator way more than an order of magnitude in other more common
  scenarios while staying mostly flat in CPU overhead and completely
  flat in memory consumption"

* 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (27 commits)
  percpu: update header to contain bitmap allocator explanation.
  percpu: update pcpu_find_block_fit to use an iterator
  percpu: use metadata blocks to update the chunk contig hint
  percpu: update free path to take advantage of contig hints
  percpu: update alloc path to only scan if contig hints are broken
  percpu: keep track of the best offset for contig hints
  percpu: skip chunks if the alloc does not fit in the contig hint
  percpu: add first_bit to keep track of the first free in the bitmap
  percpu: introduce bitmap metadata blocks
  percpu: replace area map allocator with bitmap
  percpu: generalize bitmap (un)populated iterators
  percpu: increase minimum percpu allocation size and align first regions
  percpu: introduce nr_empty_pop_pages to help empty page accounting
  percpu: change the number of pages marked in the first_chunk pop bitmap
  percpu: combine percpu address checks
  percpu: modify base_addr to be region specific
  percpu: setup_first_chunk rename schunk/dchunk to chunk
  percpu: end chunk area maps page aligned for the populated bitmap
  percpu: unify allocation of schunk and dchunk
  percpu: setup_first_chunk remove dyn_size and consolidate logic
  ...
parents d34fc1ad 5e81ee3e
...@@ -21,6 +21,25 @@ ...@@ -21,6 +21,25 @@
/* minimum unit size, also is the maximum supported allocation size */ /* minimum unit size, also is the maximum supported allocation size */
#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
/*
 * Minimum allocation size in bytes (PCPU_MIN_ALLOC_SIZE) and its log2
 * (PCPU_MIN_ALLOC_SHIFT): 1 << 2 = 4 bytes.
 */
#define PCPU_MIN_ALLOC_SHIFT 2
#define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT)
/*
 * Number of bits per page (PAGE_SIZE expressed in units of
 * PCPU_MIN_ALLOC_SIZE), used to trigger a scan if blocks are > PAGE_SIZE.
 */
#define PCPU_BITS_PER_PAGE (PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT)
/*
 * This determines the size of each metadata block.  There are several subtle
 * constraints around this constant:
 *
 *  - The reserved region must be a multiple of PCPU_BITMAP_BLOCK_SIZE.
 *  - PCPU_BITMAP_BLOCK_SIZE must be a multiple of PAGE_SIZE, or PAGE_SIZE
 *    must be a multiple of PCPU_BITMAP_BLOCK_SIZE, to align with the
 *    populated page map.
 *  - The unit_size also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to
 *    ensure full blocks.
 *
 * PCPU_BITMAP_BLOCK_BITS is the block size expressed in units of
 * PCPU_MIN_ALLOC_SIZE.
 */
#define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE
#define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \
PCPU_MIN_ALLOC_SHIFT)
/* /*
* Percpu allocator can serve percpu allocations before slab is * Percpu allocator can serve percpu allocations before slab is
* initialized which allows slab to depend on the percpu allocator. * initialized which allows slab to depend on the percpu allocator.
...@@ -116,7 +135,6 @@ extern bool is_kernel_percpu_address(unsigned long addr); ...@@ -116,7 +135,6 @@ extern bool is_kernel_percpu_address(unsigned long addr);
#if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
extern void __init setup_per_cpu_areas(void); extern void __init setup_per_cpu_areas(void);
#endif #endif
extern void __init percpu_init_late(void);
extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
extern void __percpu *__alloc_percpu(size_t size, size_t align); extern void __percpu *__alloc_percpu(size_t size, size_t align);
......
...@@ -501,7 +501,6 @@ static void __init mm_init(void) ...@@ -501,7 +501,6 @@ static void __init mm_init(void)
page_ext_init_flatmem(); page_ext_init_flatmem();
mem_init(); mem_init();
kmem_cache_init(); kmem_cache_init();
percpu_init_late();
pgtable_init(); pgtable_init();
vmalloc_init(); vmalloc_init();
ioremap_huge_init(); ioremap_huge_init();
......
...@@ -4,6 +4,22 @@ ...@@ -4,6 +4,22 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/percpu.h> #include <linux/percpu.h>
/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks, and one
 * pcpu_block_md describes one such block.
 * All units are in terms of bits.
 */
struct pcpu_block_md {
/* contig hint for block */
int contig_hint;
/* block relative starting position of the contig hint */
int contig_hint_start;
/* size of free space along the left side of the block */
int left_free;
/* size of free space along the right side of the block */
int right_free;
/* block position of the first free bit */
int first_free;
};
struct pcpu_chunk { struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS #ifdef CONFIG_PERCPU_STATS
int nr_alloc; /* # of allocations */ int nr_alloc; /* # of allocations */
...@@ -11,24 +27,29 @@ struct pcpu_chunk { ...@@ -11,24 +27,29 @@ struct pcpu_chunk {
#endif #endif
struct list_head list; /* linked to pcpu_slot lists */ struct list_head list; /* linked to pcpu_slot lists */
int free_size; /* free bytes in the chunk */ int free_bytes; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */ int contig_bits; /* max contiguous size hint */
int contig_bits_start; /* contig_bits starting
offset */
void *base_addr; /* base address of this chunk */ void *base_addr; /* base address of this chunk */
int map_used; /* # of map entries used before the sentry */ unsigned long *alloc_map; /* allocation map */
int map_alloc; /* # of map entries allocated */ unsigned long *bound_map; /* boundary map */
int *map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */
struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
void *data; /* chunk data */ void *data; /* chunk data */
int first_free; /* no free below this */ int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */ bool immutable; /* no [de]population allowed */
bool has_reserved; /* Indicates if chunk has reserved space int start_offset; /* the overlap with the previous
at the beginning. Reserved chunk will region to have a page aligned
contain reservation for static chunk. base_addr */
Dynamic chunk will contain reservation int end_offset; /* additional area required to
for static and reserved chunks. */ have the region end page
aligned */
int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */ int nr_populated; /* # of populated pages */
int nr_empty_pop_pages; /* # of empty populated pages */
unsigned long populated[]; /* populated bitmap */ unsigned long populated[]; /* populated bitmap */
}; };
...@@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock; ...@@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_slot; extern struct list_head *pcpu_slot;
extern int pcpu_nr_slots; extern int pcpu_nr_slots;
extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk;
/**
 * pcpu_chunk_nr_blocks - number of md_blocks covering a chunk
 * @chunk: chunk of interest
 *
 * Scales the number of physical pages the chunk serves (nr_pages) by
 * PAGE_SIZE and divides by PCPU_BITMAP_BLOCK_SIZE, yielding the number
 * of bitmap blocks used.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
	const int nr_pages = chunk->nr_pages;

	return nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}
/**
 * pcpu_nr_pages_to_map_bits - bitmap size, in bits, for a page count
 * @pages: number of physical pages
 *
 * Each bit stands for PCPU_MIN_ALLOC_SIZE bytes, so the bitmap needs
 * pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE bits.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
	int nr_bits = pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;

	return nr_bits;
}
/**
 * pcpu_chunk_map_bits - bitmap size, in bits, for a chunk
 * @chunk: chunk of interest
 *
 * Feeds the number of physical pages the chunk serves (nr_pages) to
 * pcpu_nr_pages_to_map_bits() and returns the resulting number of bits
 * in the bitmap.
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
	const int nr_pages = chunk->nr_pages;

	return pcpu_nr_pages_to_map_bits(nr_pages);
}
#ifdef CONFIG_PERCPU_STATS #ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h> #include <linux/spinlock.h>
......
...@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) ...@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
spin_lock_irq(&pcpu_lock); spin_lock_irq(&pcpu_lock);
pcpu_chunk_populated(chunk, 0, nr_pages); pcpu_chunk_populated(chunk, 0, nr_pages, false);
spin_unlock_irq(&pcpu_lock); spin_unlock_irq(&pcpu_lock);
pcpu_stats_chunk_alloc(); pcpu_stats_chunk_alloc();
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "percpu-internal.h" #include "percpu-internal.h"
#define P(X, Y) \ #define P(X, Y) \
seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
struct percpu_stats pcpu_stats; struct percpu_stats pcpu_stats;
struct pcpu_alloc_info pcpu_stats_ai; struct pcpu_alloc_info pcpu_stats_ai;
...@@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b) ...@@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b)
} }
/* /*
* Iterates over all chunks to find the max # of map entries used. * Iterates over all chunks to find the max nr_alloc entries.
*/ */
static int find_max_map_used(void) static int find_max_nr_alloc(void)
{ {
struct pcpu_chunk *chunk; struct pcpu_chunk *chunk;
int slot, max_map_used; int slot, max_nr_alloc;
max_map_used = 0; max_nr_alloc = 0;
for (slot = 0; slot < pcpu_nr_slots; slot++) for (slot = 0; slot < pcpu_nr_slots; slot++)
list_for_each_entry(chunk, &pcpu_slot[slot], list) list_for_each_entry(chunk, &pcpu_slot[slot], list)
max_map_used = max(max_map_used, chunk->map_used); max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
return max_map_used; return max_nr_alloc;
} }
/* /*
* Prints out chunk state. Fragmentation is considered between * Prints out chunk state. Fragmentation is considered between
* the beginning of the chunk to the last allocation. * the beginning of the chunk to the last allocation.
*
* All statistics are in bytes unless stated otherwise.
*/ */
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
void *buffer) int *buffer)
{ {
int i, s_index, last_alloc, alloc_sign, as_len; int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p; int *alloc_sizes, *p;
/* statistics */ /* statistics */
int sum_frag = 0, max_frag = 0; int sum_frag = 0, max_frag = 0;
int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
alloc_sizes = buffer; alloc_sizes = buffer;
s_index = chunk->has_reserved ? 1 : 0;
/* find last allocation */
last_alloc = -1;
for (i = chunk->map_used - 1; i >= s_index; i--) {
if (chunk->map[i] & 1) {
last_alloc = i;
break;
}
}
/* if the chunk is not empty - ignoring reserve */ /*
if (last_alloc >= s_index) { * find_last_bit returns the start value if nothing found.
as_len = last_alloc + 1 - s_index; * Therefore, we must determine if it is a failure of find_last_bit
* and set the appropriate value.
/* */
* Iterate through chunk map computing size info. last_alloc = find_last_bit(chunk->alloc_map,
* The first bit is overloaded to be a used flag. pcpu_chunk_map_bits(chunk) -
* negative = free space, positive = allocated chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
*/ last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { last_alloc + 1 : 0;
alloc_sign = (*p & 1) ? 1 : -1;
alloc_sizes[i] = alloc_sign * as_len = 0;
((p[1] & ~1) - (p[0] & ~1)); start = chunk->start_offset;
/*
* If a bit is set in the allocation map, the bound_map identifies
* where the allocation ends. If the allocation is not set, the
* bound_map does not identify free areas as it is only kept accurate
* on allocation, not free.
*
* Positive values are allocations and negative values are free
* fragments.
*/
while (start < last_alloc) {
if (test_bit(start, chunk->alloc_map)) {
end = find_next_bit(chunk->bound_map, last_alloc,
start + 1);
alloc_sizes[as_len] = 1;
} else {
end = find_next_bit(chunk->alloc_map, last_alloc,
start + 1);
alloc_sizes[as_len] = -1;
} }
sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
start = end;
}
/*
* The negative values are free fragments and thus sorting gives the
* free fragments at the beginning in largest first order.
*/
if (as_len > 0) {
sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
/* Iterate through the unallocated fragements. */ /* iterate through the unallocated fragments */
for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
sum_frag -= *p; sum_frag -= *p;
max_frag = max(max_frag, -1 * (*p)); max_frag = max(max_frag, -1 * (*p));
...@@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, ...@@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("nr_alloc", chunk->nr_alloc); P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size); P("max_alloc_size", chunk->max_alloc_size);
P("free_size", chunk->free_size); P("empty_pop_pages", chunk->nr_empty_pop_pages);
P("contig_hint", chunk->contig_hint); P("first_bit", chunk->first_bit);
P("free_bytes", chunk->free_bytes);
P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag); P("sum_frag", sum_frag);
P("max_frag", max_frag); P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc); P("cur_min_alloc", cur_min_alloc);
...@@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, ...@@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
static int percpu_stats_show(struct seq_file *m, void *v) static int percpu_stats_show(struct seq_file *m, void *v)
{ {
struct pcpu_chunk *chunk; struct pcpu_chunk *chunk;
int slot, max_map_used; int slot, max_nr_alloc;
void *buffer; int *buffer;
alloc_buffer: alloc_buffer:
spin_lock_irq(&pcpu_lock); spin_lock_irq(&pcpu_lock);
max_map_used = find_max_map_used(); max_nr_alloc = find_max_nr_alloc();
spin_unlock_irq(&pcpu_lock); spin_unlock_irq(&pcpu_lock);
buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); /* there can be at most this many free and allocated fragments */
buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int));
if (!buffer) if (!buffer)
return -ENOMEM; return -ENOMEM;
spin_lock_irq(&pcpu_lock); spin_lock_irq(&pcpu_lock);
/* if the buffer allocated earlier is too small */ /* if the buffer allocated earlier is too small */
if (max_map_used < find_max_map_used()) { if (max_nr_alloc < find_max_nr_alloc()) {
spin_unlock_irq(&pcpu_lock); spin_unlock_irq(&pcpu_lock);
vfree(buffer); vfree(buffer);
goto alloc_buffer; goto alloc_buffer;
} }
#define PL(X) \ #define PL(X) \
seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
seq_printf(m, seq_printf(m,
"Percpu Memory Statistics\n" "Percpu Memory Statistics\n"
...@@ -151,7 +175,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) ...@@ -151,7 +175,7 @@ static int percpu_stats_show(struct seq_file *m, void *v)
#undef PL #undef PL
#define PU(X) \ #define PU(X) \
seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
seq_printf(m, seq_printf(m,
"Global Stats:\n" "Global Stats:\n"
...@@ -164,6 +188,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) ...@@ -164,6 +188,7 @@ static int percpu_stats_show(struct seq_file *m, void *v)
PU(nr_max_chunks); PU(nr_max_chunks);
PU(min_alloc_size); PU(min_alloc_size);
PU(max_alloc_size); PU(max_alloc_size);
P("empty_pop_pages", pcpu_nr_empty_pop_pages);
seq_putc(m, '\n'); seq_putc(m, '\n');
#undef PU #undef PU
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment