Commit dd6b3d93 authored by Andrew Morton, committed by Ben Collins

[PATCH] magazine layer for slab

From: Manfred Spraul <manfred@colorfullife.com>

slab.c is not very efficient for passing objects between cpus.  Usually this
is a rare event, but with network routing and cpu-affine NICs it is possible
that nearly all allocation operations will occur on one cpu, and nearly all
free operations on another cpu.

This causes slab memory to be returned to slab's free page list rather than
being cached on behalf of the particular slab cache.

The attached patch solves that by adding an array of objects that is shared
by all cpus.  Instead of multiple linked list operations per object, object
pointers are now passed to/from the shared array (and thus between cpus) with
memcpy operations.  On uniprocessor, the default shared-array size is 0, because
the shared array and the per-cpu head array are redundant.
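The handoff is small enough to sketch outside the kernel.  The user-space C
program below illustrates the idea; the struct layout and the helper names
(push_to_shared, pull_from_shared) are illustrative assumptions that only
mirror the patch's array_cache logic, with all locking omitted:

/*
 * Minimal user-space sketch of the batch handoff added by this patch.
 * The names echo the kernel's array_cache, but the fixed-size arrays and
 * the missing spinlocks make this an illustration, not kernel code.
 */
#include <stdio.h>
#include <string.h>

#define LIMIT 8

struct array_cache {
	unsigned int avail;	/* object pointers currently cached */
	unsigned int limit;	/* capacity of entry[] */
	void *entry[LIMIT];	/* cached object pointers */
};

/* Free path: push up to batchcount pointers from a per-cpu array into the
 * shared array with one memcpy, as cache_flusharray() does under
 * cachep->spinlock in the patch. */
static unsigned int push_to_shared(struct array_cache *shared,
				   struct array_cache *ac,
				   unsigned int batchcount)
{
	unsigned int max = shared->limit - shared->avail;

	if (batchcount > max)
		batchcount = max;
	if (!batchcount)
		return 0;
	memcpy(&shared->entry[shared->avail], &ac->entry[0],
	       sizeof(void *) * batchcount);
	shared->avail += batchcount;
	ac->avail -= batchcount;
	memmove(&ac->entry[0], &ac->entry[batchcount],
		sizeof(void *) * ac->avail);
	return batchcount;
}

/* Alloc path: pull a batch back out of the shared array, as
 * cache_alloc_refill() does when shared_array->avail is non-zero. */
static unsigned int pull_from_shared(struct array_cache *ac,
				     struct array_cache *shared,
				     unsigned int batchcount)
{
	if (batchcount > shared->avail)
		batchcount = shared->avail;
	if (!batchcount)
		return 0;
	shared->avail -= batchcount;
	memcpy(&ac->entry[ac->avail], &shared->entry[shared->avail],
	       sizeof(void *) * batchcount);
	ac->avail += batchcount;
	return batchcount;
}

int main(void)
{
	static int objs[4];
	struct array_cache freeing_cpu = { .avail = 4, .limit = LIMIT,
		.entry = { &objs[0], &objs[1], &objs[2], &objs[3] } };
	struct array_cache alloc_cpu = { .avail = 0, .limit = LIMIT };
	struct array_cache shared = { .avail = 0, .limit = LIMIT };

	push_to_shared(&shared, &freeing_cpu, 4);	/* "free" cpu drains to shared */
	pull_from_shared(&alloc_cpu, &shared, 2);	/* "alloc" cpu refills from shared */
	printf("shared=%u alloc_cpu=%u freeing_cpu=%u\n",
	       shared.avail, alloc_cpu.avail, freeing_cpu.avail);
	return 0;
}

In the real patch the same memcpy runs under cachep->spinlock, and the array
sizes come from the per-cache limit/batchcount/shared tunables.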

Additionally, the patch exports more statistics in /proc/slabinfo and makes
the array sizes tunable by writing to /proc/slabinfo.  Both changes break
backward compatibility; user-space scripts must check the slabinfo version
and act accordingly.

The size of the new shared array may be altered at runtime by writing to
/proc/slabinfo.

The new parameters for writing to /proc/slabinfo are:

	#echo "cache-name limit batchcount shared" > /proc/slabinfo

For example "size-4096 120 60 8" improves the slab efficiency for network
routing, because the default values (24 12 8) are too small for the large
series generated due to irq mitigation.  Note that only root has write
permissions to /proc/slabinfo.
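For reference, the checks applied to these values can be sketched in plain C.
The helper name parse_tunables below is hypothetical; the parsing and the
range checks mirror the new slabinfo_write() code in this patch (a cache
name, then sscanf(" %d %d %d", ...), rejected unless 1 <= batchcount <= limit
and shared >= 0):

/*
 * Stand-alone sketch of the tunable parsing/validation added to
 * slabinfo_write().  parse_tunables() is an invented helper name; the
 * logic follows the patch, but this is not the kernel function itself.
 */
#include <stdio.h>
#include <string.h>

static int parse_tunables(char *kbuf, char *name, size_t namelen,
			  int *limit, int *batchcount, int *shared)
{
	char *tmp = strchr(kbuf, ' ');	/* split "cache-name" from the numbers */

	if (!tmp)
		return -1;		/* kernel returns -EINVAL here */
	*tmp = '\0';
	tmp++;
	if (sscanf(tmp, " %d %d %d", limit, batchcount, shared) != 3)
		return -1;
	if (*limit < 1 || *batchcount < 1 ||
	    *batchcount > *limit || *shared < 0)
		return -1;		/* same range checks as the patch */
	snprintf(name, namelen, "%s", kbuf);
	return 0;
}

int main(void)
{
	char line[] = "size-4096 120 60 8";
	char name[64];
	int limit, batchcount, shared;

	if (parse_tunables(line, name, sizeof(name),
			   &limit, &batchcount, &shared) == 0)
		printf("%s: limit=%d batchcount=%d shared=%d\n",
		       name, limit, batchcount, shared);
	return 0;
}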

These changes provided an overall 12% speedup in Robert Olson's gigE
packet-forwarding tests on a 2-way system.
parent fa6ead8c
@@ -210,6 +210,7 @@ struct kmem_list3 {
 	unsigned long	free_objects;
 	int		free_touched;
 	unsigned long	next_reap;
+	struct array_cache	*shared;
 };

 #define LIST3_INIT(parent) \
@@ -1209,6 +1210,7 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
 }

 static void free_block (kmem_cache_t* cachep, void** objpp, int len);
+static void drain_array_locked(kmem_cache_t* cachep, struct array_cache *ac);

 static void do_drain(void *arg)
 {
@@ -1217,13 +1219,20 @@ static void do_drain(void *arg)
 	check_irq_off();
 	ac = ac_data(cachep);
+	spin_lock(&cachep->spinlock);
 	free_block(cachep, &ac_entry(ac)[0], ac->avail);
+	spin_unlock(&cachep->spinlock);
 	ac->avail = 0;
 }

 static void drain_cpu_caches(kmem_cache_t *cachep)
 {
 	smp_call_function_all_cpus(do_drain, cachep);
+	check_irq_on();
+	spin_lock_irq(&cachep->spinlock);
+	if (cachep->lists.shared)
+		drain_array_locked(cachep, cachep->lists.shared);
+	spin_unlock_irq(&cachep->spinlock);
 }
@@ -1321,6 +1330,8 @@ int kmem_cache_destroy (kmem_cache_t * cachep)
 		for (i = 0; i < NR_CPUS; i++)
 			kfree(cachep->array[i]);
 		/* NUMA: free the list3 structures */
+		kfree(cachep->lists.shared);
+		cachep->lists.shared = NULL;
 	}
 	kmem_cache_free(&cache_cache, cachep);
@@ -1663,6 +1674,19 @@ static void* cache_alloc_refill(kmem_cache_t* cachep, int flags)
 	BUG_ON(ac->avail > 0);
 	spin_lock(&cachep->spinlock);
+	if (l3->shared) {
+		struct array_cache *shared_array = l3->shared;
+		if (shared_array->avail) {
+			if (batchcount > shared_array->avail)
+				batchcount = shared_array->avail;
+			shared_array->avail -= batchcount;
+			ac->avail = batchcount;
+			memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
+					sizeof(void*)*batchcount);
+			shared_array->touched = 1;
+			goto alloc_done;
+		}
+	}
 	while (batchcount > 0) {
 		struct list_head *entry;
 		struct slab *slabp;
@@ -1686,6 +1710,7 @@ static void* cache_alloc_refill(kmem_cache_t* cachep, int flags)
 must_grow:
 	l3->free_objects -= ac->avail;
+alloc_done:
 	spin_unlock(&cachep->spinlock);

 	if (unlikely(!ac->avail)) {
@@ -1784,13 +1809,11 @@ static inline void * __cache_alloc (kmem_cache_t *cachep, int flags)
  * the l3 structure
  */
-static inline void
-__free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 {
 	int i;

-	check_irq_off();
-	spin_lock(&cachep->spinlock);
+	check_spinlock_acquired(cachep);

 	/* NUMA: move add into loop */
 	cachep->lists.free_objects += nr_objects;
@@ -1828,12 +1851,6 @@ __free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 				&list3_data_ptr(cachep, objp)->slabs_partial);
 		}
 	}
-	spin_unlock(&cachep->spinlock);
-}
-
-static void free_block(kmem_cache_t* cachep, void** objpp, int len)
-{
-	__free_block(cachep, objpp, len);
 }

 static void cache_flusharray (kmem_cache_t* cachep, struct array_cache *ac)
@@ -1845,14 +1862,28 @@ static void cache_flusharray (kmem_cache_t* cachep, struct array_cache *ac)
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
-	__free_block(cachep, &ac_entry(ac)[0], batchcount);
+	spin_lock(&cachep->spinlock);
+	if (cachep->lists.shared) {
+		struct array_cache *shared_array = cachep->lists.shared;
+		int max = shared_array->limit-shared_array->avail;
+		if (max) {
+			if (batchcount > max)
+				batchcount = max;
+			memcpy(&ac_entry(shared_array)[shared_array->avail],
+					&ac_entry(ac)[0],
+					sizeof(void*)*batchcount);
+			shared_array->avail += batchcount;
+			goto free_done;
+		}
+	}
+	free_block(cachep, &ac_entry(ac)[0], batchcount);
+free_done:
 #if STATS
 	{
 		int i = 0;
 		struct list_head *p;
-		spin_lock(&cachep->spinlock);

 		p = list3_data(cachep)->slabs_free.next;
 		while (p != &(list3_data(cachep)->slabs_free)) {
 			struct slab *slabp;
@@ -1864,9 +1895,9 @@ static void cache_flusharray (kmem_cache_t* cachep, struct array_cache *ac)
 			p = p->next;
 		}
 		STATS_SET_FREEABLE(cachep, i);
-		spin_unlock(&cachep->spinlock);
 	}
 #endif
+	spin_unlock(&cachep->spinlock);
 	ac->avail -= batchcount;
 	memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
 			sizeof(void*)*ac->avail);
@@ -2107,9 +2138,10 @@ static void do_ccupdate_local(void *info)
 }

-static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
+static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared)
 {
 	struct ccupdate_struct new;
+	struct array_cache *new_shared;
 	int i;

 	memset(&new.new,0,sizeof(new.new));
@@ -2143,11 +2175,29 @@ static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
 		struct array_cache *ccold = new.new[i];
 		if (!ccold)
 			continue;
-		local_irq_disable();
+		spin_lock_irq(&cachep->spinlock);
 		free_block(cachep, ac_entry(ccold), ccold->avail);
-		local_irq_enable();
+		spin_unlock_irq(&cachep->spinlock);
 		kfree(ccold);
 	}
+	new_shared = kmalloc(sizeof(void*)*batchcount*shared+
+				sizeof(struct array_cache), GFP_KERNEL);
+	if (new_shared) {
+		struct array_cache *old;
+
+		new_shared->avail = 0;
+		new_shared->limit = batchcount*shared;
+		new_shared->batchcount = 0xbaadf00d;
+		new_shared->touched = 0;
+
+		spin_lock_irq(&cachep->spinlock);
+		old = cachep->lists.shared;
+		cachep->lists.shared = new_shared;
+		if (old)
+			free_block(cachep, ac_entry(old), old->avail);
+		spin_unlock_irq(&cachep->spinlock);
+		kfree(old);
+	}

 	return 0;
 }
@@ -2155,7 +2205,7 @@ static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
 static void enable_cpucache (kmem_cache_t *cachep)
 {
 	int err;
-	int limit;
+	int limit, shared;

 	/* The head array serves three purposes:
 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -2170,11 +2220,25 @@ static void enable_cpucache (kmem_cache_t *cachep)
 	else if (cachep->objsize > PAGE_SIZE)
 		limit = 8;
 	else if (cachep->objsize > 1024)
-		limit = 54;
+		limit = 24;
 	else if (cachep->objsize > 256)
-		limit = 120;
+		limit = 54;
 	else
-		limit = 248;
+		limit = 120;
+
+	/* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
+	 * allocation behaviour: Most allocs on one cpu, most free operations
+	 * on another cpu. For these cases, an efficient object passing between
+	 * cpus is necessary. This is provided by a shared array. The array
+	 * replaces Bonwick's magazine layer.
+	 * On uniprocessor, it's functionally equivalent (but less efficient)
+	 * to a larger limit. Thus disabled by default.
+	 */
+	shared = 0;
+#ifdef CONFIG_SMP
+	if (cachep->objsize <= PAGE_SIZE)
+		shared = 8;
+#endif

 #if DEBUG
 	/* With debugging enabled, large batchcount lead to excessively
@@ -2184,12 +2248,53 @@ static void enable_cpucache (kmem_cache_t *cachep)
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit+1)/2);
+	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 					cachep->name, -err);
 }

+static void drain_array(kmem_cache_t *cachep, struct array_cache *ac)
+{
+	int tofree;
+
+	check_irq_off();
+	if (ac->touched) {
+		ac->touched = 0;
+	} else if (ac->avail) {
+		tofree = (ac->limit+4)/5;
+		if (tofree > ac->avail) {
+			tofree = (ac->avail+1)/2;
+		}
+		spin_lock(&cachep->spinlock);
+		free_block(cachep, ac_entry(ac), tofree);
+		spin_unlock(&cachep->spinlock);
+		ac->avail -= tofree;
+		memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
+					sizeof(void*)*ac->avail);
+	}
+}
+
+static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac)
+{
+	int tofree;
+
+	check_spinlock_acquired(cachep);
+	if (ac->touched) {
+		ac->touched = 0;
+	} else if (ac->avail) {
+		tofree = (ac->limit+4)/5;
+		if (tofree > ac->avail) {
+			tofree = (ac->avail+1)/2;
+		}
+		free_block(cachep, ac_entry(ac), tofree);
+		ac->avail -= tofree;
+		memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
+					sizeof(void*)*ac->avail);
+	}
+}
+
 /**
  * cache_reap - Reclaim memory from caches.
  *
@@ -2216,7 +2321,6 @@ static inline void cache_reap (void)
 		kmem_cache_t *searchp;
 		struct list_head* p;
 		int tofree;
-		struct array_cache *ac;
 		struct slab *slabp;

 		searchp = list_entry(walk, kmem_cache_t, next);
@@ -2226,19 +2330,8 @@ static inline void cache_reap (void)
 		check_irq_on();
 		local_irq_disable();
-		ac = ac_data(searchp);
-		if (ac->touched) {
-			ac->touched = 0;
-		} else if (ac->avail) {
-			tofree = (ac->limit+4)/5;
-			if (tofree > ac->avail) {
-				tofree = (ac->avail+1)/2;
-			}
-			free_block(searchp, ac_entry(ac), tofree);
-			ac->avail -= tofree;
-			memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
-					sizeof(void*)*ac->avail);
-		}
+		drain_array(searchp, ac_data(searchp));

 		if(time_after(searchp->lists.next_reap, jiffies))
 			goto next_irqon;
@@ -2247,6 +2340,10 @@ static inline void cache_reap (void)
 			goto next_unlock;
 		}
 		searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
+
+		if (searchp->lists.shared)
+			drain_array_locked(searchp, searchp->lists.shared);
+
 		if (searchp->lists.free_touched) {
 			searchp->lists.free_touched = 0;
 			goto next_unlock;
@@ -2456,7 +2553,7 @@ struct seq_operations slabinfo_op = {
 #define MAX_SLABINFO_WRITE 128
 /**
- * slabinfo_write - SMP tuning for the slab allocator
+ * slabinfo_write - Tuning for the slab allocator
  * @file: unused
  * @buffer: user buffer
  * @count: data len
@@ -2466,7 +2563,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos)
 {
 	char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
-	int limit, batchcount, res;
+	int limit, batchcount, shared, res;
 	struct list_head *p;

 	if (count > MAX_SLABINFO_WRITE)
@@ -2480,10 +2577,8 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 		return -EINVAL;
 	*tmp = '\0';
 	tmp++;
-	limit = simple_strtol(tmp, &tmp, 10);
-	while (*tmp == ' ')
-		tmp++;
-	batchcount = simple_strtol(tmp, &tmp, 10);
+	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
+		return -EINVAL;

 	/* Find the cache in the chain of caches. */
 	down(&cache_chain_sem);
@@ -2494,10 +2589,11 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 		if (!strcmp(cachep->name, kbuf)) {
 			if (limit < 1 ||
 			    batchcount < 1 ||
-			    batchcount > limit) {
+			    batchcount > limit ||
+			    shared < 0) {
 				res = -EINVAL;
 			} else {
 				res = do_tune_cpucache(cachep, limit, batchcount, shared);
 			}
 			break;
 		}