Commit a206231b authored by Andrew Morton, committed by Linus Torvalds

[PATCH] hot-n-cold pages: page allocator core

Hot/Cold pages and zone->lock amortisation
parent 1d2652dd
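In short: each zone gains NR_CPUS per-cpu pagesets, each holding a "hot" and a "cold" list of order-0 pages. Order-0 allocations and frees are served from these lists where possible, and the lists are refilled from (and drained back to) the buddy allocator in batches, so zone->lock is taken once per batch rather than once per page. As a rough, illustrative sketch (not part of the commit) of the caller-visible pieces added below -- __GFP_COLD, free_hot_page() and free_cold_page(); the gfp masks and the wrapper function are just examples:

/* Illustrative sketch only, not from this patch. */
#include <linux/mm.h>

static void example_usage(void)
{
	struct page *cold_page, *hot_page;

	/* Data that will be filled by DMA and not touched by the CPU
	 * soon: ask for a cache-cold page. */
	cold_page = alloc_page(GFP_HIGHUSER | __GFP_COLD);

	/* Ordinary allocation: comes off the per-cpu hot list when possible. */
	hot_page = alloc_page(GFP_KERNEL);

	if (hot_page)
		free_hot_page(hot_page);	/* order-0 free onto the hot list */
	if (cold_page)
		free_cold_page(cold_page);	/* page is presumed cache-cold */
}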
@@ -17,6 +17,7 @@
 #define __GFP_IO	0x40	/* Can start low memory physical IO? */
 #define __GFP_HIGHIO	0x80	/* Can start high mem physical IO? */
 #define __GFP_FS	0x100	/* Can call down to low-level FS? */
+#define __GFP_COLD	0x200	/* Cache-cold page required */
 
 #define GFP_NOHIGHIO	( __GFP_WAIT | __GFP_IO)
 #define GFP_NOIO	( __GFP_WAIT)
@@ -32,6 +33,7 @@
 #define GFP_DMA	__GFP_DMA
 
 /*
  * There is only one page-allocator function, and two main namespaces to
  * it. The alloc_page*() variants return 'struct page *' and as such
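The comment above (cut off by the hunk) contrasts the two allocation namespaces. For illustration only, a minimal sketch of both, using long-standing kernel calls rather than anything added by this patch:

/* Illustrative only: the two namespaces the comment refers to. */
struct page *page = alloc_page(GFP_KERNEL);       /* returns a struct page * */
unsigned long addr = __get_free_page(GFP_KERNEL); /* returns a kernel virtual address */

if (page)
	__free_page(page);
if (addr)
	free_page(addr);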
@@ -77,11 +79,10 @@ extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
 #define __get_dma_pages(gfp_mask, order) \
 		__get_free_pages((gfp_mask) | GFP_DMA,(order))
 
-/*
- * There is only one 'core' page-freeing function.
- */
 extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
 extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
+extern void FASTCALL(free_hot_page(struct page *page));
+extern void FASTCALL(free_cold_page(struct page *page));
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
...
@@ -211,7 +211,6 @@ struct page {
 #define set_page_count(p,v) atomic_set(&(p)->count, v)
 
 extern void FASTCALL(__page_cache_release(struct page *));
-void FASTCALL(__free_pages_ok(struct page *page, unsigned int order));
 
 static inline void put_page(struct page *page)
 {
...
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/cache.h>
+#include <linux/threads.h>
 #include <asm/atomic.h>
 #ifdef CONFIG_DISCONTIGMEM
 #include <asm/numnodes.h>
@@ -46,6 +47,18 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+struct per_cpu_pages {
+	int count;		/* number of pages in the list */
+	int low;		/* low watermark, refill needed */
+	int high;		/* high watermark, emptying needed */
+	int batch;		/* chunk size for buddy add/remove */
+	struct list_head list;	/* the list of pages */
+};
+
+struct per_cpu_pageset {
+	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+} ____cacheline_aligned_in_smp;
+
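Presumably the intended access pattern (mirrored by free_hot_cold_page() and buffered_rmqueue() in the page_alloc.c hunks below) is one pageset per CPU per zone, with pcp[0] holding pages believed to be cache-hot and pcp[1] cache-cold. A sketch of the indexing, with the helper name being hypothetical:

/* Sketch only: how a zone's per-cpu pageset is indexed.  'cold' selects
 * pcp[1] for cache-cold traffic.  The caller must pair get_cpu() with
 * put_cpu() and touch pcp->list/pcp->count with local interrupts off. */
static struct per_cpu_pages *pcp_for(struct zone *zone, int cold)
{
	return &zone->pageset[get_cpu()].pcp[cold];
}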
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -107,6 +120,10 @@ struct zone {
 	unsigned long		wait_table_size;
 	unsigned long		wait_table_bits;
 
+	ZONE_PADDING(_pad3_)
+
+	struct per_cpu_pageset	pageset[NR_CPUS];
+
 	/*
 	 * Discontig memory support fields.
 	 */
...
@@ -10,6 +10,8 @@
  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  */
 
 #include <linux/config.h>
@@ -151,13 +153,14 @@ static inline void free_pages_check(const char *function, struct page *page)
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
  */
-static void
+static int
 free_pages_bulk(struct zone *zone, int count,
 		struct list_head *list, unsigned int order)
 {
 	unsigned long mask, flags;
 	struct free_area *area;
 	struct page *base, *page = NULL;
+	int ret = 0;
 
 	mask = (~0UL) << order;
 	base = zone->zone_mem_map;
@@ -169,8 +172,10 @@ free_pages_bulk(struct zone *zone, int count,
 		list_del(&page->list);
 		__free_pages_bulk(page, base, zone, area, mask, order);
 		mod_page_state(pgfree, count<<order);
+		ret++;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
 }
 
 void __free_pages_ok(struct page *page, unsigned int order)
@@ -201,14 +206,13 @@ expand(struct zone *zone, struct page *page,
 		index += size;
 		page += size;
 	}
-	BUG_ON(bad_range(zone, page));
 	return page;
 }
 
 /*
  * This page is about to be returned from the page allocator
  */
-static inline void prep_new_page(struct page *page)
+static void prep_new_page(struct page *page)
 {
 	if ( page->mapping ||
 		page_mapped(page) ||
@@ -248,36 +252,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 			continue;
 
 		page = list_entry(curr, struct page, list);
+		BUG_ON(bad_range(zone, page));
 		list_del(curr);
 		index = page - zone->zone_mem_map;
 		if (current_order != MAX_ORDER-1)
 			MARK_USED(index, current_order, area);
 		zone->free_pages -= 1UL << order;
-		page = expand(zone, page, index, order, current_order, area);
-		return page;
+		return expand(zone, page, index, order, current_order, area);
 	}
 	return NULL;
 }
 
-/* Obtain a single element from the buddy allocator */
-static struct page *rmqueue(struct zone *zone, unsigned int order)
-{
-	unsigned long flags;
-	struct page *page;
-
-	spin_lock_irqsave(&zone->lock, flags);
-	page = __rmqueue(zone, order);
-	spin_unlock_irqrestore(&zone->lock, flags);
-
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		prep_new_page(page);
-	}
-	return page;
-}
-
 /*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency. Add them to the supplied list.
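The body of rmqueue_bulk() itself falls outside the hunks shown here. As a rough sketch (an assumption, not the verbatim function), it presumably takes zone->lock once and calls __rmqueue() repeatedly, returning how many pages it managed to add to the supplied list:

/* Sketch only -- the real rmqueue_bulk() is not shown in this diff. */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list)
{
	unsigned long flags;
	unsigned long i;
	int allocated = 0;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	for (i = 0; i < count; i++) {
		page = __rmqueue(zone, order);
		if (page == NULL)
			break;
		allocated++;
		list_add_tail(&page->list, list);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return allocated;
}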
@@ -340,6 +325,72 @@ int is_head_of_free_region(struct page *page)
 }
 #endif /* CONFIG_SOFTWARE_SUSPEND */
+
+/*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void free_hot_cold_page(struct page *page, int cold)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pages *pcp;
+	unsigned long flags;
+
+	free_pages_check(__FUNCTION__, page);
+	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	local_irq_save(flags);
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	list_add(&page->list, &pcp->list);
+	pcp->count++;
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+void free_hot_page(struct page *page)
+{
+	free_hot_cold_page(page, 0);
+}
+
+void free_cold_page(struct page *page)
+{
+	free_hot_cold_page(page, 1);
+}
+
+static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+{
+	unsigned long flags;
+	struct page *page = NULL;
+
+	if (order == 0) {
+		struct per_cpu_pages *pcp;
+
+		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		local_irq_save(flags);
+		if (pcp->count <= pcp->low)
+			pcp->count += rmqueue_bulk(zone, 0,
+						pcp->batch, &pcp->list);
+		if (pcp->count) {
+			page = list_entry(pcp->list.next, struct page, list);
+			list_del(&page->list);
+			pcp->count--;
+		}
+		local_irq_restore(flags);
+		put_cpu();
+	}
+
+	if (page == NULL) {
+		spin_lock_irqsave(&zone->lock, flags);
+		page = __rmqueue(zone, order);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	if (page != NULL) {
+		BUG_ON(bad_range(zone, page));
+		prep_new_page(page);
+	}
+	return page;
+}
 
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
@@ -349,13 +400,18 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 {
 	unsigned long min;
 	struct zone **zones, *classzone;
-	struct page * page;
+	struct page *page;
 	int cflags;
 	int i;
+	int cold;
 
 	if (gfp_mask & __GFP_WAIT)
 		might_sleep();
 
+	cold = 0;
+	if (gfp_mask & __GFP_COLD)
+		cold = 1;
+
 	mod_page_state(pgalloc, 1<<order);
 	zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
@@ -371,7 +427,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		/* the incremental min is allegedly to discourage fallback */
 		min += z->pages_low;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -396,7 +452,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 			local_min >>= 2;
 		min += local_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -410,7 +466,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		page = rmqueue(z, order);
+		page = buffered_rmqueue(z, order, cold);
 		if (page)
 			return page;
 	}
@@ -440,7 +496,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		min += z->pages_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -492,13 +548,17 @@ void __pagevec_free(struct pagevec *pvec)
 	int i = pagevec_count(pvec);
 
 	while (--i >= 0)
-		__free_pages_ok(pvec->pages[i], 0);
+		free_hot_page(pvec->pages[i]);
 }
 
 void __free_pages(struct page *page, unsigned int order)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
-		__free_pages_ok(page, order);
+	if (!PageReserved(page) && put_page_testzero(page)) {
+		if (order == 0)
+			free_hot_page(page);
+		else
+			__free_pages_ok(page, order);
+	}
 }
 
 void free_pages(unsigned long addr, unsigned int order)
@@ -899,7 +959,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	unsigned long i, j;
 	unsigned long local_offset;
 	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-	int nid = pgdat->node_id;
+	int cpu, nid = pgdat->node_id;
 	struct page *lmem_map = pgdat->node_mem_map;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
@@ -911,13 +971,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long mask;
 		unsigned long size, realsize;
+		unsigned long batch;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
-		printk(" %s zone: %lu pages\n", zone_names[j], realsize);
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 		zone->name = zone_names[j];
@@ -925,6 +985,40 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		spin_lock_init(&zone->lru_lock);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
+
+		/*
+		 * The per-cpu-pages pools are set to around 1000th of the
+		 * size of the zone. But no more than 1/4 of a meg - there's
+		 * no point in going beyond the size of L2 cache.
+		 *
+		 * OK, so we don't know how big the cache is. So guess.
+		 */
+		batch = zone->present_pages / 1024;
+		if (batch * PAGE_SIZE > 256 * 1024)
+			batch = (256 * 1024) / PAGE_SIZE;
+		batch /= 4;		/* We effectively *= 4 below */
+		if (batch < 1)
+			batch = 1;
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		printk(" %s zone: %lu pages, LIFO batch:%lu\n",
+				zone_names[j], realsize, batch);
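To make the sizing comment above concrete, a worked example with assumed numbers (not taken from the patch): with 4 KB pages and a 512 MB zone, present_pages is 131072, so batch starts at 128; 128 pages is 512 KB, which exceeds 256 KB, so batch is clamped to 64; divided by 4 it becomes 16. The hot list then refills when it drops to low = 32 pages and is drained back to the buddy lists above high = 96; the cold list never refills (low = 0) and drains above 32. A small standalone C sketch of the same arithmetic:

/* Standalone sketch of the batch/watermark arithmetic above.
 * PAGE_SIZE and the 512MB zone size are assumptions for illustration. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long present_pages = (512UL << 20) / PAGE_SIZE;
	unsigned long batch;

	batch = present_pages / 1024;
	if (batch * PAGE_SIZE > 256 * 1024)
		batch = (256 * 1024) / PAGE_SIZE;
	batch /= 4;			/* effectively *= 4 via the watermarks */
	if (batch < 1)
		batch = 1;

	printf("batch=%lu  hot: low=%lu high=%lu  cold: low=0 high=%lu\n",
	       batch, 2 * batch, 6 * batch, 2 * batch);
	return 0;
}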
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		atomic_set(&zone->refill_counter, 0);
...
@@ -69,7 +69,8 @@ void lru_add_drain(void)
 }
 
 /*
- * This path almost never happens - pages are normally freed via pagevecs.
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs. But it gets used by networking.
  */
 void __page_cache_release(struct page *page)
 {
@@ -83,7 +84,7 @@ void __page_cache_release(struct page *page)
 		page = NULL;
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	if (page)
-		__free_pages_ok(page, 0);
+		free_hot_page(page);
 }
 
 /*
...