Commit af70f767 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Fix page allocator lower zone protection for NUMA

From: Martin Hicks <mort@wildopensource.com>

This changes __alloc_pages() so that it uses precalculated values for "min".
This should prevent the problem of "min" incrementing from zone to zone across
many nodes on a NUMA machine.  With the old incremental min calculation,
falling back to other nodes caused the min value to become very large.
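
As a rough illustration (the numbers are invented, not from the patch): each
pass of the old fallback loop added about
pages_low + pages_low * sysctl_lower_zone_protection to "min" for every zone
it fell back past, so with pages_low = 1000 and lower_zone_protection = 10
the threshold grew by roughly 11000 pages per fallback zone.  With this patch
the threshold for any zone z is simply
(1 << order) + z->protection[alloc_type], independent of how many zones were
tried before it.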
parent 7860b371
@@ -54,6 +54,15 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
} ____cacheline_aligned_in_smp;
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
#define GFP_ZONEMASK 0x03
/*
* On machines where it is needed (eg PCs) we divide physical memory
* into multiple physical zones. On a PC we have 3 zones:
@@ -70,6 +79,19 @@ struct zone {
spinlock_t lock;
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
/*
* protection[] is a pre-calculated number of extra pages that must be
* available in a zone in order for __alloc_pages() to allocate memory
* from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
* be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
* for us to choose to allocate the page from that zone.
*
* It uses both min_free_kbytes and sysctl_lower_zone_protection.
* The protection values are recalculated if either of these values
* change. The array elements are in zonelist order:
* [0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
*/
unsigned long protection[MAX_NR_ZONES];
ZONE_PADDING(_pad1_)
@@ -157,14 +179,6 @@ struct zone {
unsigned long present_pages; /* amount of memory (excluding holes) */
} ____cacheline_maxaligned_in_smp;
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
#define GFP_ZONEMASK 0x03
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
@@ -228,6 +242,11 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive,
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone);
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
/**
* for_each_pgdat - helper macro to iterate over all nodes
* @pgdat - pointer to a pg_data_t variable
@@ -300,6 +319,8 @@ struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *);
int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *);
#include <linux/topology.h>
/* Returns the number of the current Node. */
@@ -722,7 +722,7 @@ static ctl_table vm_table[] = {
.data = &sysctl_lower_zone_protection,
.maxlen = sizeof(sysctl_lower_zone_protection),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.proc_handler = &lower_zone_protection_sysctl_handler,
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
@@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
struct task_struct *p = current;
int i;
int cold;
int alloc_type;
int do_retry;
might_sleep_if(wait);
@@ -564,20 +565,20 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (zones[0] == NULL) /* no zones in the zonelist */
return NULL;
alloc_type = zone_idx(zones[0]);
/* Go through the zonelist once, looking for a zone with enough free */
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
unsigned long local_low;
min = (1<<order) + z->protection[alloc_type];
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
* We let real-time tasks dip their real-time paws a little
* deeper into reserves.
*/
local_low = z->pages_low;
if (rt_task(p))
local_low >>= 1;
min += local_low;
min -= z->pages_low >> 1;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
@@ -585,7 +586,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (page)
goto got_pg;
}
min += z->pages_low * sysctl_lower_zone_protection;
}
/* we're somewhat low on memory, failed to find what we needed */
@@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
wakeup_kswapd(zones[i]);
/* Go through the zonelist again, taking __GFP_HIGH into account */
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
unsigned long local_min;
struct zone *z = zones[i];
local_min = z->pages_min;
min = (1<<order) + z->protection[alloc_type];
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
min -= z->pages_low >> 2;
if (rt_task(p))
local_min >>= 1;
min += local_min;
min -= z->pages_low >> 1;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
}
min += local_min * sysctl_lower_zone_protection;
}
/* here we're in the low on memory slow path */
@@ -642,18 +640,17 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
p->flags &= ~PF_MEMALLOC;
/* go through the zonelist yet one more time */
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
min += z->pages_min;
min = (1UL << order) + z->protection[alloc_type];
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
}
min += z->pages_low * sysctl_lower_zone_protection;
}
/*
@@ -1056,6 +1053,8 @@ void show_free_areas(void)
ps.nr_page_table_pages);
for_each_zone(zone) {
int i;
show_node(zone);
printk("%s"
" free:%lukB"
@@ -1075,6 +1074,10 @@ void show_free_areas(void)
K(zone->nr_inactive),
K(zone->present_pages)
);
printk("protections[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %lu", zone->protection[i]);
printk("\n");
}
for_each_zone(zone) {
@@ -1744,6 +1747,93 @@ void __init page_alloc_init(void)
hotcpu_notifier(page_alloc_cpu_notify, 0);
}
static unsigned long higherzone_val(struct zone *z, int max_zone,
int alloc_type)
{
int z_idx = zone_idx(z);
struct zone *higherzone;
unsigned long pages;
/* there is no higher zone to get a contribution from */
if (z_idx == MAX_NR_ZONES-1)
return 0;
higherzone = &z->zone_pgdat->node_zones[z_idx+1];
/* We always start with the higher zone's protection value */
pages = higherzone->protection[alloc_type];
/*
* We get a lower-zone-protection contribution only if there are
* pages in the higher zone and if we're not the highest zone
* in the current zonelist.  This never happens for GFP_DMA; it happens
* only for ZONE_DMA in a GFP_KERNEL allocation, and for ZONE_DMA and
* ZONE_NORMAL in a GFP_HIGHMEM allocation.
*/
if (higherzone->present_pages && z_idx < alloc_type)
pages += higherzone->pages_low * sysctl_lower_zone_protection;
return pages;
}
/*
* setup_per_zone_protection - called whenever min_free_kbytes or
* sysctl_lower_zone_protection changes. Ensures that each zone
* has a correct pages_protected value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*
* This algorithm is way confusing.  It tries to keep the same behavior
* as we had with the incremental min iterative algorithm.
*/
static void setup_per_zone_protection(void)
{
struct pglist_data *pgdat;
struct zone *zones, *zone;
int max_zone;
int i, j;
for_each_pgdat(pgdat) {
zones = pgdat->node_zones;
for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
if (zones[i].present_pages)
max_zone = i;
/*
* For each of the different allocation types:
* GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
/*
* For each of the zones:
* ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
*/
for (j = MAX_NR_ZONES-1; j >= 0; j--) {
zone = &zones[j];
/*
* We never protect zones that don't have memory
* in them (j>max_zone) or zones that aren't in
* the zonelists for a certain type of
* allocation (j>i). We have to assign these to
* zero because the lower zones take
* contributions from the higher zones.
*/
if (j > max_zone || j > i) {
zone->protection[i] = 0;
continue;
}
/*
* The contribution of the next higher zone
*/
zone->protection[i] = higherzone_val(zone,
max_zone, i);
zone->protection[i] += zone->pages_low;
}
}
}
}
/*
* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
* that the pages_{min,low,high} values for each zone are set correctly
@@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void)
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone)
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->present_pages;
}
for_each_zone(zone) {
spin_lock_irqsave(&zone->lru_lock, flags);
@@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void)
if (min_free_kbytes > 16384)
min_free_kbytes = 16384;
setup_per_zone_pages_min();
setup_per_zone_protection();
return 0;
}
module_init(init_per_zone_pages_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call setup_per_zone_pages_min() whenever min_free_kbytes
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
@@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
{
proc_dointvec(table, write, file, buffer, length);
setup_per_zone_pages_min();
setup_per_zone_protection();
return 0;
}
/*
* lower_zone_protection_sysctl_handler - just a wrapper around
* proc_dointvec_minmax() so that we can call setup_per_zone_protection()
* whenever sysctl_lower_zone_protection changes.
*/
int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length)
{
proc_dointvec_minmax(table, write, file, buffer, length);
setup_per_zone_protection();
return 0;
}
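
For reference, below is a small userspace model (illustrative only, not part
of the patch; the zone sizes, pages_low values and the sysctl setting are
assumptions) of how higherzone_val() and setup_per_zone_protection() above
derive the per-zone protection[] arrays, and of the resulting watermark for a
GFP_KERNEL allocation:

/*
 * Illustrative userspace model of the new protection[] calculation.
 * It mirrors higherzone_val() and setup_per_zone_protection() from the
 * patch above; all numbers below are made up for the example.
 */
#include <stdio.h>

#define MAX_NR_ZONES 3		/* ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM */

struct zone {
	unsigned long present_pages;
	unsigned long pages_low;
	unsigned long protection[MAX_NR_ZONES];
};

static unsigned long sysctl_lower_zone_protection = 4;	/* assumed setting */

/* Contribution taken from the next-higher zone, as in higherzone_val() */
static unsigned long higherzone_val(struct zone *zones, int z_idx, int alloc_type)
{
	struct zone *higher;
	unsigned long pages;

	if (z_idx == MAX_NR_ZONES - 1)
		return 0;		/* no higher zone to contribute */
	higher = &zones[z_idx + 1];
	pages = higher->protection[alloc_type];
	if (higher->present_pages && z_idx < alloc_type)
		pages += higher->pages_low * sysctl_lower_zone_protection;
	return pages;
}

int main(void)
{
	/* One node; sizes in 4kB pages: 16MB DMA, 800MB normal, 200MB highmem */
	struct zone zones[MAX_NR_ZONES] = {
		{ 4096,   32,  { 0 } },		/* ZONE_DMA */
		{ 204800, 512, { 0 } },		/* ZONE_NORMAL */
		{ 51200,  128, { 0 } },		/* ZONE_HIGHMEM */
	};
	int max_zone = MAX_NR_ZONES - 1;
	int i, j, order = 0;

	/*
	 * Same double loop as setup_per_zone_protection(): for each
	 * allocation class, walk the zones from highest to lowest so a
	 * lower zone can pick up the higher zone's contribution.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		for (j = MAX_NR_ZONES - 1; j >= 0; j--) {
			if (j > max_zone || j > i) {
				zones[j].protection[i] = 0;
				continue;
			}
			zones[j].protection[i] =
				higherzone_val(zones, j, i) + zones[j].pages_low;
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++)
		printf("zone %d: protection[] = { %lu, %lu, %lu }, "
		       "min for an order-%d GFP_KERNEL alloc = %lu\n",
		       j, zones[j].protection[0], zones[j].protection[1],
		       zones[j].protection[2], order,
		       (1UL << order) + zones[j].protection[1]); /* [1]==GFP_KERNEL */
	return 0;
}

With these example numbers, ZONE_NORMAL protects only its own
pages_low = 512 pages against a GFP_KERNEL request, while ZONE_DMA protects
512 * 4 + 512 + 32 = 2592 pages, so GFP_KERNEL allocations keep their hands
off most of the small DMA zone.  At run time the multiplier would be tuned
through the new sysctl handler, e.g.
echo 4 > /proc/sys/vm/lower_zone_protection (the proc name is not visible in
the vm_table hunk above and is assumed here), after which the recalculated
protection[] values appear in the show_free_areas() output.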