Commit ac12db05 authored by Nick Piggin, committed by Linus Torvalds

[PATCH] vm: alloc_pages watermark fixes

Previously the ->protection[] logic was broken: it was difficult to follow and
did not use the asynchronous reclaim watermarks (pages_min, pages_low,
pages_high) properly.

Now ->protection is used *only* for lower-zone protection.  The allocator
explicitly uses the ->pages_low and ->pages_min watermarks and adds
->protection on top of them, instead of trying to use ->protection for
everything.

Pages are allocated down to (->pages_low + ->protection); once this watermark
is reached, kswapd (background reclaim) is woken.  After that, allocation can
continue down to (->pages_min + ->protection) without blocking; the memory
below pages_min is reserved for __GFP_HIGH and PF_MEMALLOC allocations.
kswapd attempts to reclaim memory until ->pages_high is reached.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 0d761325
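To make the new policy concrete, here is a minimal, self-contained sketch of the watermark arithmetic described in the message above.  The names (zone_sketch, can_alloc_fast_path, can_alloc_low_path) are invented for illustration only; the real logic operates on struct zone inside __alloc_pages, shown in the diff below, and additionally lets realtime tasks dip somewhat deeper into the reserve.

/* Illustrative sketch only: simplified stand-in for struct zone. */
#include <stdbool.h>

struct zone_sketch {
	unsigned long free_pages;
	unsigned long pages_min;   /* below this: only __GFP_HIGH / PF_MEMALLOC */
	unsigned long pages_low;   /* below this: kswapd is woken */
	unsigned long pages_high;  /* kswapd reclaims until this is reached */
	unsigned long protection;  /* lower-zone protection for this alloc type */
};

/* First pass: allocate only while free pages stay above pages_low + protection. */
static bool can_alloc_fast_path(const struct zone_sketch *z, unsigned int order)
{
	unsigned long min = z->pages_low + (1UL << order) + z->protection;
	return z->free_pages >= min;
}

/*
 * After kswapd has been woken: allow allocation down to pages_min + protection
 * without blocking; __GFP_HIGH callers may dip halfway into the pages_min
 * reserve.
 */
static bool can_alloc_low_path(const struct zone_sketch *z, unsigned int order,
			       bool gfp_high)
{
	unsigned long min = z->pages_min;

	if (gfp_high)
		min -= min >> 1;
	min += (1UL << order) + z->protection;
	return z->free_pages >= min;
}

The key point is that ->protection is now added on top of the relevant watermark rather than standing in for it.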
@@ -600,7 +600,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 {
 	const int wait = gfp_mask & __GFP_WAIT;
 	unsigned long min;
-	struct zone **zones;
+	struct zone **zones, *z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
@@ -611,72 +611,56 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	might_sleep_if(wait);
 
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-	if (zones[0] == NULL)     /* no zones in the zonelist */
+
+	if (unlikely(zones[0] == NULL)) {
+		/* Should this ever happen?? */
 		return NULL;
+	}
 
 	alloc_type = zone_idx(zones[0]);
 
 	/* Go through the zonelist once, looking for a zone with enough free */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min = (1<<order) + z->protection[alloc_type];
-
-		/*
-		 * We let real-time tasks dip their real-time paws a little
-		 * deeper into reserves.
-		 */
-		if (rt_task(p))
-			min -= z->pages_low >> 1;
-
-		if (z->free_pages >= min ||
-				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z->pages_low + (1<<order) + z->protection[alloc_type];
+
+		if (z->free_pages < min)
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
-	/* we're somewhat low on memory, failed to find what we needed */
-	for (i = 0; zones[i] != NULL; i++)
-		wakeup_kswapd(zones[i]);
-
-	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min = (1<<order) + z->protection[alloc_type];
-
+	for (i = 0; (z = zones[i]) != NULL; i++)
+		wakeup_kswapd(z);
+
+	/*
+	 * Go through the zonelist again. Let __GFP_HIGH and allocations
+	 * coming from realtime tasks to go deeper into reserves
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
-			min -= z->pages_low >> 2;
-		if (rt_task(p))
-			min -= z->pages_low >> 1;
+			min -= min>>1;
+		if (unlikely(rt_task(p)) && !in_interrupt())
+			min -= min>>2;
+		min += (1<<order) + z->protection[alloc_type];
 
-		if (z->free_pages >= min ||
-				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+		if (z->free_pages < min)
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
-	/* here we're in the low on memory slow path */
-
-rebalance:
+	/* This allocation should allow future memory freeing. */
 	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
-		for (i = 0; zones[i] != NULL; i++) {
-			struct zone *z = zones[i];
-
+		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
+			if (page)
 				goto got_pg;
-			}
 		}
 		goto nopage;
 	}
@@ -685,6 +669,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (!wait)
 		goto nopage;
 
+rebalance:
+	/* We now go into synchronous reclaim */
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
@@ -695,27 +681,28 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min = (1UL << order) + z->protection[alloc_type];
-
-		if (z->free_pages >= min ||
-				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z->pages_min;
+		if (gfp_mask & __GFP_HIGH)
+			min -= min>>1;
+		if (unlikely(rt_task(p)) && !in_interrupt())
+			min -= min>>2;
+		min += (1<<order) + z->protection[alloc_type];
+
+		if (z->free_pages < min)
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
 	/*
 	 * Don't let big-order allocations loop unless the caller explicitly
 	 * requests that.  Wait for some write requests to complete then retry.
 	 *
-	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
-	 * may not be true in other implementations.
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * <= 3, but that may not be true in other implementations.
 	 */
 	do_retry = 0;
 	if (!(gfp_mask & __GFP_NORETRY)) {
@@ -738,6 +725,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	}
 	return NULL;
 got_pg:
+	zone_statistics(zonelist, z);
 	kernel_map_pages(page, 1 << order, 1);
 	return page;
 }
@@ -1857,11 +1845,11 @@ static void setup_per_zone_protection(void)
 				 * We never protect zones that don't have memory
 				 * in them (j>max_zone) or zones that aren't in
 				 * the zonelists for a certain type of
-				 * allocation (j>i).  We have to assign these to
-				 * zero because the lower zones take
+				 * allocation (j>=i).  We have to assign these
+				 * to zero because the lower zones take
 				 * contributions from the higher zones.
 				 */
-				if (j > max_zone || j > i) {
+				if (j > max_zone || j >= i) {
 					zone->protection[i] = 0;
 					continue;
 				}
@@ -1870,7 +1858,6 @@ static void setup_per_zone_protection(void)
 				 */
 				zone->protection[i] = higherzone_val(zone,
 								max_zone, i);
-				zone->protection[i] += zone->pages_low;
 			}
 		}
 	}