Commit bf3f607a authored by Andrew Morton, committed by Russell King

[PATCH] separation of direct-reclaim and kswapd functions

There is some lack of clarity in what kswapd does and what
direct-reclaim tasks do; try_to_free_pages() tries to service both
functions, and they are different.

- kswapd's role is to keep all zones on its node at

	zone->free_pages >= zone->pages_high.

  and to never stop as long as any zones do not meet that condition.

- A direct reclaimer's role is to try to free some pages from the
  zones which are suitable for this particular allocation request, and
  to return when that has been achieved, or when all the relevant zones
  are at

	zone->free_pages >= zone->pages_high.

The patch explicitly separates these two code paths; kswapd does not
run try_to_free_pages() any more.  kswapd should not be aware of zone
fallbacks.
parent fe66ad33
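To make the commit message concrete, here is a small standalone C sketch of the two roles described above. It is an illustration only, not code from this patch: the struct zone model, reclaim_some() and all the numbers are invented stand-ins, and the real per-zone work is done by shrink_zone()/shrink_cache() in the diff below.

/* Standalone model of the kswapd vs. direct-reclaim split (illustration only). */
#include <stdio.h>

#define NR_ZONES         3
#define SWAP_CLUSTER_MAX 32      /* batch size a direct reclaimer aims for */

struct zone {
        const char *name;
        long free_pages;
        long pages_high;
};

/* Pretend to reclaim up to `want' pages from one zone, never pushing it past
 * pages_high; returns the number of pages actually freed. */
static long reclaim_some(struct zone *z, long want)
{
        long gap = z->pages_high - z->free_pages;
        long got = want < gap ? want : gap;

        if (got < 0)
                got = 0;
        z->free_pages += got;
        return got;
}

/* kswapd's role: keep working until every zone on its node is at pages_high. */
static void kswapd_balance(struct zone *zones, int nr)
{
        int balanced;

        do {
                balanced = 1;
                for (int i = 0; i < nr; i++) {
                        if (zones[i].free_pages >= zones[i].pages_high)
                                continue;
                        balanced = 0;
                        reclaim_some(&zones[i],
                                     zones[i].pages_high - zones[i].free_pages);
                }
        } while (!balanced);
}

/* A direct reclaimer's role: free a batch of pages from the zones usable by
 * this allocation, returning once the batch is freed or every usable zone is
 * already at pages_high. */
static long direct_reclaim(struct zone *zones, int nr_usable)
{
        long reclaimed = 0;

        for (int i = 0; i < nr_usable; i++) {
                if (reclaimed >= SWAP_CLUSTER_MAX)
                        break;          /* freed enough for this allocation */
                reclaimed += reclaim_some(&zones[i],
                                          SWAP_CLUSTER_MAX - reclaimed);
        }
        return reclaimed;
}

int main(void)
{
        struct zone zones[NR_ZONES] = {
                { "DMA",      10, 20 },
                { "Normal",   50, 40 },
                { "HighMem",   5, 60 },
        };

        printf("direct reclaim freed %ld pages\n",
               direct_reclaim(zones, NR_ZONES));
        kswapd_balance(zones, NR_ZONES);
        for (int i = 0; i < NR_ZONES; i++)
                printf("%-7s free=%ld high=%ld\n", zones[i].name,
                       zones[i].free_pages, zones[i].pages_high);
        return 0;
}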
@@ -62,7 +62,6 @@ struct zone {
         spinlock_t              lock;
         unsigned long           free_pages;
         unsigned long           pages_min, pages_low, pages_high;
-        int                     need_balance;
 
         ZONE_PADDING(_pad1_)
@@ -346,8 +346,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
                 }
         }
 
-        classzone->need_balance = 1;
-        mb();
         /* we're somewhat low on memory, failed to find what we needed */
         for (i = 0; zones[i] != NULL; i++) {
                 struct zone *z = zones[i];
@@ -873,7 +871,6 @@ void __init free_area_init_core(pg_data_t *pgdat,
                 spin_lock_init(&zone->lru_lock);
                 zone->zone_pgdat = pgdat;
                 zone->free_pages = 0;
-                zone->need_balance = 0;
                 INIT_LIST_HEAD(&zone->active_list);
                 INIT_LIST_HEAD(&zone->inactive_list);
                 atomic_set(&zone->refill_counter, 0);
@@ -101,15 +101,19 @@ static inline int is_page_cache_freeable(struct page *page)
         return page_count(page) - !!PagePrivate(page) == 2;
 }
 
+/*
+ * shrink_list returns the number of reclaimed pages
+ */
 static /* inline */ int
-shrink_list(struct list_head *page_list, int nr_pages,
-                unsigned int gfp_mask, int *max_scan, int *nr_mapped)
+shrink_list(struct list_head *page_list, unsigned int gfp_mask,
+                int *max_scan, int *nr_mapped)
 {
         struct address_space *mapping;
         LIST_HEAD(ret_pages);
         struct pagevec freed_pvec;
-        const int nr_pages_in = nr_pages;
         int pgactivate = 0;
+        int ret = 0;
 
         pagevec_init(&freed_pvec);
         while (!list_empty(page_list)) {
@@ -295,7 +299,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
                 __put_page(page);       /* The pagecache ref */
 free_it:
                 unlock_page(page);
-                nr_pages--;
+                ret++;
                 if (!pagevec_add(&freed_pvec, page))
                         __pagevec_release_nonlru(&freed_pvec);
                 continue;
@@ -312,11 +316,11 @@ shrink_list(struct list_head *page_list, int nr_pages,
         list_splice(&ret_pages, page_list);
         if (pagevec_count(&freed_pvec))
                 __pagevec_release_nonlru(&freed_pvec);
-        mod_page_state(pgsteal, nr_pages_in - nr_pages);
+        mod_page_state(pgsteal, ret);
         if (current->flags & PF_KSWAPD)
-                mod_page_state(kswapd_steal, nr_pages_in - nr_pages);
+                mod_page_state(kswapd_steal, ret);
         mod_page_state(pgactivate, pgactivate);
-        return nr_pages;
+        return ret;
 }
 
 /*
@@ -325,18 +329,19 @@ shrink_list(struct list_head *page_list, int nr_pages,
  * not freed will be added back to the LRU.
  *
  * shrink_cache() is passed the number of pages to try to free, and returns
- * the number which are yet-to-free.
+ * the number of pages which were reclaimed.
  *
  * For pagecache intensive workloads, the first loop here is the hottest spot
  * in the kernel (apart from the copy_*_user functions).
  */
 static /* inline */ int
-shrink_cache(int nr_pages, struct zone *zone,
+shrink_cache(const int nr_pages, struct zone *zone,
                 unsigned int gfp_mask, int max_scan, int *nr_mapped)
 {
         LIST_HEAD(page_list);
         struct pagevec pvec;
         int nr_to_process;
+        int ret = 0;
 
         /*
          * Try to ensure that we free `nr_pages' pages in one pass of the loop.
@@ -349,10 +354,11 @@ shrink_cache(int nr_pages, struct zone *zone,
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
-        while (max_scan > 0 && nr_pages > 0) {
+        while (max_scan > 0 && ret < nr_pages) {
                 struct page *page;
                 int nr_taken = 0;
                 int nr_scan = 0;
+                int nr_freed;
 
                 while (nr_scan++ < nr_to_process &&
                                 !list_empty(&zone->inactive_list)) {
@@ -383,10 +389,10 @@ shrink_cache(int nr_pages, struct zone *zone,
                 max_scan -= nr_scan;
                 mod_page_state(pgscan, nr_scan);
-                nr_pages = shrink_list(&page_list, nr_pages,
-                                gfp_mask, &max_scan, nr_mapped);
-                if (nr_pages <= 0 && list_empty(&page_list))
+                nr_freed = shrink_list(&page_list, gfp_mask,
+                                &max_scan, nr_mapped);
+                ret += nr_freed;
+                if (nr_freed <= 0 && list_empty(&page_list))
                         goto done;
                 spin_lock_irq(&zone->lru_lock);
@@ -412,7 +418,7 @@ shrink_cache(int nr_pages, struct zone *zone,
         spin_unlock_irq(&zone->lru_lock);
 done:
         pagevec_release(&pvec);
-        return nr_pages;
+        return ret;
 }
 
 /*
@@ -533,9 +539,14 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
         mod_page_state(pgdeactivate, pgdeactivate);
 }
 
+/*
+ * Try to reclaim `nr_pages' from this zone. Returns the number of reclaimed
+ * pages. This is a basic per-zone page freer. Used by both kswapd and
+ * direct reclaim.
+ */
 static /* inline */ int
-shrink_zone(struct zone *zone, int max_scan,
-                unsigned int gfp_mask, int nr_pages, int *nr_mapped)
+shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
+                const int nr_pages, int *nr_mapped)
 {
         unsigned long ratio;
@@ -556,36 +567,60 @@ shrink_zone(struct zone *zone, int max_scan,
                 atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
                 refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
         }
-        nr_pages = shrink_cache(nr_pages, zone, gfp_mask,
-                                max_scan, nr_mapped);
-        return nr_pages;
+        return shrink_cache(nr_pages, zone, gfp_mask, max_scan, nr_mapped);
+}
+
+/*
+ * FIXME: don't do this for ZONE_HIGHMEM
+ */
+/*
+ * Here we assume it costs one seek to replace a lru page and that it also
+ * takes a seek to recreate a cache object. With this in mind we age equal
+ * percentages of the lru and ageable caches. This should balance the seeks
+ * generated by these structures.
+ *
+ * NOTE: for now I do this for all zones. If we find this is too aggressive
+ * on large boxes we may want to exclude ZONE_HIGHMEM.
+ *
+ * If we're encountering mapped pages on the LRU then increase the pressure on
+ * slab to avoid swapping.
+ */
+static void shrink_slab(int total_scanned, int gfp_mask)
+{
+        int shrink_ratio;
+        int pages = nr_used_zone_pages();
+
+        shrink_ratio = (pages / (total_scanned + 1)) + 1;
+        shrink_dcache_memory(shrink_ratio, gfp_mask);
+        shrink_icache_memory(shrink_ratio, gfp_mask);
+        shrink_dqcache_memory(shrink_ratio, gfp_mask);
 }
 
+/*
+ * This is the direct reclaim path, for page-allocating processes. We only
+ * try to reclaim pages from zones which will satisfy the caller's allocation
+ * request.
+ */
 static int
 shrink_caches(struct zone *classzone, int priority,
-                int *total_scanned, int gfp_mask, int nr_pages)
+                int *total_scanned, int gfp_mask, const int nr_pages)
 {
         struct zone *first_classzone;
         struct zone *zone;
-        int ratio;
         int nr_mapped = 0;
-        int pages = nr_used_zone_pages();
+        int ret = 0;
 
         first_classzone = classzone->zone_pgdat->node_zones;
         for (zone = classzone; zone >= first_classzone; zone--) {
                 int max_scan;
                 int to_reclaim;
-                int unreclaimed;
 
                 to_reclaim = zone->pages_high - zone->free_pages;
                 if (to_reclaim < 0)
                         continue;       /* zone has enough memory */
-                if (to_reclaim > SWAP_CLUSTER_MAX)
-                        to_reclaim = SWAP_CLUSTER_MAX;
-                if (to_reclaim < nr_pages)
-                        to_reclaim = nr_pages;
+                to_reclaim = min(to_reclaim, SWAP_CLUSTER_MAX);
+                to_reclaim = max(to_reclaim, nr_pages);
 
                 /*
                  * If we cannot reclaim `nr_pages' pages by scanning twice
@@ -594,33 +629,18 @@ shrink_caches(struct zone *classzone, int priority,
                 max_scan = zone->nr_inactive >> priority;
                 if (max_scan < to_reclaim * 2)
                         max_scan = to_reclaim * 2;
-                unreclaimed = shrink_zone(zone, max_scan,
-                                gfp_mask, to_reclaim, &nr_mapped);
-                nr_pages -= to_reclaim - unreclaimed;
+                ret += shrink_zone(zone, max_scan, gfp_mask,
+                                to_reclaim, &nr_mapped);
                 *total_scanned += max_scan;
+                *total_scanned += nr_mapped;
+                if (ret >= nr_pages)
+                        break;
         }
-
-        /*
-         * Here we assume it costs one seek to replace a lru page and that
-         * it also takes a seek to recreate a cache object. With this in
-         * mind we age equal percentages of the lru and ageable caches.
-         * This should balance the seeks generated by these structures.
-         *
-         * NOTE: for now I do this for all zones. If we find this is too
-         * aggressive on large boxes we may want to exclude ZONE_HIGHMEM
-         *
-         * If we're encountering mapped pages on the LRU then increase the
-         * pressure on slab to avoid swapping.
-         */
-        ratio = (pages / (*total_scanned + nr_mapped + 1)) + 1;
-        shrink_dcache_memory(ratio, gfp_mask);
-        shrink_icache_memory(ratio, gfp_mask);
-        shrink_dqcache_memory(ratio, gfp_mask);
-        return nr_pages;
+        return ret;
 }
 
 /*
- * This is the main entry point to page reclaim.
+ * This is the main entry point to direct page reclaim.
  *
  * If a full scan of the inactive list fails to free enough memory then we
  * are "out of memory" and something needs to be killed.
@@ -640,17 +660,18 @@ int
 try_to_free_pages(struct zone *classzone,
                 unsigned int gfp_mask, unsigned int order)
 {
-        int priority = DEF_PRIORITY;
-        int nr_pages = SWAP_CLUSTER_MAX;
+        int priority;
+        const int nr_pages = SWAP_CLUSTER_MAX;
+        int nr_reclaimed = 0;
 
         inc_page_state(pageoutrun);
 
         for (priority = DEF_PRIORITY; priority; priority--) {
                 int total_scanned = 0;
 
-                nr_pages = shrink_caches(classzone, priority, &total_scanned,
-                                        gfp_mask, nr_pages);
-                if (nr_pages <= 0)
+                nr_reclaimed += shrink_caches(classzone, priority,
+                                        &total_scanned, gfp_mask, nr_pages);
+                if (nr_reclaimed >= nr_pages)
                         return 1;
                 if (total_scanned == 0)
                         return 1;       /* All zones had enough free memory */
@@ -665,62 +686,46 @@ try_to_free_pages(struct zone *classzone,
                 /* Take a nap, wait for some writeback to complete */
                 blk_congestion_wait(WRITE, HZ/4);
+                shrink_slab(total_scanned, gfp_mask);
         }
         if (gfp_mask & __GFP_FS)
                 out_of_memory();
         return 0;
 }
 
-static int check_classzone_need_balance(struct zone *classzone)
-{
-        struct zone *first_classzone;
-
-        first_classzone = classzone->zone_pgdat->node_zones;
-        while (classzone >= first_classzone) {
-                if (classzone->free_pages > classzone->pages_high)
-                        return 0;
-                classzone--;
-        }
-        return 1;
-}
-
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
-{
-        int need_more_balance = 0, i;
-        struct zone *zone;
-
-        for (i = pgdat->nr_zones-1; i >= 0; i--) {
-                zone = pgdat->node_zones + i;
-                cond_resched();
-                if (!zone->need_balance)
-                        continue;
-                if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
-                        zone->need_balance = 0;
-                        __set_current_state(TASK_INTERRUPTIBLE);
-                        schedule_timeout(HZ);
-                        continue;
-                }
-                if (check_classzone_need_balance(zone))
-                        need_more_balance = 1;
-                else
-                        zone->need_balance = 0;
-        }
-        return need_more_balance;
-}
-
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-        struct zone *zone;
-        int i;
-
-        for (i = pgdat->nr_zones-1; i >= 0; i--) {
-                zone = pgdat->node_zones + i;
-                if (zone->need_balance)
-                        return 0;
-        }
-        return 1;
-}
+/*
+ * kswapd will work across all this node's zones until they are all at
+ * pages_high.
+ */
+static void kswapd_balance_pgdat(pg_data_t *pgdat)
+{
+        int priority = DEF_PRIORITY;
+        int i;
+
+        for (priority = DEF_PRIORITY; priority; priority--) {
+                int success = 1;
+
+                for (i = 0; i < pgdat->nr_zones; i++) {
+                        struct zone *zone = pgdat->node_zones + i;
+                        int nr_mapped = 0;
+                        int max_scan;
+                        int to_reclaim;
+
+                        to_reclaim = zone->pages_high - zone->free_pages;
+                        if (to_reclaim <= 0)
+                                continue;
+                        success = 0;
+                        max_scan = zone->nr_inactive >> priority;
+                        if (max_scan < to_reclaim * 2)
+                                max_scan = to_reclaim * 2;
+                        shrink_zone(zone, max_scan, GFP_KSWAPD,
+                                        to_reclaim, &nr_mapped);
+                        shrink_slab(max_scan + nr_mapped, GFP_KSWAPD);
+                }
+                if (success)
+                        break;  /* All zones are at pages_high */
+                blk_congestion_wait(WRITE, HZ/4);
+        }
+}
 
 /*
@@ -740,7 +745,7 @@ int kswapd(void *p)
 {
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
-        DECLARE_WAITQUEUE(wait, tsk);
+        DEFINE_WAIT(wait);
 
         daemonize();
         set_cpus_allowed(tsk, __node_to_cpu_mask(pgdat->node_id));
@@ -761,27 +766,12 @@ int kswapd(void *p)
          */
         tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
 
-        /*
-         * Kswapd main loop.
-         */
-        for (;;) {
+        for ( ; ; ) {
                 if (current->flags & PF_FREEZE)
                         refrigerator(PF_IOTHREAD);
-                __set_current_state(TASK_INTERRUPTIBLE);
-                add_wait_queue(&pgdat->kswapd_wait, &wait);
-                mb();
-                if (kswapd_can_sleep_pgdat(pgdat))
-                        schedule();
-                __set_current_state(TASK_RUNNING);
-                remove_wait_queue(&pgdat->kswapd_wait, &wait);
-
-                /*
-                 * If we actually get into a low-memory situation,
-                 * the processes needing more memory will wake us
-                 * up on a more timely basis.
-                 */
+                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+                schedule();
+                finish_wait(&pgdat->kswapd_wait, &wait);
+
                 kswapd_balance_pgdat(pgdat);
                 blk_run_queues();
         }