Commit a8382cf1, authored by Andrew Morton and committed by Linus Torvalds

[PATCH] per-zone LRU locking

Now the LRUs are per-zone, make their lock per-zone as well.

In this patch the per-zone lock shares a cacheline with the zone's
buddy list lock, which is very bad.  Some groundwork is needed to fix
this well.

This change is expected to be a significant win on NUMA, where most
page allocation comes from the local node's zones.

For NUMA the `struct zone` itself should really be placed in that
node's memory, which is something the platform owners should look at.
However, the internode cache will help here.

Per-node kswapd would make heaps of sense too.
parent e6f0e61d
...@@ -157,7 +157,7 @@ struct page { ...@@ -157,7 +157,7 @@ struct page {
struct address_space *mapping; /* The inode (or ...) we belong to. */ struct address_space *mapping; /* The inode (or ...) we belong to. */
unsigned long index; /* Our offset within mapping. */ unsigned long index; /* Our offset within mapping. */
struct list_head lru; /* Pageout list, eg. active_list; struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */ protected by zone->lru_lock !! */
union { union {
struct pte_chain * chain; /* Reverse pte mapping pointer. struct pte_chain * chain; /* Reverse pte mapping pointer.
* protected by PG_chainlock */ * protected by PG_chainlock */
......
...@@ -44,6 +44,7 @@ struct zone { ...@@ -44,6 +44,7 @@ struct zone {
unsigned long pages_min, pages_low, pages_high; unsigned long pages_min, pages_low, pages_high;
int need_balance; int need_balance;
spinlock_t lru_lock;
struct list_head active_list; struct list_head active_list;
struct list_head inactive_list; struct list_head inactive_list;
atomic_t refill_counter; atomic_t refill_counter;
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
* *
* Note that the referenced bit, the page->lru list_head and the active, * Note that the referenced bit, the page->lru list_head and the active,
* inactive_dirty and inactive_clean lists are protected by the * inactive_dirty and inactive_clean lists are protected by the
* pagemap_lru_lock, and *NOT* by the usual PG_locked bit! * zone->lru_lock, and *NOT* by the usual PG_locked bit!
* *
* PG_error is set to indicate that an I/O error occurred on this page. * PG_error is set to indicate that an I/O error occurred on this page.
* *
......
...@@ -209,8 +209,6 @@ extern struct swap_list_t swap_list; ...@@ -209,8 +209,6 @@ extern struct swap_list_t swap_list;
asmlinkage long sys_swapoff(const char *); asmlinkage long sys_swapoff(const char *);
asmlinkage long sys_swapon(const char *, int); asmlinkage long sys_swapon(const char *, int);
extern spinlock_t _pagemap_lru_lock;
extern void FASTCALL(mark_page_accessed(struct page *)); extern void FASTCALL(mark_page_accessed(struct page *));
extern spinlock_t swaplock; extern spinlock_t swaplock;
......
...@@ -61,7 +61,6 @@ ...@@ -61,7 +61,6 @@
* ->inode_lock (__mark_inode_dirty) * ->inode_lock (__mark_inode_dirty)
* ->sb_lock (fs/fs-writeback.c) * ->sb_lock (fs/fs-writeback.c)
*/ */
spinlock_t _pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/* /*
* Remove a page from the page cache and free it. Caller has to make * Remove a page from the page cache and free it. Caller has to make
......
...@@ -828,7 +828,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, ...@@ -828,7 +828,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
printk("zone(%lu): %lu pages.\n", j, size); printk("zone(%lu): %lu pages.\n", j, size);
zone->size = size; zone->size = size;
zone->name = zone_names[j]; zone->name = zone_names[j];
zone->lock = SPIN_LOCK_UNLOCKED; spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat; zone->zone_pgdat = pgdat;
zone->free_pages = 0; zone->free_pages = 0;
zone->need_balance = 0; zone->need_balance = 0;
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
/* /*
* Locking: * Locking:
* - the page->pte.chain is protected by the PG_chainlock bit, * - the page->pte.chain is protected by the PG_chainlock bit,
* which nests within the pagemap_lru_lock, then the * which nests within the zone->lru_lock, then the
* mm->page_table_lock, and then the page lock. * mm->page_table_lock, and then the page lock.
* - because swapout locking is opposite to the locking order * - because swapout locking is opposite to the locking order
* in the page fault path, the swapout path uses trylocks * in the page fault path, the swapout path uses trylocks
...@@ -260,7 +260,7 @@ void page_remove_rmap(struct page * page, pte_t * ptep) ...@@ -260,7 +260,7 @@ void page_remove_rmap(struct page * page, pte_t * ptep)
* table entry mapping a page. Because locking order here is opposite * table entry mapping a page. Because locking order here is opposite
* to the locking order used by the page fault path, we use trylocks. * to the locking order used by the page fault path, we use trylocks.
* Locking: * Locking:
* pagemap_lru_lock page_launder() * zone->lru_lock page_launder()
* page lock page_launder(), trylock * page lock page_launder(), trylock
* pte_chain_lock page_launder() * pte_chain_lock page_launder()
* mm->page_table_lock try_to_unmap_one(), trylock * mm->page_table_lock try_to_unmap_one(), trylock
...@@ -328,7 +328,7 @@ static int try_to_unmap_one(struct page * page, pte_t * ptep) ...@@ -328,7 +328,7 @@ static int try_to_unmap_one(struct page * page, pte_t * ptep)
* @page: the page to get unmapped * @page: the page to get unmapped
* *
* Tries to remove all the page table entries which are mapping this * Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold pagemap_lru_lock * page, used in the pageout path. Caller must hold zone->lru_lock
* and the page lock. Return values are: * and the page lock. Return values are:
* *
* SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_SUCCESS - we succeeded in removing all mappings
......
...@@ -26,26 +26,20 @@ ...@@ -26,26 +26,20 @@
int page_cluster; int page_cluster;
/* /*
* Move an inactive page to the active list. * FIXME: speed this up?
*/ */
static inline void activate_page_nolock(struct page * page) void activate_page(struct page *page)
{ {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page)) { if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(page); del_page_from_inactive_list(zone, page);
SetPageActive(page); SetPageActive(page);
add_page_to_active_list(page); add_page_to_active_list(zone, page);
KERNEL_STAT_INC(pgactivate); KERNEL_STAT_INC(pgactivate);
} }
} spin_unlock_irq(&zone->lru_lock);
/*
* FIXME: speed this up?
*/
void activate_page(struct page * page)
{
spin_lock_irq(&_pagemap_lru_lock);
activate_page_nolock(page);
spin_unlock_irq(&_pagemap_lru_lock);
} }
/** /**
...@@ -79,13 +73,14 @@ void lru_add_drain(void) ...@@ -79,13 +73,14 @@ void lru_add_drain(void)
void __page_cache_release(struct page *page) void __page_cache_release(struct page *page)
{ {
unsigned long flags; unsigned long flags;
struct zone *zone = page_zone(page);
spin_lock_irqsave(&_pagemap_lru_lock, flags); spin_lock_irqsave(&zone->lru_lock, flags);
if (TestClearPageLRU(page)) if (TestClearPageLRU(page))
del_page_from_lru(page); del_page_from_lru(zone, page);
if (page_count(page) != 0) if (page_count(page) != 0)
page = NULL; page = NULL;
spin_unlock_irqrestore(&_pagemap_lru_lock, flags); spin_unlock_irqrestore(&zone->lru_lock, flags);
if (page) if (page)
__free_pages_ok(page, 0); __free_pages_ok(page, 0);
} }
...@@ -95,7 +90,7 @@ void __page_cache_release(struct page *page) ...@@ -95,7 +90,7 @@ void __page_cache_release(struct page *page)
* pagevec's pages. If it fell to zero then remove the page from the LRU and * pagevec's pages. If it fell to zero then remove the page from the LRU and
* free it. * free it.
* *
* Avoid taking pagemap_lru_lock if possible, but if it is taken, retain it * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
* for the remainder of the operation. * for the remainder of the operation.
* *
* The locking in this function is against shrink_cache(): we recheck the * The locking in this function is against shrink_cache(): we recheck the
...@@ -107,28 +102,31 @@ void __page_cache_release(struct page *page) ...@@ -107,28 +102,31 @@ void __page_cache_release(struct page *page)
void __pagevec_release(struct pagevec *pvec) void __pagevec_release(struct pagevec *pvec)
{ {
int i; int i;
int lock_held = 0;
struct pagevec pages_to_free; struct pagevec pages_to_free;
struct zone *zone = NULL;
pagevec_init(&pages_to_free); pagevec_init(&pages_to_free);
for (i = 0; i < pagevec_count(pvec); i++) { for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i]; struct page *page = pvec->pages[i];
struct zone *pagezone;
if (PageReserved(page) || !put_page_testzero(page)) if (PageReserved(page) || !put_page_testzero(page))
continue; continue;
if (!lock_held) { pagezone = page_zone(page);
spin_lock_irq(&_pagemap_lru_lock); if (pagezone != zone) {
lock_held = 1; if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
} }
if (TestClearPageLRU(page)) if (TestClearPageLRU(page))
del_page_from_lru(page); del_page_from_lru(zone, page);
if (page_count(page) == 0) if (page_count(page) == 0)
pagevec_add(&pages_to_free, page); pagevec_add(&pages_to_free, page);
} }
if (lock_held) if (zone)
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
pagevec_free(&pages_to_free); pagevec_free(&pages_to_free);
pagevec_init(pvec); pagevec_init(pvec);
...@@ -163,26 +161,27 @@ void __pagevec_release_nonlru(struct pagevec *pvec) ...@@ -163,26 +161,27 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
void pagevec_deactivate_inactive(struct pagevec *pvec) void pagevec_deactivate_inactive(struct pagevec *pvec)
{ {
int i; int i;
int lock_held = 0; struct zone *zone = NULL;
if (pagevec_count(pvec) == 0) if (pagevec_count(pvec) == 0)
return; return;
for (i = 0; i < pagevec_count(pvec); i++) { for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i]; struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (!lock_held) { if (pagezone != zone) {
if (PageActive(page) || !PageLRU(page)) if (PageActive(page) || !PageLRU(page))
continue; continue;
spin_lock_irq(&_pagemap_lru_lock); if (zone)
lock_held = 1; spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
} }
if (!PageActive(page) && PageLRU(page)) { if (!PageActive(page) && PageLRU(page))
struct zone *zone = page_zone(page); list_move(&page->lru, &pagezone->inactive_list);
list_move(&page->lru, &zone->inactive_list);
} }
} if (zone)
if (lock_held) spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(&_pagemap_lru_lock);
__pagevec_release(pvec); __pagevec_release(pvec);
} }
...@@ -193,16 +192,24 @@ void pagevec_deactivate_inactive(struct pagevec *pvec) ...@@ -193,16 +192,24 @@ void pagevec_deactivate_inactive(struct pagevec *pvec)
void __pagevec_lru_add(struct pagevec *pvec) void __pagevec_lru_add(struct pagevec *pvec)
{ {
int i; int i;
struct zone *zone = NULL;
spin_lock_irq(&_pagemap_lru_lock);
for (i = 0; i < pagevec_count(pvec); i++) { for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i]; struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (TestSetPageLRU(page)) if (TestSetPageLRU(page))
BUG(); BUG();
add_page_to_inactive_list(page); add_page_to_inactive_list(zone, page);
} }
spin_unlock_irq(&_pagemap_lru_lock); if (zone)
spin_unlock_irq(&zone->lru_lock);
pagevec_release(pvec); pagevec_release(pvec);
} }
...@@ -213,16 +220,24 @@ void __pagevec_lru_add(struct pagevec *pvec) ...@@ -213,16 +220,24 @@ void __pagevec_lru_add(struct pagevec *pvec)
void __pagevec_lru_del(struct pagevec *pvec) void __pagevec_lru_del(struct pagevec *pvec)
{ {
int i; int i;
struct zone *zone = NULL;
spin_lock_irq(&_pagemap_lru_lock);
for (i = 0; i < pagevec_count(pvec); i++) { for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i]; struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (!TestClearPageLRU(page)) if (!TestClearPageLRU(page))
BUG(); BUG();
del_page_from_lru(page); del_page_from_lru(zone, page);
} }
spin_unlock_irq(&_pagemap_lru_lock); if (zone)
spin_unlock_irq(&zone->lru_lock);
pagevec_release(pvec); pagevec_release(pvec);
} }
......
...@@ -263,7 +263,7 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -263,7 +263,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
} }
/* /*
* pagemap_lru_lock is heavily contented. We relieve it by quickly privatising * zone->lru_lock is heavily contented. We relieve it by quickly privatising
* a batch of pages and working on them outside the lock. Any pages which were * a batch of pages and working on them outside the lock. Any pages which were
* not freed will be added back to the LRU. * not freed will be added back to the LRU.
* *
...@@ -291,7 +291,7 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -291,7 +291,7 @@ shrink_cache(int nr_pages, struct zone *zone,
pagevec_init(&pvec); pagevec_init(&pvec);
lru_add_drain(); lru_add_drain();
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
while (max_scan > 0 && nr_pages > 0) { while (max_scan > 0 && nr_pages > 0) {
struct page *page; struct page *page;
int n = 0; int n = 0;
...@@ -317,7 +317,7 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -317,7 +317,7 @@ shrink_cache(int nr_pages, struct zone *zone,
n++; n++;
} }
zone->nr_inactive -= n; zone->nr_inactive -= n;
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
if (list_empty(&page_list)) if (list_empty(&page_list))
goto done; goto done;
...@@ -330,7 +330,7 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -330,7 +330,7 @@ shrink_cache(int nr_pages, struct zone *zone,
if (nr_pages <= 0 && list_empty(&page_list)) if (nr_pages <= 0 && list_empty(&page_list))
goto done; goto done;
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
/* /*
* Put back any unfreeable pages. * Put back any unfreeable pages.
*/ */
...@@ -344,13 +344,13 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -344,13 +344,13 @@ shrink_cache(int nr_pages, struct zone *zone,
else else
add_page_to_inactive_list(zone, page); add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) { if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec); __pagevec_release(&pvec);
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
} }
} }
} }
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
done: done:
pagevec_release(&pvec); pagevec_release(&pvec);
return nr_pages; return nr_pages;
...@@ -363,9 +363,9 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -363,9 +363,9 @@ shrink_cache(int nr_pages, struct zone *zone,
* processes, from rmap. * processes, from rmap.
* *
* If the pages are mostly unmapped, the processing is fast and it is * If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold pagemap_lru_lock across the whole operation. But if * appropriate to hold zone->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we * the pages are mapped, the processing is slow (page_referenced()) so we
* should drop pagemap_lru_lock around each page. It's impossible to balance * should drop zone->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them. * this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because * It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page. * nobody will play with that bit on a non-LRU page.
...@@ -385,7 +385,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -385,7 +385,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
struct pagevec pvec; struct pagevec pvec;
lru_add_drain(); lru_add_drain();
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
while (nr_pages && !list_empty(&zone->active_list)) { while (nr_pages && !list_empty(&zone->active_list)) {
page = list_entry(zone->active_list.prev, struct page, lru); page = list_entry(zone->active_list.prev, struct page, lru);
prefetchw_prev_lru_page(page, &zone->active_list, flags); prefetchw_prev_lru_page(page, &zone->active_list, flags);
...@@ -402,7 +402,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -402,7 +402,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
list_add(&page->lru, &l_hold); list_add(&page->lru, &l_hold);
nr_pages--; nr_pages--;
} }
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) { while (!list_empty(&l_hold)) {
page = list_entry(l_hold.prev, struct page, lru); page = list_entry(l_hold.prev, struct page, lru);
...@@ -421,7 +421,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -421,7 +421,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
pagevec_init(&pvec); pagevec_init(&pvec);
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
while (!list_empty(&l_inactive)) { while (!list_empty(&l_inactive)) {
page = list_entry(l_inactive.prev, struct page, lru); page = list_entry(l_inactive.prev, struct page, lru);
prefetchw_prev_lru_page(page, &l_inactive, flags); prefetchw_prev_lru_page(page, &l_inactive, flags);
...@@ -431,9 +431,9 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -431,9 +431,9 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
BUG(); BUG();
list_move(&page->lru, &zone->inactive_list); list_move(&page->lru, &zone->inactive_list);
if (!pagevec_add(&pvec, page)) { if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec); __pagevec_release(&pvec);
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
} }
} }
while (!list_empty(&l_active)) { while (!list_empty(&l_active)) {
...@@ -444,14 +444,14 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -444,14 +444,14 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
BUG_ON(!PageActive(page)); BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list); list_move(&page->lru, &zone->active_list);
if (!pagevec_add(&pvec, page)) { if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec); __pagevec_release(&pvec);
spin_lock_irq(&_pagemap_lru_lock); spin_lock_irq(&zone->lru_lock);
} }
} }
zone->nr_active -= pgdeactivate; zone->nr_active -= pgdeactivate;
zone->nr_inactive += pgdeactivate; zone->nr_inactive += pgdeactivate;
spin_unlock_irq(&_pagemap_lru_lock); spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec); pagevec_release(&pvec);
KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages); KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment