Commit a8382cf1 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] per-zone LRU locking

Now that the LRUs are per-zone, make their lock per-zone as well.

In this patch the per-zone lock shares a cacheline with the zone's
buddy list lock, which is very bad.  Some groundwork is needed to fix
this well.

This change is expected to be a significant win on NUMA, where most
page allocation comes from the local node's zones.

For NUMA the `struct zone' itself should really be placed in that
node's memory, which is something the platform owners should look at.
However the internode cache will help here.

Per-node kswapd would make heaps of sense too.
parent e6f0e61d
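
Every call site in the diff below follows the same conversion: derive the zone from the page, then take that zone's lru_lock where the global pagemap_lru_lock used to be taken. A minimal sketch of the before/after pattern (illustrative only, not lifted verbatim from the patch):

	/* Before: the LRU lists are already per-zone, but one global lock
	 * serialises LRU manipulation across every zone. */
	spin_lock_irq(&_pagemap_lru_lock);
	list_move(&page->lru, &page_zone(page)->inactive_list);
	spin_unlock_irq(&_pagemap_lru_lock);

	/* After: the lock lives in the zone next to the lists it protects,
	 * so different zones (and, on NUMA, different nodes) stop contending. */
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	list_move(&page->lru, &zone->inactive_list);
	spin_unlock_irq(&zone->lru_lock);
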
......@@ -157,7 +157,7 @@ struct page {
struct address_space *mapping; /* The inode (or ...) we belong to. */
unsigned long index; /* Our offset within mapping. */
struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */
protected by zone->lru_lock !! */
union {
struct pte_chain * chain; /* Reverse pte mapping pointer.
* protected by PG_chainlock */
......
......@@ -44,6 +44,7 @@ struct zone {
unsigned long pages_min, pages_low, pages_high;
int need_balance;
spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
atomic_t refill_counter;
......
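
As the changelog notes, lru_lock currently lands next to the zone's other hot fields and shares a cacheline with the buddy allocator's zone->lock, so allocator and LRU traffic bounce the same line between CPUs. Purely as an illustration of the groundwork the changelog asks for (this is not part of the patch, and the exact alignment macro spelling varies by kernel version), the two lock groups could later be split onto separate cachelines:

	struct zone {
		/* Buddy allocator state, guarded by zone->lock. */
		spinlock_t		lock ____cacheline_aligned;
		unsigned long		free_pages;
		/* ... */

		/* LRU state, guarded by zone->lru_lock. */
		spinlock_t		lru_lock ____cacheline_aligned;
		struct list_head	active_list;
		struct list_head	inactive_list;
		/* ... remaining fields ... */
	};
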
......@@ -28,7 +28,7 @@
*
* Note that the referenced bit, the page->lru list_head and the active,
* inactive_dirty and inactive_clean lists are protected by the
* pagemap_lru_lock, and *NOT* by the usual PG_locked bit!
* zone->lru_lock, and *NOT* by the usual PG_locked bit!
*
* PG_error is set to indicate that an I/O error occurred on this page.
*
......
......@@ -209,8 +209,6 @@ extern struct swap_list_t swap_list;
asmlinkage long sys_swapoff(const char *);
asmlinkage long sys_swapon(const char *, int);
extern spinlock_t _pagemap_lru_lock;
extern void FASTCALL(mark_page_accessed(struct page *));
extern spinlock_t swaplock;
......
......@@ -61,7 +61,6 @@
* ->inode_lock (__mark_inode_dirty)
* ->sb_lock (fs/fs-writeback.c)
*/
spinlock_t _pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/*
* Remove a page from the page cache and free it. Caller has to make
......
......@@ -828,7 +828,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
printk("zone(%lu): %lu pages.\n", j, size);
zone->size = size;
zone->name = zone_names[j];
zone->lock = SPIN_LOCK_UNLOCKED;
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
zone->need_balance = 0;
......
......@@ -14,7 +14,7 @@
/*
* Locking:
* - the page->pte.chain is protected by the PG_chainlock bit,
* which nests within the pagemap_lru_lock, then the
* which nests within the zone->lru_lock, then the
* mm->page_table_lock, and then the page lock.
* - because swapout locking is opposite to the locking order
* in the page fault path, the swapout path uses trylocks
......@@ -260,7 +260,7 @@ void page_remove_rmap(struct page * page, pte_t * ptep)
* table entry mapping a page. Because locking order here is opposite
* to the locking order used by the page fault path, we use trylocks.
* Locking:
* pagemap_lru_lock page_launder()
* zone->lru_lock page_launder()
* page lock page_launder(), trylock
* pte_chain_lock page_launder()
* mm->page_table_lock try_to_unmap_one(), trylock
......@@ -328,7 +328,7 @@ static int try_to_unmap_one(struct page * page, pte_t * ptep)
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold pagemap_lru_lock
* page, used in the pageout path. Caller must hold zone->lru_lock
* and the page lock. Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
......
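
The ordering comment above is why the unmap path backs off rather than blocking: the page fault path takes mm->page_table_lock first, while the pageout path arrives already holding zone->lru_lock, the page lock and the pte_chain lock, so it may only acquire the page table lock opportunistically. A hedged sketch of that back-off (simplified, not the literal function body):

	/*
	 * Reverse of the fault path's lock order, so the page table lock
	 * can only be trylocked; bail out and let reclaim retry later.
	 */
	if (!spin_trylock(&mm->page_table_lock))
		return SWAP_AGAIN;

	/* ... unmap the pte while both locks are held ... */

	spin_unlock(&mm->page_table_lock);
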
......@@ -26,26 +26,20 @@
int page_cluster;
/*
* Move an inactive page to the active list.
* FIXME: speed this up?
*/
static inline void activate_page_nolock(struct page * page)
void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(page);
del_page_from_inactive_list(zone, page);
SetPageActive(page);
add_page_to_active_list(page);
add_page_to_active_list(zone, page);
KERNEL_STAT_INC(pgactivate);
}
}
/*
* FIXME: speed this up?
*/
void activate_page(struct page * page)
{
spin_lock_irq(&_pagemap_lru_lock);
activate_page_nolock(page);
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
}
/**
......@@ -79,13 +73,14 @@ void lru_add_drain(void)
void __page_cache_release(struct page *page)
{
unsigned long flags;
struct zone *zone = page_zone(page);
spin_lock_irqsave(&_pagemap_lru_lock, flags);
spin_lock_irqsave(&zone->lru_lock, flags);
if (TestClearPageLRU(page))
del_page_from_lru(page);
del_page_from_lru(zone, page);
if (page_count(page) != 0)
page = NULL;
spin_unlock_irqrestore(&_pagemap_lru_lock, flags);
spin_unlock_irqrestore(&zone->lru_lock, flags);
if (page)
__free_pages_ok(page, 0);
}
......@@ -95,7 +90,7 @@ void __page_cache_release(struct page *page)
* pagevec's pages. If it fell to zero then remove the page from the LRU and
* free it.
*
* Avoid taking pagemap_lru_lock if possible, but if it is taken, retain it
* Avoid taking zone->lru_lock if possible, but if it is taken, retain it
* for the remainder of the operation.
*
* The locking in this function is against shrink_cache(): we recheck the
......@@ -107,28 +102,31 @@ void __page_cache_release(struct page *page)
void __pagevec_release(struct pagevec *pvec)
{
int i;
int lock_held = 0;
struct pagevec pages_to_free;
struct zone *zone = NULL;
pagevec_init(&pages_to_free);
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone;
if (PageReserved(page) || !put_page_testzero(page))
continue;
if (!lock_held) {
spin_lock_irq(&_pagemap_lru_lock);
lock_held = 1;
pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (TestClearPageLRU(page))
del_page_from_lru(page);
del_page_from_lru(zone, page);
if (page_count(page) == 0)
pagevec_add(&pages_to_free, page);
}
if (lock_held)
spin_unlock_irq(&_pagemap_lru_lock);
if (zone)
spin_unlock_irq(&zone->lru_lock);
pagevec_free(&pages_to_free);
pagevec_init(pvec);
......@@ -163,26 +161,27 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
void pagevec_deactivate_inactive(struct pagevec *pvec)
{
int i;
int lock_held = 0;
struct zone *zone = NULL;
if (pagevec_count(pvec) == 0)
return;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (!lock_held) {
if (pagezone != zone) {
if (PageActive(page) || !PageLRU(page))
continue;
spin_lock_irq(&_pagemap_lru_lock);
lock_held = 1;
}
if (!PageActive(page) && PageLRU(page)) {
struct zone *zone = page_zone(page);
list_move(&page->lru, &zone->inactive_list);
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (!PageActive(page) && PageLRU(page))
list_move(&page->lru, &pagezone->inactive_list);
}
if (lock_held)
spin_unlock_irq(&_pagemap_lru_lock);
if (zone)
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(pvec);
}
......@@ -193,16 +192,24 @@ void pagevec_deactivate_inactive(struct pagevec *pvec)
void __pagevec_lru_add(struct pagevec *pvec)
{
int i;
struct zone *zone = NULL;
spin_lock_irq(&_pagemap_lru_lock);
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (TestSetPageLRU(page))
BUG();
add_page_to_inactive_list(page);
add_page_to_inactive_list(zone, page);
}
spin_unlock_irq(&_pagemap_lru_lock);
if (zone)
spin_unlock_irq(&zone->lru_lock);
pagevec_release(pvec);
}
......@@ -213,16 +220,24 @@ void __pagevec_lru_add(struct pagevec *pvec)
void __pagevec_lru_del(struct pagevec *pvec)
{
int i;
struct zone *zone = NULL;
spin_lock_irq(&_pagemap_lru_lock);
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (!TestClearPageLRU(page))
BUG();
del_page_from_lru(page);
del_page_from_lru(zone, page);
}
spin_unlock_irq(&_pagemap_lru_lock);
if (zone)
spin_unlock_irq(&zone->lru_lock);
pagevec_release(pvec);
}
......
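
Each of the pagevec functions above replaces the old lock_held flag with the same idiom: remember which zone's lru_lock is currently held and switch locks only when the next page belongs to a different zone, so a pagevec whose pages all come from one zone still pays for a single lock round trip. The skeleton, pulled out here for clarity (it mirrors the code above rather than adding anything new):

	struct zone *zone = NULL;
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		/* Drop and retake only when crossing a zone boundary. */
		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		/* ... operate on the page under zone->lru_lock ... */
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
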
......@@ -263,7 +263,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
}
/*
* pagemap_lru_lock is heavily contended. We relieve it by quickly privatising
* zone->lru_lock is heavily contended. We relieve it by quickly privatising
* a batch of pages and working on them outside the lock. Any pages which were
* not freed will be added back to the LRU.
*
......@@ -291,7 +291,7 @@ shrink_cache(int nr_pages, struct zone *zone,
pagevec_init(&pvec);
lru_add_drain();
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
while (max_scan > 0 && nr_pages > 0) {
struct page *page;
int n = 0;
......@@ -317,7 +317,7 @@ shrink_cache(int nr_pages, struct zone *zone,
n++;
}
zone->nr_inactive -= n;
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
if (list_empty(&page_list))
goto done;
......@@ -330,7 +330,7 @@ shrink_cache(int nr_pages, struct zone *zone,
if (nr_pages <= 0 && list_empty(&page_list))
goto done;
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
......@@ -344,13 +344,13 @@ shrink_cache(int nr_pages, struct zone *zone,
else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
}
}
}
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
done:
pagevec_release(&pvec);
return nr_pages;
......@@ -363,9 +363,9 @@ shrink_cache(int nr_pages, struct zone *zone,
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold pagemap_lru_lock across the whole operation. But if
* appropriate to hold zone->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
* should drop pagemap_lru_lock around each page. It's impossible to balance
* should drop zone->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
......@@ -385,7 +385,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
struct pagevec pvec;
lru_add_drain();
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
while (nr_pages && !list_empty(&zone->active_list)) {
page = list_entry(zone->active_list.prev, struct page, lru);
prefetchw_prev_lru_page(page, &zone->active_list, flags);
......@@ -402,7 +402,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
list_add(&page->lru, &l_hold);
nr_pages--;
}
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
page = list_entry(l_hold.prev, struct page, lru);
......@@ -421,7 +421,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
}
pagevec_init(&pvec);
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
while (!list_empty(&l_inactive)) {
page = list_entry(l_inactive.prev, struct page, lru);
prefetchw_prev_lru_page(page, &l_inactive, flags);
......@@ -431,9 +431,9 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
BUG();
list_move(&page->lru, &zone->inactive_list);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
}
}
while (!list_empty(&l_active)) {
......@@ -444,14 +444,14 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&_pagemap_lru_lock);
spin_lock_irq(&zone->lru_lock);
}
}
zone->nr_active -= pgdeactivate;
zone->nr_inactive += pgdeactivate;
spin_unlock_irq(&_pagemap_lru_lock);
spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
......
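
shrink_cache() and refill_inactive_zone() both follow the batching strategy described in the comments above: detach a batch of pages from the zone's list while holding zone->lru_lock, drop the lock for the expensive per-page work, then retake it to put the survivors back. A condensed sketch of that structure (a summary of the hunks above, not the literal code):

	LIST_HEAD(l_hold);	/* privatised batch, off the LRU */

	spin_lock_irq(&zone->lru_lock);
	while (nr_pages && !list_empty(&zone->active_list)) {
		page = list_entry(zone->active_list.prev, struct page, lru);
		list_move(&page->lru, &l_hold);
		nr_pages--;
	}
	spin_unlock_irq(&zone->lru_lock);

	/* Slow work (page_referenced(), pageout, ...) with no LRU lock held. */

	spin_lock_irq(&zone->lru_lock);
	/* move surviving pages back onto zone->active_list / inactive_list */
	spin_unlock_irq(&zone->lru_lock);
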